From ddac75e0efef0deeb8ea189e69909371db13f0ef Mon Sep 17 00:00:00 2001 From: austinpagan Date: Thu, 1 Feb 2024 12:24:25 -0600 Subject: [PATCH 01/23] Adding .C versions of CGEMM and ZGEMM --- kernel/power/KERNEL.POWER10 | 24 +- kernel/power/cgemm_kernel_power10.c | 1154 +++++++++++++++++++++++++++ kernel/power/zgemm_kernel_power10.c | 761 ++++++++++++++++++ 3 files changed, 1931 insertions(+), 8 deletions(-) create mode 100644 kernel/power/cgemm_kernel_power10.c create mode 100644 kernel/power/zgemm_kernel_power10.c diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 9047c714c..5f49b9c46 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -17,11 +17,15 @@ SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) STRMMKERNEL = sgemm_kernel_power10.c DTRMMKERNEL = dgemm_kernel_power10.c ifeq ($(OSNAME), AIX) -CTRMMKERNEL = ctrmm_kernel_8x4_power8.S -ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S +#CTRMMKERNEL = ctrmm_kernel_8x4_power8.S +#ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S +CTRMMKERNEL = cgemm_kernel_power10.c +ZTRMMKERNEL = zgemm_kernel_power10.c else -CTRMMKERNEL = cgemm_kernel_power10.S -ZTRMMKERNEL = zgemm_kernel_power10.S +#CTRMMKERNEL = cgemm_kernel_power10.S +#ZTRMMKERNEL = zgemm_kernel_power10.S +CTRMMKERNEL = cgemm_kernel_power10.c +ZTRMMKERNEL = zgemm_kernel_power10.c endif SGEMMKERNEL = sgemm_kernel_power10.c @@ -65,9 +69,11 @@ DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_power10.c DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c ifeq ($(OSNAME), AIX) -CGEMMKERNEL = cgemm_kernel_8x4_power8.S +#CGEMMKERNEL = cgemm_kernel_8x4_power8.S +CGEMMKERNEL = cgemm_kernel_power10.c else -CGEMMKERNEL = cgemm_kernel_power10.S +#CGEMMKERNEL = cgemm_kernel_power10.S +CGEMMKERNEL = cgemm_kernel_power10.c endif #CGEMMKERNEL = cgemm_kernel_8x4_power8.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c @@ -84,9 +90,11 @@ CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) ifeq ($(OSNAME), AIX) -ZGEMMKERNEL = zgemm_kernel_8x2_power8.S +#ZGEMMKERNEL = zgemm_kernel_8x2_power8.S +ZGEMMKERNEL = zgemm_kernel_power10.c else -ZGEMMKERNEL = zgemm_kernel_power10.S +#ZGEMMKERNEL = zgemm_kernel_power10.S +ZGEMMKERNEL = zgemm_kernel_power10.c endif ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c diff --git a/kernel/power/cgemm_kernel_power10.c b/kernel/power/cgemm_kernel_power10.c new file mode 100644 index 000000000..279c83aec --- /dev/null +++ b/kernel/power/cgemm_kernel_power10.c @@ -0,0 +1,1154 @@ +/********************************************************************************* +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ +#include "common.h" +#include + +typedef __vector unsigned char vec_t; +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); +typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); + +#define SET_ACC_ZERO() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); \ + __builtin_mma_xxsetaccz (&acc4); \ + __builtin_mma_xxsetaccz (&acc5); \ + __builtin_mma_xxsetaccz (&acc6); \ + __builtin_mma_xxsetaccz (&acc7); + +#if (defined(NN) || defined(NT) || defined(TN) || defined(TT)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = _arbi + _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += _arbi + _aibr; } +#endif + +#if (defined(NR) || defined(NC) || defined(TR) || defined(TC)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr + _aibi; _imag = -_arbi + _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += -_arbi + _aibr; } +#endif + +#if (defined(RN) || defined(RT) || defined(CN) || defined(CT)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr + _aibi; _imag = _arbi - _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += _arbi - _aibr; } +#endif + +#if (defined(RR) || defined(RC) || defined(CR) || defined(CC)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = -_arbi - _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += -_arbi - _aibr; } +#endif + +#if defined (TRMMKERNEL) +#define A_OP = +#else +#define A_OP += +#endif + +#define BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + __builtin_mma_disassemble_acc ((void *)result, &acc0); \ + __builtin_mma_disassemble_acc ((void *)&result[ 4], &acc1); \ + __builtin_mma_disassemble_acc ((void *)&result[ 8], &acc2); \ + __builtin_mma_disassemble_acc ((void *)&result[12], &acc3); \ + __builtin_mma_disassemble_acc ((void *)&result[16], &acc4); \ + __builtin_mma_disassemble_acc ((void *)&result[20], &acc5); \ + __builtin_mma_disassemble_acc ((void *)&result[24], &acc6); \ + __builtin_mma_disassemble_acc ((void *)&result[28], &acc7); + +#define COMP_MUL_1 \ + COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) + +#define COMP_MAC_1(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ +} + +#define COMP_MUL_2A \ + COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) \ + COMP_MUL(tr[1], res[ 2], res[ 7], ti[1], res[ 3], res[ 6]) + +#define COMP_MAC_2A(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MAC(tr[1], _ro[ 2], _ro[ 7], ti[1], _ro[ 3], _ro[ 6]) \ +} + +#define COMP_MUL_2B \ + COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) \ + COMP_MUL(tr[1], res[ 8], res[13], ti[1], res[ 9], res[12]) + +#define COMP_MAC_2B(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ +} + +#define COMP_MUL_4A(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MUL(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MUL(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ + COMP_MUL(tr[2], _ro[16], _ro[21], ti[2], _ro[17], _ro[20]) \ + COMP_MUL(tr[3], _ro[24], _ro[29], ti[3], _ro[25], _ro[28]) \ +} + +#define COMP_MAC_4A(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ + COMP_MAC(tr[2], _ro[16], _ro[21], ti[2], _ro[17], _ro[20]) \ + COMP_MAC(tr[3], _ro[24], _ro[29], ti[3], _ro[25], _ro[28]) \ +} + +#define COMP_MUL_4B(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MUL(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MUL(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ + COMP_MUL(tr[2], _ro[ 2], _ro[ 7], ti[2], _ro[ 3], _ro[ 6]) \ + COMP_MUL(tr[3], _ro[10], _ro[15], ti[3], _ro[11], _ro[14]) \ +} + +#define COMP_MAC_4B(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ + COMP_MAC(tr[2], _ro[ 2], _ro[ 7], ti[2], _ro[ 3], _ro[ 6]) \ + COMP_MAC(tr[3], _ro[10], _ro[15], ti[3], _ro[11], _ro[14]) \ +} + + +#define SAVE_ACC_COMPLEX_11 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_1 \ + COMP_MAC_1(16) \ + COMP_MAC_1(32) \ + COMP_MAC_1(48) \ + COMP_MAC_1(64) \ + COMP_MAC_1(80) \ + COMP_MAC_1(96) \ + COMP_MAC_1(112) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; + +#define SAVE_ACC_COMPLEX_12 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_2A \ + COMP_MAC_2A(16) \ + COMP_MAC_2A(32) \ + COMP_MAC_2A(48) \ + COMP_MAC_2A(64) \ + COMP_MAC_2A(80) \ + COMP_MAC_2A(96) \ + COMP_MAC_2A(112) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_1 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_2B \ + COMP_MAC_2B(16) \ + COMP_MAC_2B(32) \ + COMP_MAC_2B(48) \ + COMP_MAC_2B(64) \ + COMP_MAC_2B(80) \ + COMP_MAC_2B(96) \ + COMP_MAC_2B(112) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_2 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4A(0) \ + COMP_MAC_4A(32) \ + COMP_MAC_4A(64) \ + COMP_MAC_4A(96) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_4 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4A(0) \ + COMP_MAC_4A(64) \ + CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4A(32) \ + COMP_MAC_4A(96) \ + CO[ 8] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 9] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[10] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[11] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[12] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[13] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[14] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[15] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_22_4 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4B(0) \ + CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+ 0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+ 1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+ 2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+ 3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4B(16) \ + CO[ 4] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 5] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[ 6] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[ 7] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4B(32) \ + CO[ 8] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 9] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[10] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[11] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+ 8] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+ 9] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+10] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+11] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4B(48) \ + CO[12] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[13] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[14] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[15] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+12] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+13] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+14] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+15] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_22_2 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4B(0) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4B(16) \ + CO[4] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[5] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[6] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[7] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_22_1 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4B(0) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_24_ALL \ + __builtin_mma_disassemble_acc ((void *)result, &acc0); \ + __builtin_mma_disassemble_acc ((void *)(&result[4]), &acc4); \ + __builtin_mma_disassemble_acc ((void *)(&result[8]), &acc1); \ + __builtin_mma_disassemble_acc ((void *)(&result[12]), &acc5); \ + __builtin_mma_disassemble_acc ((void *)(&result[16]), &acc2); \ + __builtin_mma_disassemble_acc ((void *)(&result[20]), &acc6); \ + __builtin_mma_disassemble_acc ((void *)(&result[24]), &acc3); \ + __builtin_mma_disassemble_acc ((void *)(&result[28]), &acc7); \ + COMP_MUL(tr[ 0], res[ 0], res[ 5], ti[ 0], res[ 1], res[ 4]) \ + COMP_MUL(tr[ 1], res[ 8], res[ 13], ti[ 1], res[ 9], res[ 12]) \ + COMP_MUL(tr[ 2], res[ 2], res[ 7], ti[ 2], res[ 3], res[ 6]) \ + COMP_MUL(tr[ 3], res[ 10], res[ 15], ti[ 3], res[ 11], res[ 14]) \ + COMP_MUL(tr[ 4], res[ 16], res[ 21], ti[ 4], res[ 17], res[ 20]) \ + COMP_MUL(tr[ 5], res[ 24], res[ 29], ti[ 5], res[ 25], res[ 28]) \ + COMP_MUL(tr[ 6], res[ 18], res[ 23], ti[ 6], res[ 19], res[ 22]) \ + COMP_MUL(tr[ 7], res[ 26], res[ 31], ti[ 7], res[ 27], res[ 30]) \ + COMP_MUL(tr[ 8], res[ 32], res[ 37], ti[ 8], res[ 33], res[ 36]) \ + COMP_MUL(tr[ 9], res[ 40], res[ 45], ti[ 9], res[ 41], res[ 44]) \ + COMP_MUL(tr[10], res[ 34], res[ 39], ti[10], res[ 35], res[ 38]) \ + COMP_MUL(tr[11], res[ 42], res[ 47], ti[11], res[ 43], res[ 46]) \ + COMP_MUL(tr[12], res[ 48], res[ 53], ti[12], res[ 49], res[ 52]) \ + COMP_MUL(tr[13], res[ 56], res[ 61], ti[13], res[ 57], res[ 60]) \ + COMP_MUL(tr[14], res[ 50], res[ 55], ti[14], res[ 51], res[ 54]) \ + COMP_MUL(tr[15], res[ 58], res[ 63], ti[15], res[ 59], res[ 62]) \ + COMP_MUL(tr[16], res[ 64], res[ 69], ti[16], res[ 65], res[ 68]) \ + COMP_MUL(tr[17], res[ 72], res[ 77], ti[17], res[ 73], res[ 76]) \ + COMP_MUL(tr[18], res[ 66], res[ 71], ti[18], res[ 67], res[ 70]) \ + COMP_MUL(tr[19], res[ 74], res[ 79], ti[19], res[ 75], res[ 78]) \ + COMP_MUL(tr[20], res[ 80], res[ 85], ti[20], res[ 81], res[ 84]) \ + COMP_MUL(tr[21], res[ 88], res[ 93], ti[21], res[ 89], res[ 92]) \ + COMP_MUL(tr[22], res[ 82], res[ 87], ti[22], res[ 83], res[ 86]) \ + COMP_MUL(tr[23], res[ 90], res[ 95], ti[23], res[ 91], res[ 94]) \ + COMP_MUL(tr[24], res[ 96], res[101], ti[24], res[ 97], res[100]) \ + COMP_MUL(tr[25], res[104], res[109], ti[25], res[105], res[108]) \ + COMP_MUL(tr[26], res[ 98], res[103], ti[26], res[ 99], res[102]) \ + COMP_MUL(tr[27], res[106], res[111], ti[27], res[107], res[110]) \ + COMP_MUL(tr[28], res[112], res[117], ti[28], res[113], res[116]) \ + COMP_MUL(tr[29], res[120], res[125], ti[29], res[121], res[124]) \ + COMP_MUL(tr[30], res[114], res[119], ti[30], res[115], res[118]) \ + COMP_MUL(tr[31], res[122], res[127], ti[31], res[123], res[126]) \ + CO[ 0] A_OP tr[ 0] * alpha_r - ti[ 0] * alpha_i; \ + CO[ 1] A_OP ti[ 0] * alpha_r + tr[ 0] * alpha_i; \ + CO[ 2] A_OP tr[ 1] * alpha_r - ti[ 1] * alpha_i; \ + CO[ 3] A_OP ti[ 1] * alpha_r + tr[ 1] * alpha_i; \ + CO[2*ldc+ 0] A_OP tr[ 2] * alpha_r - ti[ 2] * alpha_i; \ + CO[2*ldc+ 1] A_OP ti[ 2] * alpha_r + tr[ 2] * alpha_i; \ + CO[2*ldc+ 2] A_OP tr[ 3] * alpha_r - ti[ 3] * alpha_i; \ + CO[2*ldc+ 3] A_OP ti[ 3] * alpha_r + tr[ 3] * alpha_i; \ + CO[4*ldc+ 0] A_OP tr[ 4] * alpha_r - ti[ 4] * alpha_i; \ + CO[4*ldc+ 1] A_OP ti[ 4] * alpha_r + tr[ 4] * alpha_i; \ + CO[4*ldc+ 2] A_OP tr[ 5] * alpha_r - ti[ 5] * alpha_i; \ + CO[4*ldc+ 3] A_OP ti[ 5] * alpha_r + tr[ 5] * alpha_i; \ + CO[6*ldc+ 0] A_OP tr[ 6] * alpha_r - ti[ 6] * alpha_i; \ + CO[6*ldc+ 1] A_OP ti[ 6] * alpha_r + tr[ 6] * alpha_i; \ + CO[6*ldc+ 2] A_OP tr[ 7] * alpha_r - ti[ 7] * alpha_i; \ + CO[6*ldc+ 3] A_OP ti[ 7] * alpha_r + tr[ 7] * alpha_i; \ + CO[ 4] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i; \ + CO[ 5] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i; \ + CO[ 6] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i; \ + CO[ 7] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i; \ + CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i; \ + CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i; \ + CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i; \ + CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i; \ + CO[4*ldc+ 4] A_OP tr[12] * alpha_r - ti[12] * alpha_i; \ + CO[4*ldc+ 5] A_OP ti[12] * alpha_r + tr[12] * alpha_i; \ + CO[4*ldc+ 6] A_OP tr[13] * alpha_r - ti[13] * alpha_i; \ + CO[4*ldc+ 7] A_OP ti[13] * alpha_r + tr[13] * alpha_i; \ + CO[6*ldc+ 4] A_OP tr[14] * alpha_r - ti[14] * alpha_i; \ + CO[6*ldc+ 5] A_OP ti[14] * alpha_r + tr[14] * alpha_i; \ + CO[6*ldc+ 6] A_OP tr[15] * alpha_r - ti[15] * alpha_i; \ + CO[6*ldc+ 7] A_OP ti[15] * alpha_r + tr[15] * alpha_i; \ + CO[ 8] A_OP tr[16] * alpha_r - ti[16] * alpha_i; \ + CO[ 9] A_OP ti[16] * alpha_r + tr[16] * alpha_i; \ + CO[ 10] A_OP tr[17] * alpha_r - ti[17] * alpha_i; \ + CO[ 11] A_OP ti[17] * alpha_r + tr[17] * alpha_i; \ + CO[2*ldc+ 8] A_OP tr[18] * alpha_r - ti[18] * alpha_i; \ + CO[2*ldc+ 9] A_OP ti[18] * alpha_r + tr[18] * alpha_i; \ + CO[2*ldc+10] A_OP tr[19] * alpha_r - ti[19] * alpha_i; \ + CO[2*ldc+11] A_OP ti[19] * alpha_r + tr[19] * alpha_i; \ + CO[4*ldc+ 8] A_OP tr[20] * alpha_r - ti[20] * alpha_i; \ + CO[4*ldc+ 9] A_OP ti[20] * alpha_r + tr[20] * alpha_i; \ + CO[4*ldc+10] A_OP tr[21] * alpha_r - ti[21] * alpha_i; \ + CO[4*ldc+11] A_OP ti[21] * alpha_r + tr[21] * alpha_i; \ + CO[6*ldc+ 8] A_OP tr[22] * alpha_r - ti[22] * alpha_i; \ + CO[6*ldc+ 9] A_OP ti[22] * alpha_r + tr[22] * alpha_i; \ + CO[6*ldc+10] A_OP tr[23] * alpha_r - ti[23] * alpha_i; \ + CO[6*ldc+11] A_OP ti[23] * alpha_r + tr[23] * alpha_i; \ + CO[ 12] A_OP tr[24] * alpha_r - ti[24] * alpha_i; \ + CO[ 13] A_OP ti[24] * alpha_r + tr[24] * alpha_i; \ + CO[ 14] A_OP tr[25] * alpha_r - ti[25] * alpha_i; \ + CO[ 15] A_OP ti[25] * alpha_r + tr[25] * alpha_i; \ + CO[2*ldc+12] A_OP tr[26] * alpha_r - ti[26] * alpha_i; \ + CO[2*ldc+13] A_OP ti[26] * alpha_r + tr[26] * alpha_i; \ + CO[2*ldc+14] A_OP tr[27] * alpha_r - ti[27] * alpha_i; \ + CO[2*ldc+15] A_OP ti[27] * alpha_r + tr[27] * alpha_i; \ + CO[4*ldc+12] A_OP tr[28] * alpha_r - ti[28] * alpha_i; \ + CO[4*ldc+13] A_OP ti[28] * alpha_r + tr[28] * alpha_i; \ + CO[4*ldc+14] A_OP tr[29] * alpha_r - ti[29] * alpha_i; \ + CO[4*ldc+15] A_OP ti[29] * alpha_r + tr[29] * alpha_i; \ + CO[6*ldc+12] A_OP tr[30] * alpha_r - ti[30] * alpha_i; \ + CO[6*ldc+13] A_OP ti[30] * alpha_r + tr[30] * alpha_i; \ + CO[6*ldc+14] A_OP tr[31] * alpha_r - ti[31] * alpha_i; \ + CO[6*ldc+15] A_OP ti[31] * alpha_r + tr[31] * alpha_i; + +#define SAVE_ACC_COMPLEX_24(ACC1, ACC2, CI) \ + __builtin_mma_disassemble_acc ((void *)result, ACC1); \ + __builtin_mma_disassemble_acc ((void *)(&result[4]), ACC2); \ + COMP_MUL(tr[0], res[0], res[5], ti[0], res[1], res[4]) \ + COMP_MUL(tr[1], res[8], res[13], ti[1], res[9], res[12]) \ + COMP_MUL(tr[2], res[2], res[7], ti[2], res[3], res[6]) \ + COMP_MUL(tr[3], res[10], res[15], ti[3], res[11], res[14]) \ + COMP_MUL(tr[4], res[16], res[21], ti[4], res[17], res[20]) \ + COMP_MUL(tr[5], res[24], res[29], ti[5], res[25], res[28]) \ + COMP_MUL(tr[6], res[18], res[23], ti[6], res[19], res[22]) \ + COMP_MUL(tr[7], res[26], res[31], ti[7], res[27], res[30]) \ + CO[CI+0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[CI+1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[CI+2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[CI+3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[CI+2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[CI+2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[CI+2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[CI+2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + CO[CI+4*ldc+0] A_OP tr[4] * alpha_r - ti[4] * alpha_i; \ + CO[CI+4*ldc+1] A_OP ti[4] * alpha_r + tr[4] * alpha_i; \ + CO[CI+4*ldc+2] A_OP tr[5] * alpha_r - ti[5] * alpha_i; \ + CO[CI+4*ldc+3] A_OP ti[5] * alpha_r + tr[5] * alpha_i; \ + CO[CI+6*ldc+0] A_OP tr[6] * alpha_r - ti[6] * alpha_i; \ + CO[CI+6*ldc+1] A_OP ti[6] * alpha_r + tr[6] * alpha_i; \ + CO[CI+6*ldc+2] A_OP tr[7] * alpha_r - ti[7] * alpha_i; \ + CO[CI+6*ldc+3] A_OP ti[7] * alpha_r + tr[7] * alpha_i; + +#define SAVE_ACC_COMPLEX_14 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) \ + COMP_MUL(tr[1], res[ 2], res[ 7], ti[1], res[ 3], res[ 6]) \ + COMP_MUL(tr[2], res[ 16], res[ 21], ti[2], res[ 17], res[ 20]) \ + COMP_MUL(tr[3], res[ 18], res[ 23], ti[3], res[ 19], res[ 22]) \ + COMP_MAC(tr[0], res[ 32], res[ 37], ti[0], res[ 33], res[ 36]) \ + COMP_MAC(tr[1], res[ 34], res[ 39], ti[1], res[ 35], res[ 38]) \ + COMP_MAC(tr[2], res[ 48], res[ 53], ti[2], res[ 49], res[ 52]) \ + COMP_MAC(tr[3], res[ 50], res[ 55], ti[3], res[ 51], res[ 54]) \ + COMP_MAC(tr[0], res[ 64], res[ 69], ti[0], res[ 65], res[ 68]) \ + COMP_MAC(tr[1], res[ 66], res[ 71], ti[1], res[ 67], res[ 70]) \ + COMP_MAC(tr[2], res[ 80], res[ 85], ti[2], res[ 81], res[ 84]) \ + COMP_MAC(tr[3], res[ 82], res[ 87], ti[3], res[ 83], res[ 86]) \ + COMP_MAC(tr[0], res[ 96], res[101], ti[0], res[ 97], res[100]) \ + COMP_MAC(tr[1], res[ 98], res[103], ti[1], res[ 99], res[102]) \ + COMP_MAC(tr[2], res[112], res[117], ti[2], res[113], res[116]) \ + COMP_MAC(tr[3], res[114], res[119], ti[3], res[115], res[118]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[4*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[4*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[6*ldc+0] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[6*ldc+1] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +#define REFRESH_TEMP_BK(x, y) \ + temp = k - off; +#elif defined(LEFT) +#define REFRESH_TEMP_BK(x, y) \ + temp = off + x; +#else +#define REFRESH_TEMP_BK(x, y) \ + temp = off + y; +#endif +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_POINTERS(x, y) \ + BO = B; \ + REFRESH_TEMP_BK(x, y) +#else +#define REFRESH_POINTERS(x, y) \ + AO += off * (2*x); \ + BO = B + off * (2*y); \ + REFRESH_TEMP_BK(x, y) +#endif + +#ifdef LEFT +#define REFRESH_OFF(x) \ + off += x; +#else +#define REFRESH_OFF(x) +#endif + +#ifdef LEFT +#define UPDATE_TEMP(x, y) \ + temp -= x; +#else +#define UPDATE_TEMP(x, y) \ + temp -= y; +#endif + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_TMP_AFTER_SAVE(x, y) \ + temp = k - off; \ + UPDATE_TEMP(x, y) \ + AO += temp * (2*x); \ + BO += temp * (2*y); +#else +#define REFRESH_TMP_AFTER_SAVE(x, y) +#endif + +#define REFRESH_AFTER_SAVE(x,y) \ + REFRESH_TMP_AFTER_SAVE(x, y) \ + REFRESH_OFF(x) +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ +int +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * A, FLOAT * B, + FLOAT * C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG i1, i, l, temp; + FLOAT *AO, *BO, *CO; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + v4sf_t result[32]; + FLOAT *res, tr[64], ti[64]; + res = (FLOAT *) result; + + for (i1 = 0; i1 < (n >> 2); i1++) + { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + AO = A; + CO = C; + C += ldc << 3; + + for (i = 0; i < (m >> 3); i++) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 4); +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf32gerpp(&acc4, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc5, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc6, rowA3, rowB2); + __builtin_mma_xvf32gerpp(&acc7, rowA4, rowB2); + } + SAVE_ACC_COMPLEX_24_ALL + CO += 16; + AO += temp << 4; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 4) +#endif + } + if (m & 4) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 4); +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) + { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB3); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB4); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); + } + for (l = (temp & (~1)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB2); + } + SAVE_ACC_COMPLEX_24(&acc0, &acc2, 0) + SAVE_ACC_COMPLEX_24(&acc1, &acc3, 4) + CO += 8; + AO += temp << 3; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 4) +#endif + } + if (m & 2) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 4); +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) + { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12]; + vec_t rowB5 = *(vec_t *) & BO[(l<<3)+16]; + vec_t rowB6 = *(vec_t *) & BO[(l<<3)+20]; + vec_t rowB7 = *(vec_t *) & BO[(l<<3)+24]; + vec_t rowB8 = *(vec_t *) & BO[(l<<3)+28]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc0, rowA2, rowB3); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB4); + __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB5); + __builtin_mma_xvf32gerpp(&acc1, rowA3, rowB6); + __builtin_mma_xvf32gerpp(&acc0, rowA4, rowB7); + __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB8); + } + for (l = (temp & (~3)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); + } + SAVE_ACC_COMPLEX_24(&acc0, &acc1, 0) + CO += 4; + AO += temp << 2; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 4) +#endif + } + if (m & 1) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 4) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) + { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2]; + vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4]; + vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12]; + vec_t rowB5 = *(vec_t *) & BO[(l<<3)+16]; + vec_t rowB6 = *(vec_t *) & BO[(l<<3)+20]; + vec_t rowB7 = *(vec_t *) & BO[(l<<3)+24]; + vec_t rowB8 = *(vec_t *) & BO[(l<<3)+28]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA2, rowB3); + __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB4); + __builtin_mma_xvf32gerpp(&acc4, rowA3, rowB5); + __builtin_mma_xvf32gerpp(&acc5, rowA3, rowB6); + __builtin_mma_xvf32gerpp(&acc6, rowA4, rowB7); + __builtin_mma_xvf32gerpp(&acc7, rowA4, rowB8); + } + for (l = (temp & (~3)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); + } + SAVE_ACC_COMPLEX_14 + CO += 2; + AO += temp << 1; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 4) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; // number of values in A +#endif + + B += k << 3; + } + + if (n & 2) + { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + AO = A; + CO = C; + C += ldc << 2; + + for (i = 0; i < (m >> 3); i++) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) + { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<4)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<4)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<4)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<4)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB2); + __builtin_mma_xvf32gerpp(&acc1, rowA6, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA7, rowB2); + __builtin_mma_xvf32gerpp(&acc3, rowA8, rowB2); + } + for (l = (temp & (~1)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + } + SAVE_ACC_COMPLEX_22_4 + AO += temp << 4; + BO += temp << 2; + CO += 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 2) +#endif + } + if (m & 4) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) + { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<3)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<3)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<3)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<3)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB2); + __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB2); + __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB3); + __builtin_mma_xvf32gerpp(&acc1, rowA6, rowB3); + __builtin_mma_xvf32gerpp(&acc0, rowA7, rowB4); + __builtin_mma_xvf32gerpp(&acc1, rowA8, rowB4); + } + for (l = (temp & (~3)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + } + SAVE_ACC_COMPLEX_22_2 + AO += temp << 3; + BO += temp << 2; + CO += 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 2) +#endif + } if (m & 2) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) + { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<2)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<2)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<2)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<2)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12]; + vec_t rowB5 = *(vec_t *) & BO[(l<<2)+16]; + vec_t rowB6 = *(vec_t *) & BO[(l<<2)+20]; + vec_t rowB7 = *(vec_t *) & BO[(l<<2)+24]; + vec_t rowB8 = *(vec_t *) & BO[(l<<2)+28]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc0, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc0, rowA4, rowB4); + __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB5); + __builtin_mma_xvf32gerpp(&acc0, rowA6, rowB6); + __builtin_mma_xvf32gerpp(&acc0, rowA7, rowB7); + __builtin_mma_xvf32gerpp(&acc0, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_22_1 + AO += temp << 2; + BO += temp << 2; + CO += 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 2) +#endif + } + if (m & 1) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 2) +#else + BO = B; + temp = k; +#endif + // RIP OUT MMA STUFF! + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) + { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2]; + vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4]; + vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6]; + vec_t rowA5 = *(vec_t *) & AO[(l<<1)+8]; + vec_t rowA6 = *(vec_t *) & AO[(l<<1)+10]; + vec_t rowA7 = *(vec_t *) & AO[(l<<1)+12]; + vec_t rowA8 = *(vec_t *) & AO[(l<<1)+14]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12]; + vec_t rowB5 = *(vec_t *) & BO[(l<<2)+16]; + vec_t rowB6 = *(vec_t *) & BO[(l<<2)+20]; + vec_t rowB7 = *(vec_t *) & BO[(l<<2)+24]; + vec_t rowB8 = *(vec_t *) & BO[(l<<2)+28]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_12 + AO += temp<<1; + BO += temp<<2; + CO += 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 2) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + B += k << 2; + } + + if (n & 1) + { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + AO = A; + CO = C; + C += ldc << 1; + + for (i = 0; i < (m >> 3); i++) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) + { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<4)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<4)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<4)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<4)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB2); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB2); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB2); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB2); + } + for (l = (temp & (~1)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + } + SAVE_ACC_COMPLEX_21_4 + AO += temp << 4; + BO += temp << 1; + CO += 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 1) +#endif + } + if (m & 4) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) + { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<3)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<3)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<3)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<3)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB2); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB2); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB3); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB3); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB4); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB4); + } + for (l = (temp & (~3)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + } + SAVE_ACC_COMPLEX_21_2 + AO += temp << 3; + BO += temp << 1; + CO += 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 1) +#endif + } + if (m & 2) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 1) +#else + BO = B; + temp = k; +#endif + // RIP OUT MMA STUFF! + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) + { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<2)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<2)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<2)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<2)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_21_1 + AO += temp << 2; + BO += temp << 1; + CO += 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 1) +#endif + } + if (m & 1) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 1) +#else + BO = B; + temp = k; +#endif + // RIP OUT MMA STUFF! + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) + { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2]; + vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4]; + vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6]; + vec_t rowA5 = *(vec_t *) & AO[(l<<1)+8]; + vec_t rowA6 = *(vec_t *) & AO[(l<<1)+10]; + vec_t rowA7 = *(vec_t *) & AO[(l<<1)+12]; + vec_t rowA8 = *(vec_t *) & AO[(l<<1)+14]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_11 + AO += temp<<1; + BO += temp<<1; + CO += 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 1) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + B += k << 1; + } + return 0; +} diff --git a/kernel/power/zgemm_kernel_power10.c b/kernel/power/zgemm_kernel_power10.c new file mode 100644 index 000000000..e4e609067 --- /dev/null +++ b/kernel/power/zgemm_kernel_power10.c @@ -0,0 +1,761 @@ +/********************************************************************************* +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ +#include "common.h" +#include + +typedef __vector unsigned char vec_t; +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); + +#define SET_ACC_ZERO() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); \ + __builtin_mma_xxsetaccz (&acc4); \ + __builtin_mma_xxsetaccz (&acc5); \ + __builtin_mma_xxsetaccz (&acc6); \ + __builtin_mma_xxsetaccz (&acc7); + +#if (defined(NN) || defined(NT) || defined(TN) || defined(TT)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = _arbi + _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += _arbi + _aibr; } +#endif + +#if (defined(NR) || defined(NC) || defined(TR) || defined(TC)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr + _aibi; _imag = -_arbi + _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += -_arbi + _aibr; } +#endif + +#if (defined(RN) || defined(RT) || defined(CN) || defined(CT)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr + _aibi; _imag = _arbi - _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += _arbi - _aibr; } +#endif + +#if (defined(RR) || defined(RC) || defined(CR) || defined(CC)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = -_arbi - _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += -_arbi - _aibr; } +#endif + +#if defined(TRMMKERNEL) +#define A_OP = +#else +#define A_OP += +#endif + +#define BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + __builtin_mma_disassemble_acc ((void *)result, &acc0); \ + __builtin_mma_disassemble_acc ((void *)&result[4], &acc1); \ + __builtin_mma_disassemble_acc ((void *)&result[8], &acc2); \ + __builtin_mma_disassemble_acc ((void *)&result[12], &acc3); \ + __builtin_mma_disassemble_acc ((void *)&result[16], &acc4); \ + __builtin_mma_disassemble_acc ((void *)&result[20], &acc5); \ + __builtin_mma_disassemble_acc ((void *)&result[24], &acc6); \ + __builtin_mma_disassemble_acc ((void *)&result[28], &acc7); + +#define SAVE_ACC_COMPLEX_11 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10]) \ + COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ + COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42]) \ + COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ + COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; + +#define SAVE_ACC_COMPLEX_12 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MUL(tr[1], res[ 8], res[11], ti[1], res[ 9], res[10]) \ + COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ + COMP_MAC(tr[1], res[24], res[27], ti[1], res[25], res[26]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[1], res[40], res[43], ti[1], res[41], res[42]) \ + COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ + COMP_MAC(tr[1], res[56], res[59], ti[1], res[57], res[58]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_1 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ + COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10]) \ + COMP_MAC(tr[1], res[12], res[15], ti[1], res[13], res[14]) \ + COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ + COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22]) \ + COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26]) \ + COMP_MAC(tr[1], res[28], res[31], ti[1], res[29], res[30]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ + COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42]) \ + COMP_MAC(tr[1], res[44], res[47], ti[1], res[45], res[46]) \ + COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ + COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54]) \ + COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58]) \ + COMP_MAC(tr[1], res[60], res[63], ti[1], res[61], res[62]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_2 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ + COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10]) \ + COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ + COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ + COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22]) \ + COMP_MAC(tr[2], res[24], res[27], ti[2], res[25], res[26]) \ + COMP_MAC(tr[3], res[28], res[31], ti[3], res[29], res[30]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ + COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42]) \ + COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46]) \ + COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ + COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54]) \ + COMP_MAC(tr[2], res[56], res[59], ti[2], res[57], res[58]) \ + COMP_MAC(tr[3], res[60], res[63], ti[3], res[61], res[62]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_4 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ + COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10]) \ + COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ + COMP_MUL(tr[4], res[16], res[19], ti[4], res[17], res[18]) \ + COMP_MUL(tr[5], res[20], res[23], ti[5], res[21], res[22]) \ + COMP_MUL(tr[6], res[24], res[27], ti[6], res[25], res[26]) \ + COMP_MUL(tr[7], res[28], res[31], ti[7], res[29], res[30]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ + COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42]) \ + COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46]) \ + COMP_MAC(tr[4], res[48], res[51], ti[4], res[49], res[50]) \ + COMP_MAC(tr[5], res[52], res[55], ti[5], res[53], res[54]) \ + COMP_MAC(tr[6], res[56], res[59], ti[6], res[57], res[58]) \ + COMP_MAC(tr[7], res[60], res[63], ti[7], res[61], res[62]) \ + CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i; \ + CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i; \ + CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i; \ + CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i; \ + CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i; \ + CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i; \ + CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i; \ + CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i; + +#define SAVE_ACC_COMPLEX_22_1 \ + __builtin_mma_disassemble_acc ((void *)result, &acc0); \ + __builtin_mma_disassemble_acc ((void *)(&result[4]), &acc1); \ + COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2]) \ + COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6]) \ + COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10]) \ + COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14] ) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_22_2(ACC1, ACC2, CI) \ + __builtin_mma_disassemble_acc ((void *)result, ACC1); \ + __builtin_mma_disassemble_acc ((void *)(&result[4]), ACC2); \ + COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2]) \ + COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6]) \ + COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10]) \ + COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ + CO[CI+0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[CI+1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[CI+2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[CI+3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+CI+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+CI+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+CI+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+CI+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +#define REFRESH_TEMP_BK(x, y) \ + temp = k - off; +#elif defined(LEFT) +#define REFRESH_TEMP_BK(x, y) \ + temp = off + x; +#else +#define REFRESH_TEMP_BK(x, y) \ + temp = off + y; +#endif +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_POINTERS(x, y) \ + BO = B; \ + REFRESH_TEMP_BK(x, y) +#else +#define REFRESH_POINTERS(x, y) \ + AO += off * (2*x); \ + BO = B + off * (2*y); \ + REFRESH_TEMP_BK(x, y) +#endif + +#ifdef LEFT +#define REFRESH_OFF(x) \ + off += x; +#else +#define REFRESH_OFF(x) +#endif + +#ifdef LEFT +#define UPDATE_TEMP(x, y) \ + temp -= x; +#else +#define UPDATE_TEMP(x, y) \ + temp -= y; +#endif + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_TMP_AFTER_SAVE(x, y) \ + temp = k - off; \ + UPDATE_TEMP(x, y) \ + AO += temp * (2*x); \ + BO += temp * (2*y); +#else +#define REFRESH_TMP_AFTER_SAVE(x, y) +#endif + +#define REFRESH_AFTER_SAVE(x,y) \ + REFRESH_TMP_AFTER_SAVE(x, y) \ + REFRESH_OFF(x) +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ +int +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * A, FLOAT * B, + FLOAT * C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG i1, i, l, temp; + FLOAT *AO, *BO, *CO; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + v4sf_t result[32]; + FLOAT *res, tr[16], ti[16]; + res = (FLOAT *) result; + + for (i1 = 0; i1 < (n >> 1); i1++) + { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + AO = A; + CO = C; + C += ldc<<2; + for (i = 0; i < (m >> 3); i++) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < temp; ++l) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf64gerpp(&acc4, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc5, rowA2, rowB2); + __builtin_mma_xvf64gerpp(&acc6, rowA3, rowB2); + __builtin_mma_xvf64gerpp(&acc7, rowA4, rowB2); + } + __builtin_mma_disassemble_acc ((void *)result, &acc0); + __builtin_mma_disassemble_acc ((void *)(&result[ 4]), &acc1); + __builtin_mma_disassemble_acc ((void *)(&result[ 8]), &acc2); + __builtin_mma_disassemble_acc ((void *)(&result[12]), &acc3); + __builtin_mma_disassemble_acc ((void *)(&result[16]), &acc4); + __builtin_mma_disassemble_acc ((void *)(&result[20]), &acc5); + __builtin_mma_disassemble_acc ((void *)(&result[24]), &acc6); + __builtin_mma_disassemble_acc ((void *)(&result[28]), &acc7); + COMP_MUL(tr[ 0], res[ 0], res[ 3], ti[ 0], res[ 1], res[ 2]) + COMP_MUL(tr[ 1], res[ 4], res[ 7], ti[ 1], res[ 5], res[ 6]) + COMP_MUL(tr[ 2], res[ 8], res[11], ti[ 2], res[ 9], res[10]) + COMP_MUL(tr[ 3], res[12], res[15], ti[ 3], res[13], res[14]) + COMP_MUL(tr[ 4], res[16], res[19], ti[ 4], res[17], res[18]) + COMP_MUL(tr[ 5], res[20], res[23], ti[ 5], res[21], res[22]) + COMP_MUL(tr[ 6], res[24], res[27], ti[ 6], res[25], res[26]) + COMP_MUL(tr[ 7], res[28], res[31], ti[ 7], res[29], res[30]) + COMP_MUL(tr[ 8], res[32], res[35], ti[ 8], res[33], res[34]) + COMP_MUL(tr[ 9], res[36], res[39], ti[ 9], res[37], res[38]) + COMP_MUL(tr[10], res[40], res[43], ti[10], res[41], res[42]) + COMP_MUL(tr[11], res[44], res[47], ti[11], res[45], res[46]) + COMP_MUL(tr[12], res[48], res[51], ti[12], res[49], res[50]) + COMP_MUL(tr[13], res[52], res[55], ti[13], res[53], res[54]) + COMP_MUL(tr[14], res[56], res[59], ti[14], res[57], res[58]) + COMP_MUL(tr[15], res[60], res[63], ti[15], res[61], res[62]) + CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; + CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; + CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; + CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; + CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; + CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; + CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; + CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i; + CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i; + CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i; + CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i; + CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i; + CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i; + CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i; + CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i; + CO[2*ldc+ 0] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i; + CO[2*ldc+ 1] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i; + CO[2*ldc+ 2] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i; + CO[2*ldc+ 3] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i; + CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i; + CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i; + CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i; + CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i; + CO[2*ldc+ 8] A_OP tr[12] * alpha_r - ti[12] * alpha_i; + CO[2*ldc+ 9] A_OP ti[12] * alpha_r + tr[12] * alpha_i; + CO[2*ldc+10] A_OP tr[13] * alpha_r - ti[13] * alpha_i; + CO[2*ldc+11] A_OP ti[13] * alpha_r + tr[13] * alpha_i; + CO[2*ldc+12] A_OP tr[14] * alpha_r - ti[14] * alpha_i; + CO[2*ldc+13] A_OP ti[14] * alpha_r + tr[14] * alpha_i; + CO[2*ldc+14] A_OP tr[15] * alpha_r - ti[15] * alpha_i; + CO[2*ldc+15] A_OP ti[15] * alpha_r + tr[15] * alpha_i; + + AO += temp << 4; + BO += temp << 2; + CO += 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 2) +#endif + } + if (m & 4) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2); + __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB3); + __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB3); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB4); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4); + } + for (l = (temp & (~1)); l < temp; ++l) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2); + } + SAVE_ACC_COMPLEX_22_2(&acc0, &acc2, 0) + SAVE_ACC_COMPLEX_22_2(&acc1, &acc3, 4) + AO += temp << 3; + BO += temp << 2; + CO += 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 2) +#endif + } + if (m & 2) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4); + __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5); + __builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6); + __builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7); + __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8); + } + for (l = (temp & (~3)); l < temp; ++l) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); + } + SAVE_ACC_COMPLEX_22_1 + AO += temp << 2; + BO += temp << 2; + CO += 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 2) +#endif + } + if (m & 1) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 2) +#else + BO = B; + temp = k; +#endif + // RIP OUT MMA STUFF! + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4); + __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5); + __builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6); + __builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7); + __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8); + } + for (l = (temp & (~3)); l < temp; ++l) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); + } + SAVE_ACC_COMPLEX_12 + AO += temp << 1; + BO += temp << 2; + CO += 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 2) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + B += k << 2; + } + if (n & 1) + { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + AO = A; + CO = C; + C += ldc<<1; + for (i = 0; i < (m >> 3); i++) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); + __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<4)+16])); + __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<4)+20])); + __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<4)+24])); + __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<4)+28])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf64gerpp(&acc0, rowA5, rowB2); + __builtin_mma_xvf64gerpp(&acc1, rowA6, rowB2); + __builtin_mma_xvf64gerpp(&acc2, rowA7, rowB2); + __builtin_mma_xvf64gerpp(&acc3, rowA8, rowB2); + } + for (l = (temp & (~1)); l < temp; ++l) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1); + } + SAVE_ACC_COMPLEX_21_4 + + AO += temp << 4; + BO += temp << 1; + CO += 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 1) +#endif + } + if (m & 4) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12])); + __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<3)+16])); + __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<3)+20])); + __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<3)+24])); + __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<3)+28])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB2); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB2); + __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB3); + __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB3); + __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB4); + __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB4); + } + for (l = (temp & (~3)); l < temp; ++l) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + } + SAVE_ACC_COMPLEX_21_2 + AO += temp << 3; + BO += temp << 1; + CO += 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 1) +#endif + } if (m & 2) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12])); + __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<2)+16])); + __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<2)+20])); + __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<2)+24])); + __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<2)+28])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_21_1 + AO += temp << 2; + BO += temp << 1; + CO += 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 1) +#endif + } + if (m & 1) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 1) +#else + BO = B; + temp = k; +#endif + // RIP OUT MMA STUFF! + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6])); + __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<1)+8])); + __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<1)+10])); + __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<1)+12])); + __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<1)+14])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_11 + AO += temp << 1; + BO += temp << 1; + CO += 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 1) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + B += k << 1; + } + return 0; +} From 87ba528d8b12d8debf77b5b952b7c31d2ba1bd08 Mon Sep 17 00:00:00 2001 From: austinpagan Date: Thu, 1 Feb 2024 18:46:07 -0600 Subject: [PATCH 02/23] Changed C files to straighten out indentation. Removed commented lines from other file. --- kernel/power/KERNEL.POWER10 | 21 - kernel/power/cgemm_kernel_power10.c | 1725 +++++++++++++-------------- kernel/power/zgemm_kernel_power10.c | 1145 +++++++++--------- 3 files changed, 1403 insertions(+), 1488 deletions(-) diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 5f49b9c46..4e408e121 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -16,17 +16,8 @@ SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) STRMMKERNEL = sgemm_kernel_power10.c DTRMMKERNEL = dgemm_kernel_power10.c -ifeq ($(OSNAME), AIX) -#CTRMMKERNEL = ctrmm_kernel_8x4_power8.S -#ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S CTRMMKERNEL = cgemm_kernel_power10.c ZTRMMKERNEL = zgemm_kernel_power10.c -else -#CTRMMKERNEL = cgemm_kernel_power10.S -#ZTRMMKERNEL = zgemm_kernel_power10.S -CTRMMKERNEL = cgemm_kernel_power10.c -ZTRMMKERNEL = zgemm_kernel_power10.c -endif SGEMMKERNEL = sgemm_kernel_power10.c SGEMMINCOPY = ../generic/gemm_ncopy_16.c @@ -68,13 +59,7 @@ DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_tt_power10.c DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_power10.c DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c -ifeq ($(OSNAME), AIX) -#CGEMMKERNEL = cgemm_kernel_8x4_power8.S CGEMMKERNEL = cgemm_kernel_power10.c -else -#CGEMMKERNEL = cgemm_kernel_power10.S -CGEMMKERNEL = cgemm_kernel_power10.c -endif #CGEMMKERNEL = cgemm_kernel_8x4_power8.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c ifeq ($(OSNAME), AIX) @@ -89,13 +74,7 @@ CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -ifeq ($(OSNAME), AIX) -#ZGEMMKERNEL = zgemm_kernel_8x2_power8.S ZGEMMKERNEL = zgemm_kernel_power10.c -else -#ZGEMMKERNEL = zgemm_kernel_power10.S -ZGEMMKERNEL = zgemm_kernel_power10.c -endif ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c diff --git a/kernel/power/cgemm_kernel_power10.c b/kernel/power/cgemm_kernel_power10.c index 279c83aec..233768cef 100644 --- a/kernel/power/cgemm_kernel_power10.c +++ b/kernel/power/cgemm_kernel_power10.c @@ -31,15 +31,15 @@ typedef __vector unsigned char vec_t; typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); -#define SET_ACC_ZERO() \ - __builtin_mma_xxsetaccz (&acc0); \ - __builtin_mma_xxsetaccz (&acc1); \ - __builtin_mma_xxsetaccz (&acc2); \ - __builtin_mma_xxsetaccz (&acc3); \ - __builtin_mma_xxsetaccz (&acc4); \ - __builtin_mma_xxsetaccz (&acc5); \ - __builtin_mma_xxsetaccz (&acc6); \ - __builtin_mma_xxsetaccz (&acc7); +#define SET_ACC_ZERO() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); \ + __builtin_mma_xxsetaccz (&acc4); \ + __builtin_mma_xxsetaccz (&acc5); \ + __builtin_mma_xxsetaccz (&acc6); \ + __builtin_mma_xxsetaccz (&acc7); #if (defined(NN) || defined(NT) || defined(TN) || defined(TT)) #define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = _arbi + _aibr; } @@ -67,231 +67,231 @@ typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); #define A_OP += #endif -#define BUILTIN_MMA_DISASSEMBLE_ACC_8 \ - __builtin_mma_disassemble_acc ((void *)result, &acc0); \ - __builtin_mma_disassemble_acc ((void *)&result[ 4], &acc1); \ - __builtin_mma_disassemble_acc ((void *)&result[ 8], &acc2); \ - __builtin_mma_disassemble_acc ((void *)&result[12], &acc3); \ - __builtin_mma_disassemble_acc ((void *)&result[16], &acc4); \ - __builtin_mma_disassemble_acc ((void *)&result[20], &acc5); \ - __builtin_mma_disassemble_acc ((void *)&result[24], &acc6); \ - __builtin_mma_disassemble_acc ((void *)&result[28], &acc7); +#define BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + __builtin_mma_disassemble_acc ((void *)result, &acc0); \ + __builtin_mma_disassemble_acc ((void *)&result[ 4], &acc1); \ + __builtin_mma_disassemble_acc ((void *)&result[ 8], &acc2); \ + __builtin_mma_disassemble_acc ((void *)&result[12], &acc3); \ + __builtin_mma_disassemble_acc ((void *)&result[16], &acc4); \ + __builtin_mma_disassemble_acc ((void *)&result[20], &acc5); \ + __builtin_mma_disassemble_acc ((void *)&result[24], &acc6); \ + __builtin_mma_disassemble_acc ((void *)&result[28], &acc7); -#define COMP_MUL_1 \ - COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) +#define COMP_MUL_1 \ + COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) -#define COMP_MAC_1(_offset) { \ - FLOAT *_ro = &res[_offset]; \ - COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ +#define COMP_MAC_1(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ } -#define COMP_MUL_2A \ - COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) \ - COMP_MUL(tr[1], res[ 2], res[ 7], ti[1], res[ 3], res[ 6]) +#define COMP_MUL_2A \ + COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) \ + COMP_MUL(tr[1], res[ 2], res[ 7], ti[1], res[ 3], res[ 6]) -#define COMP_MAC_2A(_offset) { \ - FLOAT *_ro = &res[_offset]; \ - COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ - COMP_MAC(tr[1], _ro[ 2], _ro[ 7], ti[1], _ro[ 3], _ro[ 6]) \ +#define COMP_MAC_2A(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MAC(tr[1], _ro[ 2], _ro[ 7], ti[1], _ro[ 3], _ro[ 6]) \ } -#define COMP_MUL_2B \ - COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) \ - COMP_MUL(tr[1], res[ 8], res[13], ti[1], res[ 9], res[12]) +#define COMP_MUL_2B \ + COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) \ + COMP_MUL(tr[1], res[ 8], res[13], ti[1], res[ 9], res[12]) -#define COMP_MAC_2B(_offset) { \ - FLOAT *_ro = &res[_offset]; \ - COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ - COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ +#define COMP_MAC_2B(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ } -#define COMP_MUL_4A(_offset) { \ - FLOAT *_ro = &res[_offset]; \ - COMP_MUL(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ - COMP_MUL(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ - COMP_MUL(tr[2], _ro[16], _ro[21], ti[2], _ro[17], _ro[20]) \ - COMP_MUL(tr[3], _ro[24], _ro[29], ti[3], _ro[25], _ro[28]) \ +#define COMP_MUL_4A(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MUL(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MUL(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ + COMP_MUL(tr[2], _ro[16], _ro[21], ti[2], _ro[17], _ro[20]) \ + COMP_MUL(tr[3], _ro[24], _ro[29], ti[3], _ro[25], _ro[28]) \ } -#define COMP_MAC_4A(_offset) { \ - FLOAT *_ro = &res[_offset]; \ - COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ - COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ - COMP_MAC(tr[2], _ro[16], _ro[21], ti[2], _ro[17], _ro[20]) \ - COMP_MAC(tr[3], _ro[24], _ro[29], ti[3], _ro[25], _ro[28]) \ +#define COMP_MAC_4A(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ + COMP_MAC(tr[2], _ro[16], _ro[21], ti[2], _ro[17], _ro[20]) \ + COMP_MAC(tr[3], _ro[24], _ro[29], ti[3], _ro[25], _ro[28]) \ } -#define COMP_MUL_4B(_offset) { \ - FLOAT *_ro = &res[_offset]; \ - COMP_MUL(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ - COMP_MUL(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ - COMP_MUL(tr[2], _ro[ 2], _ro[ 7], ti[2], _ro[ 3], _ro[ 6]) \ - COMP_MUL(tr[3], _ro[10], _ro[15], ti[3], _ro[11], _ro[14]) \ +#define COMP_MUL_4B(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MUL(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MUL(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ + COMP_MUL(tr[2], _ro[ 2], _ro[ 7], ti[2], _ro[ 3], _ro[ 6]) \ + COMP_MUL(tr[3], _ro[10], _ro[15], ti[3], _ro[11], _ro[14]) \ } -#define COMP_MAC_4B(_offset) { \ - FLOAT *_ro = &res[_offset]; \ - COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ - COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ - COMP_MAC(tr[2], _ro[ 2], _ro[ 7], ti[2], _ro[ 3], _ro[ 6]) \ - COMP_MAC(tr[3], _ro[10], _ro[15], ti[3], _ro[11], _ro[14]) \ +#define COMP_MAC_4B(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ + COMP_MAC(tr[2], _ro[ 2], _ro[ 7], ti[2], _ro[ 3], _ro[ 6]) \ + COMP_MAC(tr[3], _ro[10], _ro[15], ti[3], _ro[11], _ro[14]) \ } -#define SAVE_ACC_COMPLEX_11 \ - BUILTIN_MMA_DISASSEMBLE_ACC_8 \ - COMP_MUL_1 \ - COMP_MAC_1(16) \ - COMP_MAC_1(32) \ - COMP_MAC_1(48) \ - COMP_MAC_1(64) \ - COMP_MAC_1(80) \ - COMP_MAC_1(96) \ - COMP_MAC_1(112) \ - CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; +#define SAVE_ACC_COMPLEX_11 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_1 \ + COMP_MAC_1(16) \ + COMP_MAC_1(32) \ + COMP_MAC_1(48) \ + COMP_MAC_1(64) \ + COMP_MAC_1(80) \ + COMP_MAC_1(96) \ + COMP_MAC_1(112) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; -#define SAVE_ACC_COMPLEX_12 \ - BUILTIN_MMA_DISASSEMBLE_ACC_8 \ - COMP_MUL_2A \ - COMP_MAC_2A(16) \ - COMP_MAC_2A(32) \ - COMP_MAC_2A(48) \ - COMP_MAC_2A(64) \ - COMP_MAC_2A(80) \ - COMP_MAC_2A(96) \ - COMP_MAC_2A(112) \ - CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i; +#define SAVE_ACC_COMPLEX_12 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_2A \ + COMP_MAC_2A(16) \ + COMP_MAC_2A(32) \ + COMP_MAC_2A(48) \ + COMP_MAC_2A(64) \ + COMP_MAC_2A(80) \ + COMP_MAC_2A(96) \ + COMP_MAC_2A(112) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i; -#define SAVE_ACC_COMPLEX_21_1 \ - BUILTIN_MMA_DISASSEMBLE_ACC_8 \ - COMP_MUL_2B \ - COMP_MAC_2B(16) \ - COMP_MAC_2B(32) \ - COMP_MAC_2B(48) \ - COMP_MAC_2B(64) \ - COMP_MAC_2B(80) \ - COMP_MAC_2B(96) \ - COMP_MAC_2B(112) \ - CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; +#define SAVE_ACC_COMPLEX_21_1 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_2B \ + COMP_MAC_2B(16) \ + COMP_MAC_2B(32) \ + COMP_MAC_2B(48) \ + COMP_MAC_2B(64) \ + COMP_MAC_2B(80) \ + COMP_MAC_2B(96) \ + COMP_MAC_2B(112) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; -#define SAVE_ACC_COMPLEX_21_2 \ - BUILTIN_MMA_DISASSEMBLE_ACC_8 \ - COMP_MUL_4A(0) \ - COMP_MAC_4A(32) \ - COMP_MAC_4A(64) \ - COMP_MAC_4A(96) \ - CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ - CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ - CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ - CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ - CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; +#define SAVE_ACC_COMPLEX_21_2 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4A(0) \ + COMP_MAC_4A(32) \ + COMP_MAC_4A(64) \ + COMP_MAC_4A(96) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; -#define SAVE_ACC_COMPLEX_21_4 \ - BUILTIN_MMA_DISASSEMBLE_ACC_8 \ - COMP_MUL_4A(0) \ - COMP_MAC_4A(64) \ - CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ - CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ - CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ - CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ - CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ - COMP_MUL_4A(32) \ - COMP_MAC_4A(96) \ - CO[ 8] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[ 9] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[10] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[11] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ - CO[12] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ - CO[13] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ - CO[14] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ - CO[15] A_OP ti[3] * alpha_r + tr[3] * alpha_i; +#define SAVE_ACC_COMPLEX_21_4 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4A(0) \ + COMP_MAC_4A(64) \ + CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4A(32) \ + COMP_MAC_4A(96) \ + CO[ 8] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 9] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[10] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[11] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[12] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[13] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[14] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[15] A_OP ti[3] * alpha_r + tr[3] * alpha_i; -#define SAVE_ACC_COMPLEX_22_4 \ - BUILTIN_MMA_DISASSEMBLE_ACC_8 \ - COMP_MUL_4B(0) \ - CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ - CO[2*ldc+ 0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ - CO[2*ldc+ 1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ - CO[2*ldc+ 2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ - CO[2*ldc+ 3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ - COMP_MUL_4B(16) \ - CO[ 4] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[ 5] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[ 6] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[ 7] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ - CO[2*ldc+ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ - CO[2*ldc+ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ - CO[2*ldc+ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ - CO[2*ldc+ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ - COMP_MUL_4B(32) \ - CO[ 8] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[ 9] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[10] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[11] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ - CO[2*ldc+ 8] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ - CO[2*ldc+ 9] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ - CO[2*ldc+10] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ - CO[2*ldc+11] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ - COMP_MUL_4B(48) \ - CO[12] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[13] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[14] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[15] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ - CO[2*ldc+12] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ - CO[2*ldc+13] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ - CO[2*ldc+14] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ - CO[2*ldc+15] A_OP ti[3] * alpha_r + tr[3] * alpha_i; +#define SAVE_ACC_COMPLEX_22_4 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4B(0) \ + CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+ 0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+ 1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+ 2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+ 3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4B(16) \ + CO[ 4] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 5] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[ 6] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[ 7] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4B(32) \ + CO[ 8] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 9] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[10] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[11] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+ 8] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+ 9] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+10] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+11] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4B(48) \ + CO[12] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[13] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[14] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[15] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+12] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+13] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+14] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+15] A_OP ti[3] * alpha_r + tr[3] * alpha_i; -#define SAVE_ACC_COMPLEX_22_2 \ - BUILTIN_MMA_DISASSEMBLE_ACC_8 \ - COMP_MUL_4B(0) \ - CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ - CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ - CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ - CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ - CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ - COMP_MUL_4B(16) \ - CO[4] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[5] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[6] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[7] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ - CO[2*ldc+4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ - CO[2*ldc+5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ - CO[2*ldc+6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ - CO[2*ldc+7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; +#define SAVE_ACC_COMPLEX_22_2 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4B(0) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4B(16) \ + CO[4] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[5] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[6] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[7] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; -#define SAVE_ACC_COMPLEX_22_1 \ - BUILTIN_MMA_DISASSEMBLE_ACC_8 \ - COMP_MUL_4B(0) \ - CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ - CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ - CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ - CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ - CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; +#define SAVE_ACC_COMPLEX_22_1 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4B(0) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; -#define SAVE_ACC_COMPLEX_24_ALL \ +#define SAVE_ACC_COMPLEX_24_ALL \ __builtin_mma_disassemble_acc ((void *)result, &acc0); \ __builtin_mma_disassemble_acc ((void *)(&result[4]), &acc4); \ __builtin_mma_disassemble_acc ((void *)(&result[8]), &acc1); \ @@ -332,205 +332,205 @@ typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); COMP_MUL(tr[29], res[120], res[125], ti[29], res[121], res[124]) \ COMP_MUL(tr[30], res[114], res[119], ti[30], res[115], res[118]) \ COMP_MUL(tr[31], res[122], res[127], ti[31], res[123], res[126]) \ - CO[ 0] A_OP tr[ 0] * alpha_r - ti[ 0] * alpha_i; \ - CO[ 1] A_OP ti[ 0] * alpha_r + tr[ 0] * alpha_i; \ - CO[ 2] A_OP tr[ 1] * alpha_r - ti[ 1] * alpha_i; \ - CO[ 3] A_OP ti[ 1] * alpha_r + tr[ 1] * alpha_i; \ - CO[2*ldc+ 0] A_OP tr[ 2] * alpha_r - ti[ 2] * alpha_i; \ - CO[2*ldc+ 1] A_OP ti[ 2] * alpha_r + tr[ 2] * alpha_i; \ - CO[2*ldc+ 2] A_OP tr[ 3] * alpha_r - ti[ 3] * alpha_i; \ - CO[2*ldc+ 3] A_OP ti[ 3] * alpha_r + tr[ 3] * alpha_i; \ - CO[4*ldc+ 0] A_OP tr[ 4] * alpha_r - ti[ 4] * alpha_i; \ - CO[4*ldc+ 1] A_OP ti[ 4] * alpha_r + tr[ 4] * alpha_i; \ - CO[4*ldc+ 2] A_OP tr[ 5] * alpha_r - ti[ 5] * alpha_i; \ - CO[4*ldc+ 3] A_OP ti[ 5] * alpha_r + tr[ 5] * alpha_i; \ - CO[6*ldc+ 0] A_OP tr[ 6] * alpha_r - ti[ 6] * alpha_i; \ - CO[6*ldc+ 1] A_OP ti[ 6] * alpha_r + tr[ 6] * alpha_i; \ - CO[6*ldc+ 2] A_OP tr[ 7] * alpha_r - ti[ 7] * alpha_i; \ - CO[6*ldc+ 3] A_OP ti[ 7] * alpha_r + tr[ 7] * alpha_i; \ - CO[ 4] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i; \ - CO[ 5] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i; \ - CO[ 6] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i; \ - CO[ 7] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i; \ - CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i; \ - CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i; \ - CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i; \ - CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i; \ - CO[4*ldc+ 4] A_OP tr[12] * alpha_r - ti[12] * alpha_i; \ - CO[4*ldc+ 5] A_OP ti[12] * alpha_r + tr[12] * alpha_i; \ - CO[4*ldc+ 6] A_OP tr[13] * alpha_r - ti[13] * alpha_i; \ - CO[4*ldc+ 7] A_OP ti[13] * alpha_r + tr[13] * alpha_i; \ - CO[6*ldc+ 4] A_OP tr[14] * alpha_r - ti[14] * alpha_i; \ - CO[6*ldc+ 5] A_OP ti[14] * alpha_r + tr[14] * alpha_i; \ - CO[6*ldc+ 6] A_OP tr[15] * alpha_r - ti[15] * alpha_i; \ - CO[6*ldc+ 7] A_OP ti[15] * alpha_r + tr[15] * alpha_i; \ - CO[ 8] A_OP tr[16] * alpha_r - ti[16] * alpha_i; \ - CO[ 9] A_OP ti[16] * alpha_r + tr[16] * alpha_i; \ - CO[ 10] A_OP tr[17] * alpha_r - ti[17] * alpha_i; \ - CO[ 11] A_OP ti[17] * alpha_r + tr[17] * alpha_i; \ - CO[2*ldc+ 8] A_OP tr[18] * alpha_r - ti[18] * alpha_i; \ - CO[2*ldc+ 9] A_OP ti[18] * alpha_r + tr[18] * alpha_i; \ - CO[2*ldc+10] A_OP tr[19] * alpha_r - ti[19] * alpha_i; \ - CO[2*ldc+11] A_OP ti[19] * alpha_r + tr[19] * alpha_i; \ - CO[4*ldc+ 8] A_OP tr[20] * alpha_r - ti[20] * alpha_i; \ - CO[4*ldc+ 9] A_OP ti[20] * alpha_r + tr[20] * alpha_i; \ - CO[4*ldc+10] A_OP tr[21] * alpha_r - ti[21] * alpha_i; \ - CO[4*ldc+11] A_OP ti[21] * alpha_r + tr[21] * alpha_i; \ - CO[6*ldc+ 8] A_OP tr[22] * alpha_r - ti[22] * alpha_i; \ - CO[6*ldc+ 9] A_OP ti[22] * alpha_r + tr[22] * alpha_i; \ - CO[6*ldc+10] A_OP tr[23] * alpha_r - ti[23] * alpha_i; \ - CO[6*ldc+11] A_OP ti[23] * alpha_r + tr[23] * alpha_i; \ - CO[ 12] A_OP tr[24] * alpha_r - ti[24] * alpha_i; \ - CO[ 13] A_OP ti[24] * alpha_r + tr[24] * alpha_i; \ - CO[ 14] A_OP tr[25] * alpha_r - ti[25] * alpha_i; \ - CO[ 15] A_OP ti[25] * alpha_r + tr[25] * alpha_i; \ - CO[2*ldc+12] A_OP tr[26] * alpha_r - ti[26] * alpha_i; \ - CO[2*ldc+13] A_OP ti[26] * alpha_r + tr[26] * alpha_i; \ - CO[2*ldc+14] A_OP tr[27] * alpha_r - ti[27] * alpha_i; \ - CO[2*ldc+15] A_OP ti[27] * alpha_r + tr[27] * alpha_i; \ - CO[4*ldc+12] A_OP tr[28] * alpha_r - ti[28] * alpha_i; \ - CO[4*ldc+13] A_OP ti[28] * alpha_r + tr[28] * alpha_i; \ - CO[4*ldc+14] A_OP tr[29] * alpha_r - ti[29] * alpha_i; \ - CO[4*ldc+15] A_OP ti[29] * alpha_r + tr[29] * alpha_i; \ - CO[6*ldc+12] A_OP tr[30] * alpha_r - ti[30] * alpha_i; \ - CO[6*ldc+13] A_OP ti[30] * alpha_r + tr[30] * alpha_i; \ - CO[6*ldc+14] A_OP tr[31] * alpha_r - ti[31] * alpha_i; \ - CO[6*ldc+15] A_OP ti[31] * alpha_r + tr[31] * alpha_i; + CO[ 0] A_OP tr[ 0] * alpha_r - ti[ 0] * alpha_i; \ + CO[ 1] A_OP ti[ 0] * alpha_r + tr[ 0] * alpha_i; \ + CO[ 2] A_OP tr[ 1] * alpha_r - ti[ 1] * alpha_i; \ + CO[ 3] A_OP ti[ 1] * alpha_r + tr[ 1] * alpha_i; \ + CO[2*ldc+ 0] A_OP tr[ 2] * alpha_r - ti[ 2] * alpha_i; \ + CO[2*ldc+ 1] A_OP ti[ 2] * alpha_r + tr[ 2] * alpha_i; \ + CO[2*ldc+ 2] A_OP tr[ 3] * alpha_r - ti[ 3] * alpha_i; \ + CO[2*ldc+ 3] A_OP ti[ 3] * alpha_r + tr[ 3] * alpha_i; \ + CO[4*ldc+ 0] A_OP tr[ 4] * alpha_r - ti[ 4] * alpha_i; \ + CO[4*ldc+ 1] A_OP ti[ 4] * alpha_r + tr[ 4] * alpha_i; \ + CO[4*ldc+ 2] A_OP tr[ 5] * alpha_r - ti[ 5] * alpha_i; \ + CO[4*ldc+ 3] A_OP ti[ 5] * alpha_r + tr[ 5] * alpha_i; \ + CO[6*ldc+ 0] A_OP tr[ 6] * alpha_r - ti[ 6] * alpha_i; \ + CO[6*ldc+ 1] A_OP ti[ 6] * alpha_r + tr[ 6] * alpha_i; \ + CO[6*ldc+ 2] A_OP tr[ 7] * alpha_r - ti[ 7] * alpha_i; \ + CO[6*ldc+ 3] A_OP ti[ 7] * alpha_r + tr[ 7] * alpha_i; \ + CO[ 4] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i; \ + CO[ 5] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i; \ + CO[ 6] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i; \ + CO[ 7] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i; \ + CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i; \ + CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i; \ + CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i; \ + CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i; \ + CO[4*ldc+ 4] A_OP tr[12] * alpha_r - ti[12] * alpha_i; \ + CO[4*ldc+ 5] A_OP ti[12] * alpha_r + tr[12] * alpha_i; \ + CO[4*ldc+ 6] A_OP tr[13] * alpha_r - ti[13] * alpha_i; \ + CO[4*ldc+ 7] A_OP ti[13] * alpha_r + tr[13] * alpha_i; \ + CO[6*ldc+ 4] A_OP tr[14] * alpha_r - ti[14] * alpha_i; \ + CO[6*ldc+ 5] A_OP ti[14] * alpha_r + tr[14] * alpha_i; \ + CO[6*ldc+ 6] A_OP tr[15] * alpha_r - ti[15] * alpha_i; \ + CO[6*ldc+ 7] A_OP ti[15] * alpha_r + tr[15] * alpha_i; \ + CO[ 8] A_OP tr[16] * alpha_r - ti[16] * alpha_i; \ + CO[ 9] A_OP ti[16] * alpha_r + tr[16] * alpha_i; \ + CO[ 10] A_OP tr[17] * alpha_r - ti[17] * alpha_i; \ + CO[ 11] A_OP ti[17] * alpha_r + tr[17] * alpha_i; \ + CO[2*ldc+ 8] A_OP tr[18] * alpha_r - ti[18] * alpha_i; \ + CO[2*ldc+ 9] A_OP ti[18] * alpha_r + tr[18] * alpha_i; \ + CO[2*ldc+10] A_OP tr[19] * alpha_r - ti[19] * alpha_i; \ + CO[2*ldc+11] A_OP ti[19] * alpha_r + tr[19] * alpha_i; \ + CO[4*ldc+ 8] A_OP tr[20] * alpha_r - ti[20] * alpha_i; \ + CO[4*ldc+ 9] A_OP ti[20] * alpha_r + tr[20] * alpha_i; \ + CO[4*ldc+10] A_OP tr[21] * alpha_r - ti[21] * alpha_i; \ + CO[4*ldc+11] A_OP ti[21] * alpha_r + tr[21] * alpha_i; \ + CO[6*ldc+ 8] A_OP tr[22] * alpha_r - ti[22] * alpha_i; \ + CO[6*ldc+ 9] A_OP ti[22] * alpha_r + tr[22] * alpha_i; \ + CO[6*ldc+10] A_OP tr[23] * alpha_r - ti[23] * alpha_i; \ + CO[6*ldc+11] A_OP ti[23] * alpha_r + tr[23] * alpha_i; \ + CO[ 12] A_OP tr[24] * alpha_r - ti[24] * alpha_i; \ + CO[ 13] A_OP ti[24] * alpha_r + tr[24] * alpha_i; \ + CO[ 14] A_OP tr[25] * alpha_r - ti[25] * alpha_i; \ + CO[ 15] A_OP ti[25] * alpha_r + tr[25] * alpha_i; \ + CO[2*ldc+12] A_OP tr[26] * alpha_r - ti[26] * alpha_i; \ + CO[2*ldc+13] A_OP ti[26] * alpha_r + tr[26] * alpha_i; \ + CO[2*ldc+14] A_OP tr[27] * alpha_r - ti[27] * alpha_i; \ + CO[2*ldc+15] A_OP ti[27] * alpha_r + tr[27] * alpha_i; \ + CO[4*ldc+12] A_OP tr[28] * alpha_r - ti[28] * alpha_i; \ + CO[4*ldc+13] A_OP ti[28] * alpha_r + tr[28] * alpha_i; \ + CO[4*ldc+14] A_OP tr[29] * alpha_r - ti[29] * alpha_i; \ + CO[4*ldc+15] A_OP ti[29] * alpha_r + tr[29] * alpha_i; \ + CO[6*ldc+12] A_OP tr[30] * alpha_r - ti[30] * alpha_i; \ + CO[6*ldc+13] A_OP ti[30] * alpha_r + tr[30] * alpha_i; \ + CO[6*ldc+14] A_OP tr[31] * alpha_r - ti[31] * alpha_i; \ + CO[6*ldc+15] A_OP ti[31] * alpha_r + tr[31] * alpha_i; -#define SAVE_ACC_COMPLEX_24(ACC1, ACC2, CI) \ - __builtin_mma_disassemble_acc ((void *)result, ACC1); \ - __builtin_mma_disassemble_acc ((void *)(&result[4]), ACC2); \ - COMP_MUL(tr[0], res[0], res[5], ti[0], res[1], res[4]) \ - COMP_MUL(tr[1], res[8], res[13], ti[1], res[9], res[12]) \ - COMP_MUL(tr[2], res[2], res[7], ti[2], res[3], res[6]) \ - COMP_MUL(tr[3], res[10], res[15], ti[3], res[11], res[14]) \ - COMP_MUL(tr[4], res[16], res[21], ti[4], res[17], res[20]) \ - COMP_MUL(tr[5], res[24], res[29], ti[5], res[25], res[28]) \ - COMP_MUL(tr[6], res[18], res[23], ti[6], res[19], res[22]) \ - COMP_MUL(tr[7], res[26], res[31], ti[7], res[27], res[30]) \ - CO[CI+0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[CI+1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[CI+2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[CI+3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ - CO[CI+2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ - CO[CI+2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ - CO[CI+2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ - CO[CI+2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ - CO[CI+4*ldc+0] A_OP tr[4] * alpha_r - ti[4] * alpha_i; \ - CO[CI+4*ldc+1] A_OP ti[4] * alpha_r + tr[4] * alpha_i; \ - CO[CI+4*ldc+2] A_OP tr[5] * alpha_r - ti[5] * alpha_i; \ - CO[CI+4*ldc+3] A_OP ti[5] * alpha_r + tr[5] * alpha_i; \ - CO[CI+6*ldc+0] A_OP tr[6] * alpha_r - ti[6] * alpha_i; \ - CO[CI+6*ldc+1] A_OP ti[6] * alpha_r + tr[6] * alpha_i; \ - CO[CI+6*ldc+2] A_OP tr[7] * alpha_r - ti[7] * alpha_i; \ - CO[CI+6*ldc+3] A_OP ti[7] * alpha_r + tr[7] * alpha_i; +#define SAVE_ACC_COMPLEX_24(ACC1, ACC2, CI) \ + __builtin_mma_disassemble_acc ((void *)result, ACC1); \ + __builtin_mma_disassemble_acc ((void *)(&result[4]), ACC2); \ + COMP_MUL(tr[0], res[0], res[5], ti[0], res[1], res[4]) \ + COMP_MUL(tr[1], res[8], res[13], ti[1], res[9], res[12]) \ + COMP_MUL(tr[2], res[2], res[7], ti[2], res[3], res[6]) \ + COMP_MUL(tr[3], res[10], res[15], ti[3], res[11], res[14]) \ + COMP_MUL(tr[4], res[16], res[21], ti[4], res[17], res[20]) \ + COMP_MUL(tr[5], res[24], res[29], ti[5], res[25], res[28]) \ + COMP_MUL(tr[6], res[18], res[23], ti[6], res[19], res[22]) \ + COMP_MUL(tr[7], res[26], res[31], ti[7], res[27], res[30]) \ + CO[CI+0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[CI+1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[CI+2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[CI+3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[CI+2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[CI+2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[CI+2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[CI+2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + CO[CI+4*ldc+0] A_OP tr[4] * alpha_r - ti[4] * alpha_i; \ + CO[CI+4*ldc+1] A_OP ti[4] * alpha_r + tr[4] * alpha_i; \ + CO[CI+4*ldc+2] A_OP tr[5] * alpha_r - ti[5] * alpha_i; \ + CO[CI+4*ldc+3] A_OP ti[5] * alpha_r + tr[5] * alpha_i; \ + CO[CI+6*ldc+0] A_OP tr[6] * alpha_r - ti[6] * alpha_i; \ + CO[CI+6*ldc+1] A_OP ti[6] * alpha_r + tr[6] * alpha_i; \ + CO[CI+6*ldc+2] A_OP tr[7] * alpha_r - ti[7] * alpha_i; \ + CO[CI+6*ldc+3] A_OP ti[7] * alpha_r + tr[7] * alpha_i; -#define SAVE_ACC_COMPLEX_14 \ - BUILTIN_MMA_DISASSEMBLE_ACC_8 \ - COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) \ - COMP_MUL(tr[1], res[ 2], res[ 7], ti[1], res[ 3], res[ 6]) \ - COMP_MUL(tr[2], res[ 16], res[ 21], ti[2], res[ 17], res[ 20]) \ - COMP_MUL(tr[3], res[ 18], res[ 23], ti[3], res[ 19], res[ 22]) \ - COMP_MAC(tr[0], res[ 32], res[ 37], ti[0], res[ 33], res[ 36]) \ - COMP_MAC(tr[1], res[ 34], res[ 39], ti[1], res[ 35], res[ 38]) \ - COMP_MAC(tr[2], res[ 48], res[ 53], ti[2], res[ 49], res[ 52]) \ - COMP_MAC(tr[3], res[ 50], res[ 55], ti[3], res[ 51], res[ 54]) \ - COMP_MAC(tr[0], res[ 64], res[ 69], ti[0], res[ 65], res[ 68]) \ - COMP_MAC(tr[1], res[ 66], res[ 71], ti[1], res[ 67], res[ 70]) \ - COMP_MAC(tr[2], res[ 80], res[ 85], ti[2], res[ 81], res[ 84]) \ - COMP_MAC(tr[3], res[ 82], res[ 87], ti[3], res[ 83], res[ 86]) \ - COMP_MAC(tr[0], res[ 96], res[101], ti[0], res[ 97], res[100]) \ - COMP_MAC(tr[1], res[ 98], res[103], ti[1], res[ 99], res[102]) \ - COMP_MAC(tr[2], res[112], res[117], ti[2], res[113], res[116]) \ - COMP_MAC(tr[3], res[114], res[119], ti[3], res[115], res[118]) \ - CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ - CO[4*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ - CO[4*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ - CO[6*ldc+0] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ - CO[6*ldc+1] A_OP ti[3] * alpha_r + tr[3] * alpha_i; +#define SAVE_ACC_COMPLEX_14 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) \ + COMP_MUL(tr[1], res[ 2], res[ 7], ti[1], res[ 3], res[ 6]) \ + COMP_MUL(tr[2], res[ 16], res[ 21], ti[2], res[ 17], res[ 20]) \ + COMP_MUL(tr[3], res[ 18], res[ 23], ti[3], res[ 19], res[ 22]) \ + COMP_MAC(tr[0], res[ 32], res[ 37], ti[0], res[ 33], res[ 36]) \ + COMP_MAC(tr[1], res[ 34], res[ 39], ti[1], res[ 35], res[ 38]) \ + COMP_MAC(tr[2], res[ 48], res[ 53], ti[2], res[ 49], res[ 52]) \ + COMP_MAC(tr[3], res[ 50], res[ 55], ti[3], res[ 51], res[ 54]) \ + COMP_MAC(tr[0], res[ 64], res[ 69], ti[0], res[ 65], res[ 68]) \ + COMP_MAC(tr[1], res[ 66], res[ 71], ti[1], res[ 67], res[ 70]) \ + COMP_MAC(tr[2], res[ 80], res[ 85], ti[2], res[ 81], res[ 84]) \ + COMP_MAC(tr[3], res[ 82], res[ 87], ti[3], res[ 83], res[ 86]) \ + COMP_MAC(tr[0], res[ 96], res[101], ti[0], res[ 97], res[100]) \ + COMP_MAC(tr[1], res[ 98], res[103], ti[1], res[ 99], res[102]) \ + COMP_MAC(tr[2], res[112], res[117], ti[2], res[113], res[116]) \ + COMP_MAC(tr[3], res[114], res[119], ti[3], res[115], res[118]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[4*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[4*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[6*ldc+0] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[6*ldc+1] A_OP ti[3] * alpha_r + tr[3] * alpha_i; #define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) #define REFRESH_TEMP_BK(x, y) \ - temp = k - off; + temp = k - off; #elif defined(LEFT) #define REFRESH_TEMP_BK(x, y) \ - temp = off + x; + temp = off + x; #else #define REFRESH_TEMP_BK(x, y) \ - temp = off + y; + temp = off + y; #endif #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #define REFRESH_POINTERS(x, y) \ - BO = B; \ - REFRESH_TEMP_BK(x, y) + BO = B; \ + REFRESH_TEMP_BK(x, y) #else #define REFRESH_POINTERS(x, y) \ - AO += off * (2*x); \ - BO = B + off * (2*y); \ - REFRESH_TEMP_BK(x, y) + AO += off * (2*x); \ + BO = B + off * (2*y); \ + REFRESH_TEMP_BK(x, y) #endif #ifdef LEFT #define REFRESH_OFF(x) \ - off += x; + off += x; #else #define REFRESH_OFF(x) #endif #ifdef LEFT #define UPDATE_TEMP(x, y) \ - temp -= x; + temp -= x; #else #define UPDATE_TEMP(x, y) \ - temp -= y; + temp -= y; #endif #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #define REFRESH_TMP_AFTER_SAVE(x, y) \ - temp = k - off; \ - UPDATE_TEMP(x, y) \ - AO += temp * (2*x); \ - BO += temp * (2*y); + temp = k - off; \ + UPDATE_TEMP(x, y) \ + AO += temp * (2*x); \ + BO += temp * (2*y); #else #define REFRESH_TMP_AFTER_SAVE(x, y) #endif -#define REFRESH_AFTER_SAVE(x,y) \ - REFRESH_TMP_AFTER_SAVE(x, y) \ - REFRESH_OFF(x) +#define REFRESH_AFTER_SAVE(x,y) \ + REFRESH_TMP_AFTER_SAVE(x, y) \ + REFRESH_OFF(x) /************************************************************************************* * GEMM Kernel *************************************************************************************/ int -CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * A, FLOAT * B, - FLOAT * C, BLASLONG ldc #ifdef TRMMKERNEL - , BLASLONG offset +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, + FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc, BLASLONG offset) +#else +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, + FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc) #endif - ) { - BLASLONG i1, i, l, temp; - FLOAT *AO, *BO, *CO; + BLASLONG i1, i, l, temp; + FLOAT *AO, *BO, *CO; #if defined(TRMMKERNEL) - BLASLONG off; + BLASLONG off; #endif #if defined(TRMMKERNEL) && !defined(LEFT) - off = -offset; + off = -offset; #endif - __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; - v4sf_t result[32]; - FLOAT *res, tr[64], ti[64]; - res = (FLOAT *) result; + v4sf_t result[32]; + FLOAT *res, tr[64], ti[64]; + res = (FLOAT *) result; - for (i1 = 0; i1 < (n >> 2); i1++) - { + for (i1 = 0; i1 < (n >> 2); i1++) { #if defined(TRMMKERNEL) && defined(LEFT) off = offset; #endif @@ -538,193 +538,181 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * CO = C; C += ldc << 3; - for (i = 0; i < (m >> 3); i++) - { + for (i = 0; i < (m >> 3); i++) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (8, 4); + REFRESH_POINTERS (8, 4); #else - BO = B; - temp = k; + BO = B; + temp = k; #endif - SET_ACC_ZERO() - for (l = 0; l < temp; ++l) - { - vec_t rowA1 = *(vec_t *) & AO[l<<4]; - vec_t rowB1 = *(vec_t *) & BO[l<<3]; - vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; - vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; - vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; - vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); - __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); - __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); - __builtin_mma_xvf32gerpp(&acc4, rowA1, rowB2); - __builtin_mma_xvf32gerpp(&acc5, rowA2, rowB2); - __builtin_mma_xvf32gerpp(&acc6, rowA3, rowB2); - __builtin_mma_xvf32gerpp(&acc7, rowA4, rowB2); - } + SET_ACC_ZERO() + for (l = 0; l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf32gerpp(&acc4, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc5, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc6, rowA3, rowB2); + __builtin_mma_xvf32gerpp(&acc7, rowA4, rowB2); + } SAVE_ACC_COMPLEX_24_ALL - CO += 16; - AO += temp << 4; - BO += temp << 3; + CO += 16; + AO += temp << 4; + BO += temp << 3; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (8, 4) + REFRESH_AFTER_SAVE (8, 4) #endif - } - if (m & 4) - { + } + if (m & 4) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (4, 4); + REFRESH_POINTERS (4, 4); #else - BO = B; - temp = k; + BO = B; + temp = k; #endif - SET_ACC_ZERO() - for (l = 0; l < (temp & (~1)); l+=2) - { - vec_t rowA1 = *(vec_t *) & AO[l<<3]; - vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; - vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8]; - vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12]; - vec_t rowB1 = *(vec_t *) & BO[l<<3]; - vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; - vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8]; - vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); - __builtin_mma_xvf32gerpp(&acc2, rowA1, rowB2); - __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB2); - __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB3); - __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB3); - __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB4); - __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); - } - for (l = (temp & (~1)); l < temp; ++l) - { - vec_t rowA1 = *(vec_t *) & AO[l<<3]; - vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; - vec_t rowB1 = *(vec_t *) & BO[l<<3]; - vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); - __builtin_mma_xvf32gerpp(&acc2, rowA1, rowB2); - __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB2); - } - SAVE_ACC_COMPLEX_24(&acc0, &acc2, 0) - SAVE_ACC_COMPLEX_24(&acc1, &acc3, 4) - CO += 8; - AO += temp << 3; - BO += temp << 3; + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB3); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB4); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); + } + for (l = (temp & (~1)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB2); + } + SAVE_ACC_COMPLEX_24(&acc0, &acc2, 0) + SAVE_ACC_COMPLEX_24(&acc1, &acc3, 4) + CO += 8; + AO += temp << 3; + BO += temp << 3; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (4, 4) + REFRESH_AFTER_SAVE (4, 4) #endif - } - if (m & 2) - { + } + if (m & 2) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (2, 4); + REFRESH_POINTERS (2, 4); #else - BO = B; - temp = k; + BO = B; + temp = k; #endif - SET_ACC_ZERO() - for (l = 0; l < (temp & (~3)); l+=4) - { - vec_t rowA1 = *(vec_t *) & AO[l<<2]; - vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4]; - vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8]; - vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12]; - vec_t rowB1 = *(vec_t *) & BO[l<<3]; - vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; - vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8]; - vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12]; - vec_t rowB5 = *(vec_t *) & BO[(l<<3)+16]; - vec_t rowB6 = *(vec_t *) & BO[(l<<3)+20]; - vec_t rowB7 = *(vec_t *) & BO[(l<<3)+24]; - vec_t rowB8 = *(vec_t *) & BO[(l<<3)+28]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); - __builtin_mma_xvf32gerpp(&acc0, rowA2, rowB3); - __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB4); - __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB5); - __builtin_mma_xvf32gerpp(&acc1, rowA3, rowB6); - __builtin_mma_xvf32gerpp(&acc0, rowA4, rowB7); - __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB8); - } - for (l = (temp & (~3)); l < temp; ++l) - { - vec_t rowA1 = *(vec_t *) & AO[l<<2]; - vec_t rowB1 = *(vec_t *) & BO[l<<3]; - vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); - } - SAVE_ACC_COMPLEX_24(&acc0, &acc1, 0) - CO += 4; - AO += temp << 2; - BO += temp << 3; + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12]; + vec_t rowB5 = *(vec_t *) & BO[(l<<3)+16]; + vec_t rowB6 = *(vec_t *) & BO[(l<<3)+20]; + vec_t rowB7 = *(vec_t *) & BO[(l<<3)+24]; + vec_t rowB8 = *(vec_t *) & BO[(l<<3)+28]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc0, rowA2, rowB3); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB4); + __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB5); + __builtin_mma_xvf32gerpp(&acc1, rowA3, rowB6); + __builtin_mma_xvf32gerpp(&acc0, rowA4, rowB7); + __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB8); + } + for (l = (temp & (~3)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); + } + SAVE_ACC_COMPLEX_24(&acc0, &acc1, 0) + CO += 4; + AO += temp << 2; + BO += temp << 3; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (2, 4) + REFRESH_AFTER_SAVE (2, 4) #endif - } - if (m & 1) - { + } + if (m & 1) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (1, 4) + REFRESH_POINTERS (1, 4) #else - BO = B; - temp = k; + BO = B; + temp = k; #endif - SET_ACC_ZERO() - for (l = 0; l < (temp & (~3)); l+=4) - { - vec_t rowA1 = *(vec_t *) & AO[l<<1]; - vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2]; - vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4]; - vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6]; - vec_t rowB1 = *(vec_t *) & BO[l<<3]; - vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; - vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8]; - vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12]; - vec_t rowB5 = *(vec_t *) & BO[(l<<3)+16]; - vec_t rowB6 = *(vec_t *) & BO[(l<<3)+20]; - vec_t rowB7 = *(vec_t *) & BO[(l<<3)+24]; - vec_t rowB8 = *(vec_t *) & BO[(l<<3)+28]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); - __builtin_mma_xvf32gerpp(&acc2, rowA2, rowB3); - __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB4); - __builtin_mma_xvf32gerpp(&acc4, rowA3, rowB5); - __builtin_mma_xvf32gerpp(&acc5, rowA3, rowB6); - __builtin_mma_xvf32gerpp(&acc6, rowA4, rowB7); - __builtin_mma_xvf32gerpp(&acc7, rowA4, rowB8); - } - for (l = (temp & (~3)); l < temp; ++l) - { - vec_t rowA1 = *(vec_t *) & AO[l<<1]; - vec_t rowB1 = *(vec_t *) & BO[l<<3]; - vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); - } - SAVE_ACC_COMPLEX_14 - CO += 2; - AO += temp << 1; - BO += temp << 3; + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2]; + vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4]; + vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12]; + vec_t rowB5 = *(vec_t *) & BO[(l<<3)+16]; + vec_t rowB6 = *(vec_t *) & BO[(l<<3)+20]; + vec_t rowB7 = *(vec_t *) & BO[(l<<3)+24]; + vec_t rowB8 = *(vec_t *) & BO[(l<<3)+28]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA2, rowB3); + __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB4); + __builtin_mma_xvf32gerpp(&acc4, rowA3, rowB5); + __builtin_mma_xvf32gerpp(&acc5, rowA3, rowB6); + __builtin_mma_xvf32gerpp(&acc6, rowA4, rowB7); + __builtin_mma_xvf32gerpp(&acc7, rowA4, rowB8); + } + for (l = (temp & (~3)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); + } + SAVE_ACC_COMPLEX_14 + CO += 2; + AO += temp << 1; + BO += temp << 3; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (1, 4) + REFRESH_AFTER_SAVE (1, 4) #endif - } + } #if defined(TRMMKERNEL) && !defined(LEFT) - off += 4; // number of values in A + off += 4; // number of values in A #endif B += k << 3; - } + } - if (n & 2) - { + if (n & 2) { #if defined(TRMMKERNEL) && defined(LEFT) off = offset; #endif @@ -732,212 +720,199 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * CO = C; C += ldc << 2; - for (i = 0; i < (m >> 3); i++) - { + for (i = 0; i < (m >> 3); i++) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (8, 2) + REFRESH_POINTERS (8, 2) #else - BO = B; - temp = k; + BO = B; + temp = k; #endif - SET_ACC_ZERO() - for (l = 0; l < (temp & (~1)); l+=2) - { - vec_t rowA1 = *(vec_t *) & AO[l<<4]; - vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; - vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; - vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; - vec_t rowA5 = *(vec_t *) & AO[(l<<4)+16]; - vec_t rowA6 = *(vec_t *) & AO[(l<<4)+20]; - vec_t rowA7 = *(vec_t *) & AO[(l<<4)+24]; - vec_t rowA8 = *(vec_t *) & AO[(l<<4)+28]; - vec_t rowB1 = *(vec_t *) & BO[l<<2]; - vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); - __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); - __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); - __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB2); - __builtin_mma_xvf32gerpp(&acc1, rowA6, rowB2); - __builtin_mma_xvf32gerpp(&acc2, rowA7, rowB2); - __builtin_mma_xvf32gerpp(&acc3, rowA8, rowB2); - } - for (l = (temp & (~1)); l < temp; ++l) - { - vec_t rowA1 = *(vec_t *) & AO[l<<4]; - vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; - vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; - vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; - vec_t rowB1 = *(vec_t *) & BO[l<<2]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); - __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); - __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); - } - SAVE_ACC_COMPLEX_22_4 - AO += temp << 4; - BO += temp << 2; - CO += 16; + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<4)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<4)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<4)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<4)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB2); + __builtin_mma_xvf32gerpp(&acc1, rowA6, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA7, rowB2); + __builtin_mma_xvf32gerpp(&acc3, rowA8, rowB2); + } + for (l = (temp & (~1)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + } + SAVE_ACC_COMPLEX_22_4 + AO += temp << 4; + BO += temp << 2; + CO += 16; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (8, 2) + REFRESH_AFTER_SAVE (8, 2) #endif - } - if (m & 4) - { + } + if (m & 4) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (4, 2) + REFRESH_POINTERS (4, 2) #else - BO = B; - temp = k; + BO = B; + temp = k; #endif - SET_ACC_ZERO() - for (l = 0; l < (temp & (~3)); l+=4) - { - vec_t rowA1 = *(vec_t *) & AO[l<<3]; - vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; - vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8]; - vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12]; - vec_t rowA5 = *(vec_t *) & AO[(l<<3)+16]; - vec_t rowA6 = *(vec_t *) & AO[(l<<3)+20]; - vec_t rowA7 = *(vec_t *) & AO[(l<<3)+24]; - vec_t rowA8 = *(vec_t *) & AO[(l<<3)+28]; - vec_t rowB1 = *(vec_t *) & BO[l<<2]; - vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; - vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8]; - vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); - __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB2); - __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB2); - __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB3); - __builtin_mma_xvf32gerpp(&acc1, rowA6, rowB3); - __builtin_mma_xvf32gerpp(&acc0, rowA7, rowB4); - __builtin_mma_xvf32gerpp(&acc1, rowA8, rowB4); - } - for (l = (temp & (~3)); l < temp; ++l) - { - vec_t rowA1 = *(vec_t *) & AO[l<<3]; - vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; - vec_t rowB1 = *(vec_t *) & BO[l<<2]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); - } - SAVE_ACC_COMPLEX_22_2 - AO += temp << 3; - BO += temp << 2; - CO += 8; + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<3)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<3)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<3)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<3)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB2); + __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB2); + __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB3); + __builtin_mma_xvf32gerpp(&acc1, rowA6, rowB3); + __builtin_mma_xvf32gerpp(&acc0, rowA7, rowB4); + __builtin_mma_xvf32gerpp(&acc1, rowA8, rowB4); + } + for (l = (temp & (~3)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + } + SAVE_ACC_COMPLEX_22_2 + AO += temp << 3; + BO += temp << 2; + CO += 8; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (4, 2) + REFRESH_AFTER_SAVE (4, 2) #endif - } if (m & 2) - { + } + if (m & 2) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (2, 2) + REFRESH_POINTERS (2, 2) #else - BO = B; - temp = k; + BO = B; + temp = k; #endif - SET_ACC_ZERO() - for (l = 0; l < (temp & (~7)); l+=8) - { - vec_t rowA1 = *(vec_t *) & AO[l<<2]; - vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4]; - vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8]; - vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12]; - vec_t rowA5 = *(vec_t *) & AO[(l<<2)+16]; - vec_t rowA6 = *(vec_t *) & AO[(l<<2)+20]; - vec_t rowA7 = *(vec_t *) & AO[(l<<2)+24]; - vec_t rowA8 = *(vec_t *) & AO[(l<<2)+28]; - vec_t rowB1 = *(vec_t *) & BO[l<<2]; - vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; - vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8]; - vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12]; - vec_t rowB5 = *(vec_t *) & BO[(l<<2)+16]; - vec_t rowB6 = *(vec_t *) & BO[(l<<2)+20]; - vec_t rowB7 = *(vec_t *) & BO[(l<<2)+24]; - vec_t rowB8 = *(vec_t *) & BO[(l<<2)+28]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc0, rowA2, rowB2); - __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB3); - __builtin_mma_xvf32gerpp(&acc0, rowA4, rowB4); - __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB5); - __builtin_mma_xvf32gerpp(&acc0, rowA6, rowB6); - __builtin_mma_xvf32gerpp(&acc0, rowA7, rowB7); - __builtin_mma_xvf32gerpp(&acc0, rowA8, rowB8); - } - for (l = (temp & (~7)); l < temp; ++l) - { - vec_t rowA1 = *(vec_t *) & AO[l<<2]; - vec_t rowB1 = *(vec_t *) & BO[l<<2]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - } - SAVE_ACC_COMPLEX_22_1 - AO += temp << 2; - BO += temp << 2; - CO += 4; + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<2)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<2)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<2)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<2)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12]; + vec_t rowB5 = *(vec_t *) & BO[(l<<2)+16]; + vec_t rowB6 = *(vec_t *) & BO[(l<<2)+20]; + vec_t rowB7 = *(vec_t *) & BO[(l<<2)+24]; + vec_t rowB8 = *(vec_t *) & BO[(l<<2)+28]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc0, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc0, rowA4, rowB4); + __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB5); + __builtin_mma_xvf32gerpp(&acc0, rowA6, rowB6); + __builtin_mma_xvf32gerpp(&acc0, rowA7, rowB7); + __builtin_mma_xvf32gerpp(&acc0, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_22_1 + AO += temp << 2; + BO += temp << 2; + CO += 4; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (2, 2) + REFRESH_AFTER_SAVE (2, 2) #endif - } - if (m & 1) - { + } + if (m & 1) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (1, 2) + REFRESH_POINTERS (1, 2) #else - BO = B; - temp = k; + BO = B; + temp = k; #endif - // RIP OUT MMA STUFF! - SET_ACC_ZERO() - for (l = 0; l < (temp & (~7)); l+=8) - { - vec_t rowA1 = *(vec_t *) & AO[l<<1]; - vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2]; - vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4]; - vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6]; - vec_t rowA5 = *(vec_t *) & AO[(l<<1)+8]; - vec_t rowA6 = *(vec_t *) & AO[(l<<1)+10]; - vec_t rowA7 = *(vec_t *) & AO[(l<<1)+12]; - vec_t rowA8 = *(vec_t *) & AO[(l<<1)+14]; - vec_t rowB1 = *(vec_t *) & BO[l<<2]; - vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; - vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8]; - vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12]; - vec_t rowB5 = *(vec_t *) & BO[(l<<2)+16]; - vec_t rowB6 = *(vec_t *) & BO[(l<<2)+20]; - vec_t rowB7 = *(vec_t *) & BO[(l<<2)+24]; - vec_t rowB8 = *(vec_t *) & BO[(l<<2)+28]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2); - __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3); - __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); - __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5); - __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6); - __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7); - __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8); - } - for (l = (temp & (~7)); l < temp; ++l) - { - vec_t rowA1 = *(vec_t *) & AO[l<<1]; - vec_t rowB1 = *(vec_t *) & BO[l<<2]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - } - SAVE_ACC_COMPLEX_12 - AO += temp<<1; - BO += temp<<2; - CO += 2; + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2]; + vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4]; + vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6]; + vec_t rowA5 = *(vec_t *) & AO[(l<<1)+8]; + vec_t rowA6 = *(vec_t *) & AO[(l<<1)+10]; + vec_t rowA7 = *(vec_t *) & AO[(l<<1)+12]; + vec_t rowA8 = *(vec_t *) & AO[(l<<1)+14]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12]; + vec_t rowB5 = *(vec_t *) & BO[(l<<2)+16]; + vec_t rowB6 = *(vec_t *) & BO[(l<<2)+20]; + vec_t rowB7 = *(vec_t *) & BO[(l<<2)+24]; + vec_t rowB8 = *(vec_t *) & BO[(l<<2)+28]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_12 + AO += temp<<1; + BO += temp<<2; + CO += 2; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (1, 2) + REFRESH_AFTER_SAVE (1, 2) #endif - } + } #if defined(TRMMKERNEL) && !defined(LEFT) - off += 2; // number of values in A + off += 2; // number of values in A #endif B += k << 2; - } + } - if (n & 1) - { + if (n & 1) { #if defined(TRMMKERNEL) && defined(LEFT) off = offset; #endif @@ -945,210 +920,196 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * CO = C; C += ldc << 1; - for (i = 0; i < (m >> 3); i++) - { + for (i = 0; i < (m >> 3); i++) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (8, 1) + REFRESH_POINTERS (8, 1) #else - BO = B; - temp = k; + BO = B; + temp = k; #endif - SET_ACC_ZERO() - for (l = 0; l < (temp & (~1)); l+=2) - { - vec_t rowA1 = *(vec_t *) & AO[l<<4]; - vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; - vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; - vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; - vec_t rowA5 = *(vec_t *) & AO[(l<<4)+16]; - vec_t rowA6 = *(vec_t *) & AO[(l<<4)+20]; - vec_t rowA7 = *(vec_t *) & AO[(l<<4)+24]; - vec_t rowA8 = *(vec_t *) & AO[(l<<4)+28]; - vec_t rowB1 = *(vec_t *) & BO[l<<1]; - vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); - __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); - __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); - __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB2); - __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB2); - __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB2); - __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB2); - } - for (l = (temp & (~1)); l < temp; ++l) - { - vec_t rowA1 = *(vec_t *) & AO[l<<4]; - vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; - vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; - vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; - vec_t rowB1 = *(vec_t *) & BO[l<<1]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); - __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); - __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); - } - SAVE_ACC_COMPLEX_21_4 - AO += temp << 4; - BO += temp << 1; - CO += 16; + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<4)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<4)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<4)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<4)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB2); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB2); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB2); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB2); + } + for (l = (temp & (~1)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + } + SAVE_ACC_COMPLEX_21_4 + AO += temp << 4; + BO += temp << 1; + CO += 16; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (8, 1) + REFRESH_AFTER_SAVE (8, 1) #endif - } - if (m & 4) - { + } + if (m & 4) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (4, 1) + REFRESH_POINTERS (4, 1) #else - BO = B; - temp = k; + BO = B; + temp = k; #endif - SET_ACC_ZERO() - for (l = 0; l < (temp & (~3)); l+=4) - { - vec_t rowA1 = *(vec_t *) & AO[l<<3]; - vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; - vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8]; - vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12]; - vec_t rowA5 = *(vec_t *) & AO[(l<<3)+16]; - vec_t rowA6 = *(vec_t *) & AO[(l<<3)+20]; - vec_t rowA7 = *(vec_t *) & AO[(l<<3)+24]; - vec_t rowA8 = *(vec_t *) & AO[(l<<3)+28]; - vec_t rowB1 = *(vec_t *) & BO[l<<1]; - vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; - vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; - vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); - __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB2); - __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB2); - __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB3); - __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB3); - __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB4); - __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB4); - } - for (l = (temp & (~3)); l < temp; ++l) - { - vec_t rowA1 = *(vec_t *) & AO[l<<3]; - vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; - vec_t rowB1 = *(vec_t *) & BO[l<<1]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); - } - SAVE_ACC_COMPLEX_21_2 - AO += temp << 3; - BO += temp << 1; - CO += 8; + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<3)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<3)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<3)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<3)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB2); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB2); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB3); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB3); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB4); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB4); + } + for (l = (temp & (~3)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + } + SAVE_ACC_COMPLEX_21_2 + AO += temp << 3; + BO += temp << 1; + CO += 8; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (4, 1) + REFRESH_AFTER_SAVE (4, 1) #endif - } - if (m & 2) - { + } + if (m & 2) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (2, 1) + REFRESH_POINTERS (2, 1) #else - BO = B; - temp = k; + BO = B; + temp = k; #endif - // RIP OUT MMA STUFF! - SET_ACC_ZERO() - for (l = 0; l < (temp & (~7)); l+=8) - { - vec_t rowA1 = *(vec_t *) & AO[l<<2]; - vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4]; - vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8]; - vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12]; - vec_t rowA5 = *(vec_t *) & AO[(l<<2)+16]; - vec_t rowA6 = *(vec_t *) & AO[(l<<2)+20]; - vec_t rowA7 = *(vec_t *) & AO[(l<<2)+24]; - vec_t rowA8 = *(vec_t *) & AO[(l<<2)+28]; - vec_t rowB1 = *(vec_t *) & BO[l<<1]; - vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; - vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; - vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; - vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; - vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; - vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; - vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2); - __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3); - __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); - __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5); - __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6); - __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7); - __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8); - } - for (l = (temp & (~7)); l < temp; ++l) - { - vec_t rowA1 = *(vec_t *) & AO[l<<2]; - vec_t rowB1 = *(vec_t *) & BO[l<<1]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - } - SAVE_ACC_COMPLEX_21_1 - AO += temp << 2; - BO += temp << 1; - CO += 4; + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<2)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<2)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<2)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<2)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_21_1 + AO += temp << 2; + BO += temp << 1; + CO += 4; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (2, 1) + REFRESH_AFTER_SAVE (2, 1) #endif - } - if (m & 1) - { + } + if (m & 1) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (1, 1) + REFRESH_POINTERS (1, 1) #else - BO = B; - temp = k; + BO = B; + temp = k; #endif - // RIP OUT MMA STUFF! - SET_ACC_ZERO() - for (l = 0; l < (temp & (~7)); l+=8) - { - vec_t rowA1 = *(vec_t *) & AO[l<<1]; - vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2]; - vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4]; - vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6]; - vec_t rowA5 = *(vec_t *) & AO[(l<<1)+8]; - vec_t rowA6 = *(vec_t *) & AO[(l<<1)+10]; - vec_t rowA7 = *(vec_t *) & AO[(l<<1)+12]; - vec_t rowA8 = *(vec_t *) & AO[(l<<1)+14]; - vec_t rowB1 = *(vec_t *) & BO[l<<1]; - vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; - vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; - vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; - vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; - vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; - vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; - vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2); - __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3); - __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); - __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5); - __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6); - __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7); - __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8); - } - for (l = (temp & (~7)); l < temp; ++l) - { - vec_t rowA1 = *(vec_t *) & AO[l<<1]; - vec_t rowB1 = *(vec_t *) & BO[l<<1]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - } - SAVE_ACC_COMPLEX_11 - AO += temp<<1; - BO += temp<<1; - CO += 2; + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2]; + vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4]; + vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6]; + vec_t rowA5 = *(vec_t *) & AO[(l<<1)+8]; + vec_t rowA6 = *(vec_t *) & AO[(l<<1)+10]; + vec_t rowA7 = *(vec_t *) & AO[(l<<1)+12]; + vec_t rowA8 = *(vec_t *) & AO[(l<<1)+14]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_11 + AO += temp<<1; + BO += temp<<1; + CO += 2; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (1, 1) + REFRESH_AFTER_SAVE (1, 1) #endif - } + } #if defined(TRMMKERNEL) && !defined(LEFT) - off += 1; // number of values in A + off += 1; // number of values in A #endif B += k << 1; - } - return 0; + } + return 0; } diff --git a/kernel/power/zgemm_kernel_power10.c b/kernel/power/zgemm_kernel_power10.c index e4e609067..370d12af3 100644 --- a/kernel/power/zgemm_kernel_power10.c +++ b/kernel/power/zgemm_kernel_power10.c @@ -30,15 +30,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. typedef __vector unsigned char vec_t; typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); -#define SET_ACC_ZERO() \ - __builtin_mma_xxsetaccz (&acc0); \ - __builtin_mma_xxsetaccz (&acc1); \ - __builtin_mma_xxsetaccz (&acc2); \ - __builtin_mma_xxsetaccz (&acc3); \ - __builtin_mma_xxsetaccz (&acc4); \ - __builtin_mma_xxsetaccz (&acc5); \ - __builtin_mma_xxsetaccz (&acc6); \ - __builtin_mma_xxsetaccz (&acc7); +#define SET_ACC_ZERO() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); \ + __builtin_mma_xxsetaccz (&acc4); \ + __builtin_mma_xxsetaccz (&acc5); \ + __builtin_mma_xxsetaccz (&acc6); \ + __builtin_mma_xxsetaccz (&acc7); #if (defined(NN) || defined(NT) || defined(TN) || defined(TT)) #define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = _arbi + _aibr; } @@ -66,696 +66,671 @@ typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); #define A_OP += #endif -#define BUILTIN_MMA_DISASSEMBLE_ACC_8 \ - __builtin_mma_disassemble_acc ((void *)result, &acc0); \ - __builtin_mma_disassemble_acc ((void *)&result[4], &acc1); \ - __builtin_mma_disassemble_acc ((void *)&result[8], &acc2); \ - __builtin_mma_disassemble_acc ((void *)&result[12], &acc3); \ - __builtin_mma_disassemble_acc ((void *)&result[16], &acc4); \ - __builtin_mma_disassemble_acc ((void *)&result[20], &acc5); \ - __builtin_mma_disassemble_acc ((void *)&result[24], &acc6); \ - __builtin_mma_disassemble_acc ((void *)&result[28], &acc7); +#define BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + __builtin_mma_disassemble_acc ((void *)result, &acc0); \ + __builtin_mma_disassemble_acc ((void *)&result[4], &acc1); \ + __builtin_mma_disassemble_acc ((void *)&result[8], &acc2); \ + __builtin_mma_disassemble_acc ((void *)&result[12], &acc3); \ + __builtin_mma_disassemble_acc ((void *)&result[16], &acc4); \ + __builtin_mma_disassemble_acc ((void *)&result[20], &acc5); \ + __builtin_mma_disassemble_acc ((void *)&result[24], &acc6); \ + __builtin_mma_disassemble_acc ((void *)&result[28], &acc7); -#define SAVE_ACC_COMPLEX_11 \ - BUILTIN_MMA_DISASSEMBLE_ACC_8 \ - COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ - COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10]) \ - COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ - COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26]) \ - COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ - COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42]) \ - COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ - COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58]) \ - CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; +#define SAVE_ACC_COMPLEX_11 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10]) \ + COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ + COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42]) \ + COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ + COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; -#define SAVE_ACC_COMPLEX_12 \ - BUILTIN_MMA_DISASSEMBLE_ACC_8 \ - COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ - COMP_MUL(tr[1], res[ 8], res[11], ti[1], res[ 9], res[10]) \ - COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ - COMP_MAC(tr[1], res[24], res[27], ti[1], res[25], res[26]) \ - COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ - COMP_MAC(tr[1], res[40], res[43], ti[1], res[41], res[42]) \ - COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ - COMP_MAC(tr[1], res[56], res[59], ti[1], res[57], res[58]) \ - CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i; +#define SAVE_ACC_COMPLEX_12 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MUL(tr[1], res[ 8], res[11], ti[1], res[ 9], res[10]) \ + COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ + COMP_MAC(tr[1], res[24], res[27], ti[1], res[25], res[26]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[1], res[40], res[43], ti[1], res[41], res[42]) \ + COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ + COMP_MAC(tr[1], res[56], res[59], ti[1], res[57], res[58]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i; -#define SAVE_ACC_COMPLEX_21_1 \ - BUILTIN_MMA_DISASSEMBLE_ACC_8 \ - COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ - COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ - COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10]) \ - COMP_MAC(tr[1], res[12], res[15], ti[1], res[13], res[14]) \ - COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ - COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22]) \ - COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26]) \ - COMP_MAC(tr[1], res[28], res[31], ti[1], res[29], res[30]) \ - COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ - COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ - COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42]) \ - COMP_MAC(tr[1], res[44], res[47], ti[1], res[45], res[46]) \ - COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ - COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54]) \ - COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58]) \ - COMP_MAC(tr[1], res[60], res[63], ti[1], res[61], res[62]) \ - CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; +#define SAVE_ACC_COMPLEX_21_1 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ + COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10]) \ + COMP_MAC(tr[1], res[12], res[15], ti[1], res[13], res[14]) \ + COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ + COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22]) \ + COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26]) \ + COMP_MAC(tr[1], res[28], res[31], ti[1], res[29], res[30]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ + COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42]) \ + COMP_MAC(tr[1], res[44], res[47], ti[1], res[45], res[46]) \ + COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ + COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54]) \ + COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58]) \ + COMP_MAC(tr[1], res[60], res[63], ti[1], res[61], res[62]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; -#define SAVE_ACC_COMPLEX_21_2 \ - BUILTIN_MMA_DISASSEMBLE_ACC_8 \ - COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ - COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ - COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10]) \ - COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ - COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ - COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22]) \ - COMP_MAC(tr[2], res[24], res[27], ti[2], res[25], res[26]) \ - COMP_MAC(tr[3], res[28], res[31], ti[3], res[29], res[30]) \ - COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ - COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ - COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42]) \ - COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46]) \ - COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ - COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54]) \ - COMP_MAC(tr[2], res[56], res[59], ti[2], res[57], res[58]) \ - COMP_MAC(tr[3], res[60], res[63], ti[3], res[61], res[62]) \ - CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ - CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ - CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ - CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ - CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; +#define SAVE_ACC_COMPLEX_21_2 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ + COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10]) \ + COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ + COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ + COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22]) \ + COMP_MAC(tr[2], res[24], res[27], ti[2], res[25], res[26]) \ + COMP_MAC(tr[3], res[28], res[31], ti[3], res[29], res[30]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ + COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42]) \ + COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46]) \ + COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ + COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54]) \ + COMP_MAC(tr[2], res[56], res[59], ti[2], res[57], res[58]) \ + COMP_MAC(tr[3], res[60], res[63], ti[3], res[61], res[62]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; -#define SAVE_ACC_COMPLEX_21_4 \ - BUILTIN_MMA_DISASSEMBLE_ACC_8 \ - COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ - COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ - COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10]) \ - COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ - COMP_MUL(tr[4], res[16], res[19], ti[4], res[17], res[18]) \ - COMP_MUL(tr[5], res[20], res[23], ti[5], res[21], res[22]) \ - COMP_MUL(tr[6], res[24], res[27], ti[6], res[25], res[26]) \ - COMP_MUL(tr[7], res[28], res[31], ti[7], res[29], res[30]) \ - COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ - COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ - COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42]) \ - COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46]) \ - COMP_MAC(tr[4], res[48], res[51], ti[4], res[49], res[50]) \ - COMP_MAC(tr[5], res[52], res[55], ti[5], res[53], res[54]) \ - COMP_MAC(tr[6], res[56], res[59], ti[6], res[57], res[58]) \ - COMP_MAC(tr[7], res[60], res[63], ti[7], res[61], res[62]) \ - CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ - CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ - CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ - CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ - CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ - CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i; \ - CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i; \ - CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i; \ - CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i; \ - CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i; \ - CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i; \ - CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i; \ - CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i; +#define SAVE_ACC_COMPLEX_21_4 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ + COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10]) \ + COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ + COMP_MUL(tr[4], res[16], res[19], ti[4], res[17], res[18]) \ + COMP_MUL(tr[5], res[20], res[23], ti[5], res[21], res[22]) \ + COMP_MUL(tr[6], res[24], res[27], ti[6], res[25], res[26]) \ + COMP_MUL(tr[7], res[28], res[31], ti[7], res[29], res[30]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ + COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42]) \ + COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46]) \ + COMP_MAC(tr[4], res[48], res[51], ti[4], res[49], res[50]) \ + COMP_MAC(tr[5], res[52], res[55], ti[5], res[53], res[54]) \ + COMP_MAC(tr[6], res[56], res[59], ti[6], res[57], res[58]) \ + COMP_MAC(tr[7], res[60], res[63], ti[7], res[61], res[62]) \ + CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i; \ + CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i; \ + CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i; \ + CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i; \ + CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i; \ + CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i; \ + CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i; \ + CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i; -#define SAVE_ACC_COMPLEX_22_1 \ - __builtin_mma_disassemble_acc ((void *)result, &acc0); \ - __builtin_mma_disassemble_acc ((void *)(&result[4]), &acc1); \ - COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2]) \ - COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6]) \ - COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10]) \ - COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14] ) \ - CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ - CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ - CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ - CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ - CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; +#define SAVE_ACC_COMPLEX_22_1 \ + __builtin_mma_disassemble_acc ((void *)result, &acc0); \ + __builtin_mma_disassemble_acc ((void *)(&result[4]), &acc1); \ + COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2]) \ + COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6]) \ + COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10]) \ + COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14] ) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; -#define SAVE_ACC_COMPLEX_22_2(ACC1, ACC2, CI) \ - __builtin_mma_disassemble_acc ((void *)result, ACC1); \ - __builtin_mma_disassemble_acc ((void *)(&result[4]), ACC2); \ - COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2]) \ - COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6]) \ - COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10]) \ - COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ - CO[CI+0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[CI+1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[CI+2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[CI+3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ - CO[2*ldc+CI+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ - CO[2*ldc+CI+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ - CO[2*ldc+CI+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ - CO[2*ldc+CI+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; +#define SAVE_ACC_COMPLEX_22_2(ACC1, ACC2, CI) \ + __builtin_mma_disassemble_acc ((void *)result, ACC1); \ + __builtin_mma_disassemble_acc ((void *)(&result[4]), ACC2); \ + COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2]) \ + COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6]) \ + COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10]) \ + COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ + CO[CI+0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[CI+1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[CI+2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[CI+3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+CI+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+CI+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+CI+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+CI+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; #define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) #define REFRESH_TEMP_BK(x, y) \ - temp = k - off; + temp = k - off; #elif defined(LEFT) #define REFRESH_TEMP_BK(x, y) \ - temp = off + x; + temp = off + x; #else #define REFRESH_TEMP_BK(x, y) \ - temp = off + y; + temp = off + y; #endif #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #define REFRESH_POINTERS(x, y) \ - BO = B; \ - REFRESH_TEMP_BK(x, y) + BO = B; \ + REFRESH_TEMP_BK(x, y) #else #define REFRESH_POINTERS(x, y) \ - AO += off * (2*x); \ - BO = B + off * (2*y); \ - REFRESH_TEMP_BK(x, y) + AO += off * (2*x); \ + BO = B + off * (2*y); \ + REFRESH_TEMP_BK(x, y) #endif #ifdef LEFT #define REFRESH_OFF(x) \ - off += x; + off += x; #else #define REFRESH_OFF(x) #endif #ifdef LEFT #define UPDATE_TEMP(x, y) \ - temp -= x; + temp -= x; #else #define UPDATE_TEMP(x, y) \ - temp -= y; + temp -= y; #endif #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #define REFRESH_TMP_AFTER_SAVE(x, y) \ - temp = k - off; \ - UPDATE_TEMP(x, y) \ - AO += temp * (2*x); \ - BO += temp * (2*y); + temp = k - off; \ + UPDATE_TEMP(x, y) \ + AO += temp * (2*x); \ + BO += temp * (2*y); #else #define REFRESH_TMP_AFTER_SAVE(x, y) #endif -#define REFRESH_AFTER_SAVE(x,y) \ - REFRESH_TMP_AFTER_SAVE(x, y) \ - REFRESH_OFF(x) +#define REFRESH_AFTER_SAVE(x,y) \ + REFRESH_TMP_AFTER_SAVE(x, y) \ + REFRESH_OFF(x) /************************************************************************************* * GEMM Kernel *************************************************************************************/ int -CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * A, FLOAT * B, - FLOAT * C, BLASLONG ldc #ifdef TRMMKERNEL - , BLASLONG offset +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, + FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc, BLASLONG offset) +#else +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, + FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc) #endif - ) { - BLASLONG i1, i, l, temp; - FLOAT *AO, *BO, *CO; + BLASLONG i1, i, l, temp; + FLOAT *AO, *BO, *CO; #if defined(TRMMKERNEL) - BLASLONG off; + BLASLONG off; #endif #if defined(TRMMKERNEL) && !defined(LEFT) - off = -offset; + off = -offset; #endif - __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; - v4sf_t result[32]; - FLOAT *res, tr[16], ti[16]; - res = (FLOAT *) result; + v4sf_t result[32]; + FLOAT *res, tr[16], ti[16]; + res = (FLOAT *) result; - for (i1 = 0; i1 < (n >> 1); i1++) - { + for (i1 = 0; i1 < (n >> 1); i1++) { #if defined(TRMMKERNEL) && defined(LEFT) off = offset; #endif AO = A; CO = C; C += ldc<<2; - for (i = 0; i < (m >> 3); i++) - { + for (i = 0; i < (m >> 3); i++) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (8, 2) + REFRESH_POINTERS (8, 2) #else - BO = B; - temp = k; + BO = B; + temp = k; #endif - SET_ACC_ZERO() - for (l = 0; l < temp; ++l) - { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); - __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); - __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); - __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); - vec_t rowB1 = *(vec_t *) & BO[l<<2]; - vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; - __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); - __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1); - __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1); - __builtin_mma_xvf64gerpp(&acc4, rowA1, rowB2); - __builtin_mma_xvf64gerpp(&acc5, rowA2, rowB2); - __builtin_mma_xvf64gerpp(&acc6, rowA3, rowB2); - __builtin_mma_xvf64gerpp(&acc7, rowA4, rowB2); - } - __builtin_mma_disassemble_acc ((void *)result, &acc0); - __builtin_mma_disassemble_acc ((void *)(&result[ 4]), &acc1); - __builtin_mma_disassemble_acc ((void *)(&result[ 8]), &acc2); - __builtin_mma_disassemble_acc ((void *)(&result[12]), &acc3); - __builtin_mma_disassemble_acc ((void *)(&result[16]), &acc4); - __builtin_mma_disassemble_acc ((void *)(&result[20]), &acc5); - __builtin_mma_disassemble_acc ((void *)(&result[24]), &acc6); - __builtin_mma_disassemble_acc ((void *)(&result[28]), &acc7); - COMP_MUL(tr[ 0], res[ 0], res[ 3], ti[ 0], res[ 1], res[ 2]) - COMP_MUL(tr[ 1], res[ 4], res[ 7], ti[ 1], res[ 5], res[ 6]) - COMP_MUL(tr[ 2], res[ 8], res[11], ti[ 2], res[ 9], res[10]) - COMP_MUL(tr[ 3], res[12], res[15], ti[ 3], res[13], res[14]) - COMP_MUL(tr[ 4], res[16], res[19], ti[ 4], res[17], res[18]) - COMP_MUL(tr[ 5], res[20], res[23], ti[ 5], res[21], res[22]) - COMP_MUL(tr[ 6], res[24], res[27], ti[ 6], res[25], res[26]) - COMP_MUL(tr[ 7], res[28], res[31], ti[ 7], res[29], res[30]) - COMP_MUL(tr[ 8], res[32], res[35], ti[ 8], res[33], res[34]) - COMP_MUL(tr[ 9], res[36], res[39], ti[ 9], res[37], res[38]) - COMP_MUL(tr[10], res[40], res[43], ti[10], res[41], res[42]) - COMP_MUL(tr[11], res[44], res[47], ti[11], res[45], res[46]) - COMP_MUL(tr[12], res[48], res[51], ti[12], res[49], res[50]) - COMP_MUL(tr[13], res[52], res[55], ti[13], res[53], res[54]) - COMP_MUL(tr[14], res[56], res[59], ti[14], res[57], res[58]) - COMP_MUL(tr[15], res[60], res[63], ti[15], res[61], res[62]) - CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; - CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; - CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; - CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; - CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; - CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; - CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; - CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; - CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i; - CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i; - CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i; - CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i; - CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i; - CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i; - CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i; - CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i; - CO[2*ldc+ 0] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i; - CO[2*ldc+ 1] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i; - CO[2*ldc+ 2] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i; - CO[2*ldc+ 3] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i; - CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i; - CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i; - CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i; - CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i; - CO[2*ldc+ 8] A_OP tr[12] * alpha_r - ti[12] * alpha_i; - CO[2*ldc+ 9] A_OP ti[12] * alpha_r + tr[12] * alpha_i; - CO[2*ldc+10] A_OP tr[13] * alpha_r - ti[13] * alpha_i; - CO[2*ldc+11] A_OP ti[13] * alpha_r + tr[13] * alpha_i; - CO[2*ldc+12] A_OP tr[14] * alpha_r - ti[14] * alpha_i; - CO[2*ldc+13] A_OP ti[14] * alpha_r + tr[14] * alpha_i; - CO[2*ldc+14] A_OP tr[15] * alpha_r - ti[15] * alpha_i; - CO[2*ldc+15] A_OP ti[15] * alpha_r + tr[15] * alpha_i; + SET_ACC_ZERO() + for (l = 0; l < temp; ++l) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf64gerpp(&acc4, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc5, rowA2, rowB2); + __builtin_mma_xvf64gerpp(&acc6, rowA3, rowB2); + __builtin_mma_xvf64gerpp(&acc7, rowA4, rowB2); + } + __builtin_mma_disassemble_acc ((void *)result, &acc0); + __builtin_mma_disassemble_acc ((void *)(&result[ 4]), &acc1); + __builtin_mma_disassemble_acc ((void *)(&result[ 8]), &acc2); + __builtin_mma_disassemble_acc ((void *)(&result[12]), &acc3); + __builtin_mma_disassemble_acc ((void *)(&result[16]), &acc4); + __builtin_mma_disassemble_acc ((void *)(&result[20]), &acc5); + __builtin_mma_disassemble_acc ((void *)(&result[24]), &acc6); + __builtin_mma_disassemble_acc ((void *)(&result[28]), &acc7); + COMP_MUL(tr[ 0], res[ 0], res[ 3], ti[ 0], res[ 1], res[ 2]) + COMP_MUL(tr[ 1], res[ 4], res[ 7], ti[ 1], res[ 5], res[ 6]) + COMP_MUL(tr[ 2], res[ 8], res[11], ti[ 2], res[ 9], res[10]) + COMP_MUL(tr[ 3], res[12], res[15], ti[ 3], res[13], res[14]) + COMP_MUL(tr[ 4], res[16], res[19], ti[ 4], res[17], res[18]) + COMP_MUL(tr[ 5], res[20], res[23], ti[ 5], res[21], res[22]) + COMP_MUL(tr[ 6], res[24], res[27], ti[ 6], res[25], res[26]) + COMP_MUL(tr[ 7], res[28], res[31], ti[ 7], res[29], res[30]) + COMP_MUL(tr[ 8], res[32], res[35], ti[ 8], res[33], res[34]) + COMP_MUL(tr[ 9], res[36], res[39], ti[ 9], res[37], res[38]) + COMP_MUL(tr[10], res[40], res[43], ti[10], res[41], res[42]) + COMP_MUL(tr[11], res[44], res[47], ti[11], res[45], res[46]) + COMP_MUL(tr[12], res[48], res[51], ti[12], res[49], res[50]) + COMP_MUL(tr[13], res[52], res[55], ti[13], res[53], res[54]) + COMP_MUL(tr[14], res[56], res[59], ti[14], res[57], res[58]) + COMP_MUL(tr[15], res[60], res[63], ti[15], res[61], res[62]) + CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; + CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; + CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; + CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; + CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; + CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; + CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; + CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i; + CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i; + CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i; + CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i; + CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i; + CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i; + CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i; + CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i; + CO[2*ldc+ 0] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i; + CO[2*ldc+ 1] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i; + CO[2*ldc+ 2] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i; + CO[2*ldc+ 3] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i; + CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i; + CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i; + CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i; + CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i; + CO[2*ldc+ 8] A_OP tr[12] * alpha_r - ti[12] * alpha_i; + CO[2*ldc+ 9] A_OP ti[12] * alpha_r + tr[12] * alpha_i; + CO[2*ldc+10] A_OP tr[13] * alpha_r - ti[13] * alpha_i; + CO[2*ldc+11] A_OP ti[13] * alpha_r + tr[13] * alpha_i; + CO[2*ldc+12] A_OP tr[14] * alpha_r - ti[14] * alpha_i; + CO[2*ldc+13] A_OP ti[14] * alpha_r + tr[14] * alpha_i; + CO[2*ldc+14] A_OP tr[15] * alpha_r - ti[15] * alpha_i; + CO[2*ldc+15] A_OP ti[15] * alpha_r + tr[15] * alpha_i; - AO += temp << 4; - BO += temp << 2; - CO += 16; + AO += temp << 4; + BO += temp << 2; + CO += 16; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (8, 2) + REFRESH_AFTER_SAVE (8, 2) #endif - } - if (m & 4) - { + } + if (m & 4) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (4, 2) + REFRESH_POINTERS (4, 2) #else - BO = B; - temp = k; + BO = B; + temp = k; #endif - SET_ACC_ZERO() - for (l = 0; l < (temp & (~1)); l+=2) - { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); - __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); - __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8])); - __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12])); - vec_t rowB1 = *(vec_t *) & BO[l<<2]; - vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; - vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; - vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6]; - __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); - __builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2); - __builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2); - __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB3); - __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB3); - __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB4); - __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4); - } - for (l = (temp & (~1)); l < temp; ++l) - { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); - __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); - vec_t rowB1 = *(vec_t *) & BO[l<<2]; - vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; - __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); - __builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2); - __builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2); - } - SAVE_ACC_COMPLEX_22_2(&acc0, &acc2, 0) - SAVE_ACC_COMPLEX_22_2(&acc1, &acc3, 4) - AO += temp << 3; - BO += temp << 2; - CO += 8; + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2); + __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB3); + __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB3); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB4); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4); + } + for (l = (temp & (~1)); l < temp; ++l) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2); + } + SAVE_ACC_COMPLEX_22_2(&acc0, &acc2, 0) + SAVE_ACC_COMPLEX_22_2(&acc1, &acc3, 4) + AO += temp << 3; + BO += temp << 2; + CO += 8; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (4, 2) + REFRESH_AFTER_SAVE (4, 2) #endif - } - if (m & 2) - { + } + if (m & 2) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (2, 2) + REFRESH_POINTERS (2, 2) #else - BO = B; - temp = k; + BO = B; + temp = k; #endif - SET_ACC_ZERO() - for (l = 0; l < (temp & (~3)); l+=4) - { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); - __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4])); - __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8])); - __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12])); - vec_t rowB1 = *(vec_t *) & BO[l<<2]; - vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; - vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; - vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6]; - vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8]; - vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10]; - vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12]; - vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14]; - __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); - __builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3); - __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4); - __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5); - __builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6); - __builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7); - __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8); - } - for (l = (temp & (~3)); l < temp; ++l) - { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); - vec_t rowB1 = *(vec_t *) & BO[l<<2]; - vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; - __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); - } - SAVE_ACC_COMPLEX_22_1 - AO += temp << 2; - BO += temp << 2; - CO += 4; + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4); + __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5); + __builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6); + __builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7); + __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8); + } + for (l = (temp & (~3)); l < temp; ++l) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); + } + SAVE_ACC_COMPLEX_22_1 + AO += temp << 2; + BO += temp << 2; + CO += 4; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (2, 2) + REFRESH_AFTER_SAVE (2, 2) #endif - } - if (m & 1) - { + } + if (m & 1) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (1, 2) + REFRESH_POINTERS (1, 2) #else - BO = B; - temp = k; + BO = B; + temp = k; #endif - // RIP OUT MMA STUFF! - SET_ACC_ZERO() - for (l = 0; l < (temp & (~3)); l+=4) - { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); - __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2])); - __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4])); - __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6])); - vec_t rowB1 = *(vec_t *) & BO[l<<2]; - vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; - vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; - vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6]; - vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8]; - vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10]; - vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12]; - vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14]; - __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); - __builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3); - __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4); - __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5); - __builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6); - __builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7); - __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8); - } - for (l = (temp & (~3)); l < temp; ++l) - { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); - vec_t rowB1 = *(vec_t *) & BO[l<<2]; - vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; - __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); - } - SAVE_ACC_COMPLEX_12 - AO += temp << 1; - BO += temp << 2; - CO += 2; + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4); + __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5); + __builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6); + __builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7); + __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8); + } + for (l = (temp & (~3)); l < temp; ++l) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); + } + SAVE_ACC_COMPLEX_12 + AO += temp << 1; + BO += temp << 2; + CO += 2; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (1, 2) + REFRESH_AFTER_SAVE (1, 2) #endif - } + } #if defined(TRMMKERNEL) && !defined(LEFT) off += 2; // number of values in A #endif B += k << 2; - } - if (n & 1) - { + } + if (n & 1) { #if defined(TRMMKERNEL) && defined(LEFT) off = offset; #endif AO = A; CO = C; C += ldc<<1; - for (i = 0; i < (m >> 3); i++) - { + for (i = 0; i < (m >> 3); i++) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (8, 1) + REFRESH_POINTERS (8, 1) #else - BO = B; - temp = k; + BO = B; + temp = k; #endif - SET_ACC_ZERO() - for (l = 0; l < (temp & (~1)); l+=2) - { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); - __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); - __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); - __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); - __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<4)+16])); - __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<4)+20])); - __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<4)+24])); - __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<4)+28])); - vec_t rowB1 = *(vec_t *) & BO[l<<1]; - vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; - __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); - __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1); - __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1); - __builtin_mma_xvf64gerpp(&acc0, rowA5, rowB2); - __builtin_mma_xvf64gerpp(&acc1, rowA6, rowB2); - __builtin_mma_xvf64gerpp(&acc2, rowA7, rowB2); - __builtin_mma_xvf64gerpp(&acc3, rowA8, rowB2); - } - for (l = (temp & (~1)); l < temp; ++l) - { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); - __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); - __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); - __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); - vec_t rowB1 = *(vec_t *) & BO[l<<1]; - __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); - __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1); - __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1); - } - SAVE_ACC_COMPLEX_21_4 + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); + __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<4)+16])); + __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<4)+20])); + __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<4)+24])); + __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<4)+28])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf64gerpp(&acc0, rowA5, rowB2); + __builtin_mma_xvf64gerpp(&acc1, rowA6, rowB2); + __builtin_mma_xvf64gerpp(&acc2, rowA7, rowB2); + __builtin_mma_xvf64gerpp(&acc3, rowA8, rowB2); + } + for (l = (temp & (~1)); l < temp; ++l) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1); + } + SAVE_ACC_COMPLEX_21_4 - AO += temp << 4; - BO += temp << 1; - CO += 16; + AO += temp << 4; + BO += temp << 1; + CO += 16; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (8, 1) + REFRESH_AFTER_SAVE (8, 1) #endif - } - if (m & 4) - { + } + if (m & 4) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (4, 1) + REFRESH_POINTERS (4, 1) #else - BO = B; - temp = k; + BO = B; + temp = k; #endif - SET_ACC_ZERO() - for (l = 0; l < (temp & (~3)); l+=4) - { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); - __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); - __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8])); - __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12])); - __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<3)+16])); - __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<3)+20])); - __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<3)+24])); - __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<3)+28])); - vec_t rowB1 = *(vec_t *) & BO[l<<1]; - vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; - vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; - vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; - __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); - __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB2); - __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB2); - __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB3); - __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB3); - __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB4); - __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB4); - } - for (l = (temp & (~3)); l < temp; ++l) - { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); - __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); - vec_t rowB1 = *(vec_t *) & BO[l<<1]; - __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); - } - SAVE_ACC_COMPLEX_21_2 - AO += temp << 3; - BO += temp << 1; - CO += 8; + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12])); + __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<3)+16])); + __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<3)+20])); + __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<3)+24])); + __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<3)+28])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB2); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB2); + __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB3); + __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB3); + __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB4); + __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB4); + } + for (l = (temp & (~3)); l < temp; ++l) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + } + SAVE_ACC_COMPLEX_21_2 + AO += temp << 3; + BO += temp << 1; + CO += 8; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (4, 1) + REFRESH_AFTER_SAVE (4, 1) #endif - } if (m & 2) - { + } + if (m & 2) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (2, 1) + REFRESH_POINTERS (2, 1) #else - BO = B; - temp = k; + BO = B; + temp = k; #endif - SET_ACC_ZERO() - for (l = 0; l < (temp & (~7)); l+=8) - { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); - __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4])); - __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8])); - __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12])); - __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<2)+16])); - __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<2)+20])); - __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<2)+24])); - __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<2)+28])); - vec_t rowB1 = *(vec_t *) & BO[l<<1]; - vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; - vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; - vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; - vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; - vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; - vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; - vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; - __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2); - __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3); - __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4); - __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5); - __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6); - __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7); - __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8); - } - for (l = (temp & (~7)); l < temp; ++l) - { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); - vec_t rowB1 = *(vec_t *) & BO[l<<1]; - __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); - } - SAVE_ACC_COMPLEX_21_1 - AO += temp << 2; - BO += temp << 1; - CO += 4; + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12])); + __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<2)+16])); + __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<2)+20])); + __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<2)+24])); + __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<2)+28])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_21_1 + AO += temp << 2; + BO += temp << 1; + CO += 4; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (2, 1) + REFRESH_AFTER_SAVE (2, 1) #endif - } - if (m & 1) - { + } + if (m & 1) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (1, 1) + REFRESH_POINTERS (1, 1) #else - BO = B; - temp = k; + BO = B; + temp = k; #endif - // RIP OUT MMA STUFF! - SET_ACC_ZERO() - for (l = 0; l < (temp & (~7)); l+=8) - { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); - __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2])); - __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4])); - __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6])); - __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<1)+8])); - __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<1)+10])); - __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<1)+12])); - __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<1)+14])); - vec_t rowB1 = *(vec_t *) & BO[l<<1]; - vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; - vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; - vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; - vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; - vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; - vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; - vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; - __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2); - __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3); - __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4); - __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5); - __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6); - __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7); - __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8); - } - for (l = (temp & (~7)); l < temp; ++l) - { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); - vec_t rowB1 = *(vec_t *) & BO[l<<1]; - __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); - } - SAVE_ACC_COMPLEX_11 - AO += temp << 1; - BO += temp << 1; - CO += 2; + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6])); + __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<1)+8])); + __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<1)+10])); + __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<1)+12])); + __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<1)+14])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_11 + AO += temp << 1; + BO += temp << 1; + CO += 2; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (1, 1) + REFRESH_AFTER_SAVE (1, 1) #endif - } + } #if defined(TRMMKERNEL) && !defined(LEFT) off += 1; // number of values in A #endif B += k << 1; - } - return 0; + } + return 0; } From 8698f9e37f63bd0a57b8182aaecb46cac2adf620 Mon Sep 17 00:00:00 2001 From: Dmitry Mikushin Date: Sat, 10 Feb 2024 19:12:16 +0100 Subject: [PATCH 03/23] Adding basic support of benchmarks into CMake for single, double, single complex and double complex cases. Each benchmarking target has a suffix to identify the data type, for example ./benchmark_gemm3m_COMPLEX_DOUBLE is a gemm3m.c source compiled with COMPLEX and DOUBLE macros defined --- CMakeLists.txt | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5a1e4b271..059035ac3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -452,6 +452,34 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") endif() endif() +if (BUILD_TESTING) + find_package(OpenMP REQUIRED) + file(GLOB SOURCES "benchmark/*.c") + foreach(source ${SOURCES}) + get_filename_component(name ${source} NAME_WE) + if ((NOT ${name} STREQUAL "zdot-intel") AND (NOT ${name} STREQUAL "cula_wrapper")) + set(defines DEFAULT COMPLEX DOUBLE "COMPLEX\;DOUBLE") + foreach(define ${defines}) + set(target_name "benchmark_${name}") + if (NOT "${define}" STREQUAL "DEFAULT") + string(JOIN "_" define_str ${define}) + set(target_name "${target_name}_${define_str}") + endif() + if ((NOT ${target_name} STREQUAL "benchmark_imax_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_imax_COMPLEX_DOUBLE") AND + (NOT ${target_name} STREQUAL "benchmark_imin_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_imin_COMPLEX_DOUBLE") AND + (NOT ${target_name} STREQUAL "benchmark_max_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_max_COMPLEX_DOUBLE") AND + (NOT ${target_name} STREQUAL "benchmark_min_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_min_COMPLEX_DOUBLE")) + add_executable(${target_name} ${source}) + target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) + target_link_libraries(${target_name} ${OpenBLAS_LIBNAME} OpenMP::OpenMP_C) + if (NOT "${define}" STREQUAL "DEFAULT") + target_compile_definitions(${target_name} PRIVATE ${define}) + endif() + endif() + endforeach() + endif() + endforeach() +endif() # Install project From 0ce794f0c363754e0a9c7c3b2d997e9fe7465c12 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 25 Feb 2024 23:38:36 +0100 Subject: [PATCH 04/23] Enable GEMM3M tests on supported platforms --- ctest/CMakeLists.txt | 46 +++++++++++++++++++++++++++++++++++++++++++- ctest/Makefile | 36 ++++++++++++++++++++++++++++++++-- 2 files changed, 79 insertions(+), 3 deletions(-) diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt index 6e0a7f309..d7baadee4 100644 --- a/ctest/CMakeLists.txt +++ b/ctest/CMakeLists.txt @@ -10,6 +10,11 @@ if (CMAKE_Fortran_COMPILER_ID STREQUAL GNU) set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fno-tree-vectorize") endif() +set (USE_GEMM3M 0) +if (${ARCH} MATCHES x86|x86_64|ia64|mips) + set(USE_GEMM3M 1) +endif () + if(WIN32) FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1 "$ErrorActionPreference = \"Stop\"\n" @@ -88,6 +93,17 @@ if (NOT NOFORTRAN) auxiliary.c c_xerbla.c constant.c) + if (USE_GEMM3M) + if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z")) + add_executable(x${float_char}cblat3_3m + c_${float_char}blat3_3m.f + c_${float_char}blas3_3m.c + c_${float_char}3chke_3m.c + auxiliary.c + c_xerbla.c + constant.c) + endif() + endif() else() add_executable(x${float_char}cblat3 c_${float_char}blat3c.c @@ -96,6 +112,17 @@ else() auxiliary.c c_xerbla.c constant.c) + if (USE_GEMM3M) + if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z")) + add_executable(x${float_char}cblat3_3m + c_${float_char}blat3c_3m.c + c_${float_char}blas3_3m.c + c_${float_char}3chke_3m.c + auxiliary.c + c_xerbla.c + constant.c) + endif() + endif() endif() target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME}) if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) @@ -105,7 +132,24 @@ endif() if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") target_link_libraries(x${float_char}cblat3 m) endif() + if (USE_GEMM3M) + if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z")) + target_link_libraries(x${float_char}cblat3_3m ${OpenBLAS_LIBNAME}) + if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) + string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") + target_link_libraries(x${float_char}cblat3 omp pthread) + endif() + if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") + target_link_libraries(x${float_char}cblat3_3m m) + endif() + endif() + endif() add_test(NAME "x${float_char}cblat3" COMMAND ${test_helper} $ "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3") - + if (USE_GEMM3M) + if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z")) + add_test(NAME "x${float_char}cblat3_3m" + COMMAND ${test_helper} $ "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3_3m") + endif() + endif() endforeach() diff --git a/ctest/Makefile b/ctest/Makefile index ad960b35a..36682c7b6 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -5,6 +5,24 @@ TOPDIR = .. include $(TOPDIR)/Makefile.system +SUPPORT_GEMM3M = 0 + +ifeq ($(ARCH), x86) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), x86_64) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), ia64) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), MIPS) +SUPPORT_GEMM3M = 1 +endif + override CFLAGS += -DADD$(BU) -DCBLAS ifeq ($(F_COMPILER),GFORTRAN) override FFLAGS += -fno-tree-vectorize @@ -43,7 +61,7 @@ ztestl3o = c_zblas3.o c_z3chke.o auxiliary.o c_xerbla.o constant.o ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o -all :: all1 all2 all3 +all :: all1 all2 all3 all3_3m ifeq ($(BUILD_SINGLE),1) all1targets += xscblat1 @@ -182,8 +200,9 @@ endif endif all3_3m: xzcblat3_3m xccblat3_3m +ifeq ($(SUPPORT_GEMM3M),1) ifeq ($(USE_OPENMP), 1) -ifeq ($(BUILD_SINGLE),1) +ifeq ($(BUILD_COMPLEX),1) OMP_NUM_THREADS=2 ./xccblat3_3m < cin3_3m endif ifeq ($(BUILD_COMPLEX16),1) @@ -197,6 +216,7 @@ ifeq ($(BUILD_COMPLEX16),1) OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m endif endif +endif @@ -271,8 +291,10 @@ xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xccblat2 c_cblat2.o $(ctestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +ifeq ($(SUPPORT_GEMM3M),1) xccblat3_3m: $(ctestl3o_3m) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) +endif else xccblat1: $(ctestl1o) c_cblat1c.o $(TOPDIR)/$(LIBNAME) $(CC) $(CFLAGS) -o xccblat1 c_cblat1c.o $(ctestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) @@ -280,6 +302,10 @@ xccblat2: $(ctestl2o) c_cblat2c.o $(TOPDIR)/$(LIBNAME) $(CC) $(CFLAGS) -o xccblat2 c_cblat2c.o $(ctestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) xccblat3: $(ctestl3o) c_cblat3c.o $(TOPDIR)/$(LIBNAME) $(CC) $(CFLAGS) -o xccblat3 c_cblat3c.o $(ctestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) +ifeq ($(SUPPORT_GEMM3M),1) +xccblat3_3m: $(ctestl3o_3m) c_cblat3c_3m.o $(TOPDIR)/$(LIBNAME) + $(CC) $(CFLAGS) -o xccblat3_3m c_cblat3c_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) +endif endif endif @@ -293,8 +319,10 @@ xzcblat2: $(ztestl2o) c_zblat2.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xzcblat2 c_zblat2.o $(ztestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +ifeq ($(SUPPORT_GEMM3M),1) xzcblat3_3m: $(ztestl3o_3m) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) +endif else xzcblat1: $(ztestl1o) c_zblat1c.o $(TOPDIR)/$(LIBNAME) $(CC) $(CFLAGS) -o xzcblat1 c_zblat1c.o $(ztestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) @@ -302,6 +330,10 @@ xzcblat2: $(ztestl2o) c_zblat2c.o $(TOPDIR)/$(LIBNAME) $(CC) $(CFLAGS) -o xzcblat2 c_zblat2c.o $(ztestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) xzcblat3: $(ztestl3o) c_zblat3c.o $(TOPDIR)/$(LIBNAME) $(CC) $(CFLAGS) -o xzcblat3 c_zblat3c.o $(ztestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) +ifeq ($(SUPPORT_GEMM3M),1) +xzcblat3_3m: $(ztestl3o_3m) c_zblat3c_3m.o $(TOPDIR)/$(LIBNAME) + $(CC) $(CFLAGS) -o xzcblat3_3m c_zblat3c_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) +endif endif endif From ba201c1939fbb2e68ccb384709ee29e9e4559158 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 25 Feb 2024 23:39:24 +0100 Subject: [PATCH 05/23] Enable GEMM3M tests on supported platforms --- test/CMakeLists.txt | 19 ++++++++++++++++++- test/Makefile | 27 +++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index ace20dffc..b4bf36cee 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -21,7 +21,18 @@ endif() if (BUILD_COMPLEX16) list (APPEND OpenBLAS_Tests zblat1 zblat2 zblat3) endif() -message (STATUS CCOMP ${CMAKE_C_COMPILER_ID} FCOMP ${CMAKE_Fortran_COMPILER_ID}) + +set (USE_GEMM3M 0) +if (${ARCH} MATCHES x86|x86_64|ia64|mips) + set(USE_GEMM3M 1) + if (BUILD_COMPLEX) + list (APPEND OpenBLAS_Tests cblat3_3m) + endif () + if (BUILD_COMPLEX16) + list (APPEND OpenBLAS_Tests zblat3_3m) + endif () +endif () + foreach(test_bin ${OpenBLAS_Tests}) add_executable(${test_bin} ${test_bin}.f) target_link_libraries(${test_bin} ${OpenBLAS_LIBNAME}) @@ -82,4 +93,10 @@ add_test(NAME "${float_type}blas2" COMMAND ${helper_prefix} $ "${PROJECT_SOURCE_DIR}/test/${float_type}blat2.dat" ${float_type_upper}BLAT2.SUMM) add_test(NAME "${float_type}blas3" COMMAND ${helper_prefix} $ "${PROJECT_SOURCE_DIR}/test/${float_type}blat3.dat" ${float_type_upper}BLAT3.SUMM) +if (USE_GEMM3M) +if ((${float_type} STREQUAL "c") OR (${float_type} STREQUAL "z")) +add_test(NAME "${float_type}blas3_3m" + COMMAND ${helper_prefix} $ "${PROJECT_SOURCE_DIR}/test/${float_type}blat3_3m.dat" ${float_type_upper}BLAT3_3M.SUMM) +endif() +endif() endforeach() diff --git a/test/Makefile b/test/Makefile index 5a4694ce6..6a50b6c98 100644 --- a/test/Makefile +++ b/test/Makefile @@ -4,6 +4,24 @@ ifeq ($(F_COMPILER),GFORTRAN) override FFLAGS += -fno-tree-vectorize endif +SUPPORT_GEMM3M = 0 + +ifeq ($(ARCH), x86) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), x86_64) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), ia64) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), MIPS) +SUPPORT_GEMM3M = 1 +endif + ifeq ($(NOFORTRAN),1) all :: else @@ -153,11 +171,20 @@ ifeq ($(BUILD_DOUBLE),1) D3=dblat3 endif ifeq ($(BUILD_COMPLEX),1) +ifeq ($(SUPPORT_GEMM3M),1) +C3=cblat3 cblat3_3m +else C3=cblat3 endif +endif ifeq ($(BUILD_COMPLEX16),1) +ifeq ($(SUPPORT_GEMM3M),1) +Z3=zblat3 zblat3_3m +else Z3=zblat3 endif +endif + level3: $(B3) $(S3) $(D3) $(C3) $(Z3) From 87dd1c710e5bec07f72b0e30d8c81db3446c456d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 26 Feb 2024 07:37:30 +0100 Subject: [PATCH 06/23] fix conditional gemm3m build --- ctest/Makefile | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/ctest/Makefile b/ctest/Makefile index 36682c7b6..3a3ee5611 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -61,7 +61,7 @@ ztestl3o = c_zblas3.o c_z3chke.o auxiliary.o c_xerbla.o constant.o ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o -all :: all1 all2 all3 all3_3m +all :: all1 all2 all3 ifeq ($(BUILD_SINGLE),1) all1targets += xscblat1 @@ -162,9 +162,15 @@ all3targets += xdcblat3 endif ifeq ($(BUILD_COMPLEX),1) all3targets += xccblat3 +ifeq ($(USE_GEMM3M),1) +all3targets += xccblat3_3m +endif endif ifeq ($(BUILD_COMPLEX16),1) all3targets += xzcblat3 +ifeq ($(USE_GEMM3M),1) +all3targets += xzcblat3_3m +endif endif all3: $(all3targets) @@ -199,7 +205,6 @@ endif endif endif -all3_3m: xzcblat3_3m xccblat3_3m ifeq ($(SUPPORT_GEMM3M),1) ifeq ($(USE_OPENMP), 1) ifeq ($(BUILD_COMPLEX),1) From 5aaeca2896464a21e8c7f435bb28b5a44378c49e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 26 Feb 2024 09:26:14 +0100 Subject: [PATCH 07/23] fix name --- ctest/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ctest/Makefile b/ctest/Makefile index 3a3ee5611..bbaf96f8e 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -162,13 +162,13 @@ all3targets += xdcblat3 endif ifeq ($(BUILD_COMPLEX),1) all3targets += xccblat3 -ifeq ($(USE_GEMM3M),1) +ifeq ($(SUPPORT_GEMM3M),1) all3targets += xccblat3_3m endif endif ifeq ($(BUILD_COMPLEX16),1) all3targets += xzcblat3 -ifeq ($(USE_GEMM3M),1) +ifeq ($(SUPPORT_GEMM3M),1) all3targets += xzcblat3_3m endif endif From ea167328f112bddc82eb5ab8ab9c330b4e210c36 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 26 Feb 2024 14:14:58 +0100 Subject: [PATCH 08/23] Add f2c-converted sources for GEMM3M tests --- ctest/c_cblat3c_3m.c | 4854 +++++++++++++++++++++++++++++++++++++++++ ctest/c_zblat3c_3m.c | 4897 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 9751 insertions(+) create mode 100644 ctest/c_cblat3c_3m.c create mode 100644 ctest/c_zblat3c_3m.c diff --git a/ctest/c_cblat3c_3m.c b/ctest/c_cblat3c_3m.c new file mode 100644 index 000000000..9cfa26a41 --- /dev/null +++ b/ctest/c_cblat3c_3m.c @@ -0,0 +1,4854 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i= 0; + if (trace) { + o__1.oerr = 0; + o__1.ounit = ntra; + o__1.ofnmlen = 32; + o__1.ofnm = snaps; + o__1.orl = 0; + o__1.osta = 0; + o__1.oacc = 0; + o__1.ofm = 0; + o__1.oblnk = 0; + f_open(&o__1); + } +/* Read the flag that directs rewinding of the snapshot file. */ + s_rsle(&io___7); + do_lio(&c__8, &c__1, (char *)&rewi, (ftnlen)sizeof(logical)); + e_rsle(); + rewi = rewi && trace; +/* Read the flag that directs stopping on any failure. */ + s_rsle(&io___9); + do_lio(&c__8, &c__1, (char *)&sfatal, (ftnlen)sizeof(logical)); + e_rsle(); +/* Read the flag that indicates whether error exits are to be tested. */ + s_rsle(&io___11); + do_lio(&c__8, &c__1, (char *)&tsterr, (ftnlen)sizeof(logical)); + e_rsle(); +/* Read the flag that indicates whether row-major data layout to be tested. */ + s_rsle(&io___13); + do_lio(&c__3, &c__1, (char *)&layout, (ftnlen)sizeof(integer)); + e_rsle(); +/* Read the threshold value of the test ratio */ + s_rsle(&io___15); + do_lio(&c__4, &c__1, (char *)&thresh, (ftnlen)sizeof(real)); + e_rsle(); + +/* Read and check the parameter values for the tests. */ + +/* Values of N */ + s_rsle(&io___17); + do_lio(&c__3, &c__1, (char *)&nidim, (ftnlen)sizeof(integer)); + e_rsle(); + if (nidim < 1 || nidim > 9) { + s_wsfe(&io___19); + do_fio(&c__1, "N", (ftnlen)1); + do_fio(&c__1, (char *)&c__9, (ftnlen)sizeof(integer)); + e_wsfe(); + goto L220; + } + s_rsle(&io___20); + i__1 = nidim; + for (i__ = 1; i__ <= i__1; ++i__) { + do_lio(&c__3, &c__1, (char *)&idim[i__ - 1], (ftnlen)sizeof(integer)); + } + e_rsle(); + i__1 = nidim; + for (i__ = 1; i__ <= i__1; ++i__) { + if (idim[i__ - 1] < 0 || idim[i__ - 1] > 65) { + s_wsfe(&io___23); + do_fio(&c__1, (char *)&c__65, (ftnlen)sizeof(integer)); + e_wsfe(); + goto L220; + } +/* L10: */ + } +/* Values of ALPHA */ + s_rsle(&io___24); + do_lio(&c__3, &c__1, (char *)&nalf, (ftnlen)sizeof(integer)); + e_rsle(); + if (nalf < 1 || nalf > 7) { + s_wsfe(&io___26); + do_fio(&c__1, "ALPHA", (ftnlen)5); + do_fio(&c__1, (char *)&c__7, (ftnlen)sizeof(integer)); + e_wsfe(); + goto L220; + } + s_rsle(&io___27); + i__1 = nalf; + for (i__ = 1; i__ <= i__1; ++i__) { + do_lio(&c__6, &c__1, (char *)&alf[i__ - 1], (ftnlen)sizeof(complex)); + } + e_rsle(); +/* Values of BETA */ + s_rsle(&io___29); + do_lio(&c__3, &c__1, (char *)&nbet, (ftnlen)sizeof(integer)); + e_rsle(); + if (nbet < 1 || nbet > 7) { + s_wsfe(&io___31); + do_fio(&c__1, "BETA", (ftnlen)4); + do_fio(&c__1, (char *)&c__7, (ftnlen)sizeof(integer)); + e_wsfe(); + goto L220; + } + s_rsle(&io___32); + i__1 = nbet; + for (i__ = 1; i__ <= i__1; ++i__) { + do_lio(&c__6, &c__1, (char *)&bet[i__ - 1], (ftnlen)sizeof(complex)); + } + e_rsle(); + +/* Report values of parameters. */ + + s_wsfe(&io___34); + e_wsfe(); + s_wsfe(&io___35); + i__1 = nidim; + for (i__ = 1; i__ <= i__1; ++i__) { + do_fio(&c__1, (char *)&idim[i__ - 1], (ftnlen)sizeof(integer)); + } + e_wsfe(); + s_wsfe(&io___36); + i__1 = nalf; + for (i__ = 1; i__ <= i__1; ++i__) { + do_fio(&c__2, (char *)&alf[i__ - 1], (ftnlen)sizeof(real)); + } + e_wsfe(); + s_wsfe(&io___37); + i__1 = nbet; + for (i__ = 1; i__ <= i__1; ++i__) { + do_fio(&c__2, (char *)&bet[i__ - 1], (ftnlen)sizeof(real)); + } + e_wsfe(); + if (! tsterr) { + s_wsle(&io___38); + e_wsle(); + s_wsfe(&io___39); + e_wsfe(); + } + s_wsle(&io___40); + e_wsle(); + s_wsfe(&io___41); + do_fio(&c__1, (char *)&thresh, (ftnlen)sizeof(real)); + e_wsfe(); + s_wsle(&io___42); + e_wsle(); + rorder = FALSE_; + corder = FALSE_; + if (layout == 2) { + rorder = TRUE_; + corder = TRUE_; + s_wsfe(&io___45); + e_wsfe(); + } else if (layout == 1) { + rorder = TRUE_; + s_wsfe(&io___46); + e_wsfe(); + } else if (layout == 0) { + corder = TRUE_; + s_wsfe(&io___47); + e_wsfe(); + } + s_wsle(&io___48); + e_wsle(); + +/* Read names of subroutines and flags which indicate */ +/* whether they are to be tested. */ + + for (i__ = 1; i__ <= 9; ++i__) { + ltest[i__ - 1] = FALSE_; +/* L20: */ + } +L30: + i__1 = s_rsfe(&io___50); + if (i__1 != 0) { + goto L60; + } + i__1 = do_fio(&c__1, snamet, (ftnlen)13); + if (i__1 != 0) { + goto L60; + } + i__1 = do_fio(&c__1, (char *)<estt, (ftnlen)sizeof(logical)); + if (i__1 != 0) { + goto L60; + } + i__1 = e_rsfe(); + if (i__1 != 0) { + goto L60; + } + for (i__ = 1; i__ <= 9; ++i__) { + if (s_cmp(snamet, snames + (i__ - 1) * 13, (ftnlen)13, (ftnlen)13) == + 0) { + goto L50; + } +/* L40: */ + } + s_wsfe(&io___53); + do_fio(&c__1, snamet, (ftnlen)13); + e_wsfe(); + s_stop("", (ftnlen)0); +L50: + ltest[i__ - 1] = ltestt; + goto L30; + +L60: + cl__1.cerr = 0; + cl__1.cunit = 5; + cl__1.csta = 0; + f_clos(&cl__1); + +/* Compute EPS (the machine precision). */ + + eps = 1.f; +L70: + r__1 = eps + 1.f; + if (sdiff_(&r__1, &c_b91) == 0.f) { + goto L80; + } + eps *= .5f; + goto L70; +L80: + eps += eps; + s_wsfe(&io___55); + do_fio(&c__1, (char *)&eps, (ftnlen)sizeof(real)); + e_wsfe(); + +/* Check the reliability of CMMCH using exact data. */ + + n = 32; + i__1 = n; + for (j = 1; j <= i__1; ++j) { + i__2 = n; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * 65 - 66; +/* Computing MAX */ + i__5 = i__ - j + 1; + i__4 = f2cmax(i__5,0); + ab[i__3].r = (real) i__4, ab[i__3].i = 0.f; +/* L90: */ + } + i__2 = j + 4224; + ab[i__2].r = (real) j, ab[i__2].i = 0.f; + i__2 = (j + 65) * 65 - 65; + ab[i__2].r = (real) j, ab[i__2].i = 0.f; + i__2 = j - 1; + c__[i__2].r = 0.f, c__[i__2].i = 0.f; +/* L100: */ + } + i__1 = n; + for (j = 1; j <= i__1; ++j) { + i__2 = j - 1; + i__3 = j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 1) / 3; + cc[i__2].r = (real) i__3, cc[i__2].i = 0.f; +/* L110: */ + } +/* CC holds the exact result. On exit from CMMCH CT holds */ +/* the result computed by CMMCH. */ + *(unsigned char *)transa = 'N'; + *(unsigned char *)transb = 'N'; + cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + &c__6, &c_true); + same = lce_(cc, ct, &n); + if (! same || err != 0.f) { + s_wsfe(&io___68); + do_fio(&c__1, transa, (ftnlen)1); + do_fio(&c__1, transb, (ftnlen)1); + do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical)); + do_fio(&c__1, (char *)&err, (ftnlen)sizeof(real)); + e_wsfe(); + s_stop("", (ftnlen)0); + } + *(unsigned char *)transb = 'C'; + cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + &c__6, &c_true); + same = lce_(cc, ct, &n); + if (! same || err != 0.f) { + s_wsfe(&io___69); + do_fio(&c__1, transa, (ftnlen)1); + do_fio(&c__1, transb, (ftnlen)1); + do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical)); + do_fio(&c__1, (char *)&err, (ftnlen)sizeof(real)); + e_wsfe(); + s_stop("", (ftnlen)0); + } + i__1 = n; + for (j = 1; j <= i__1; ++j) { + i__2 = j + 4224; + i__3 = n - j + 1; + ab[i__2].r = (real) i__3, ab[i__2].i = 0.f; + i__2 = (j + 65) * 65 - 65; + i__3 = n - j + 1; + ab[i__2].r = (real) i__3, ab[i__2].i = 0.f; +/* L120: */ + } + i__1 = n; + for (j = 1; j <= i__1; ++j) { + i__2 = n - j; + i__3 = j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 1) / 3; + cc[i__2].r = (real) i__3, cc[i__2].i = 0.f; +/* L130: */ + } + *(unsigned char *)transa = 'C'; + *(unsigned char *)transb = 'N'; + cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + &c__6, &c_true); + same = lce_(cc, ct, &n); + if (! same || err != 0.f) { + s_wsfe(&io___70); + do_fio(&c__1, transa, (ftnlen)1); + do_fio(&c__1, transb, (ftnlen)1); + do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical)); + do_fio(&c__1, (char *)&err, (ftnlen)sizeof(real)); + e_wsfe(); + s_stop("", (ftnlen)0); + } + *(unsigned char *)transb = 'C'; + cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + &c__6, &c_true); + same = lce_(cc, ct, &n); + if (! same || err != 0.f) { + s_wsfe(&io___71); + do_fio(&c__1, transa, (ftnlen)1); + do_fio(&c__1, transb, (ftnlen)1); + do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical)); + do_fio(&c__1, (char *)&err, (ftnlen)sizeof(real)); + e_wsfe(); + s_stop("", (ftnlen)0); + } + +/* Test each subroutine in turn. */ + + for (isnum = 1; isnum <= 9; ++isnum) { + s_wsle(&io___73); + e_wsle(); + if (! ltest[isnum - 1]) { +/* Subprogram is not to be tested. */ + s_wsfe(&io___74); + do_fio(&c__1, snames + (isnum - 1) * 13, (ftnlen)13); + e_wsfe(); + } else { + s_copy(srnamc_1.srnamt, snames + (isnum - 1) * 13, (ftnlen)13, ( + ftnlen)13); +/* Test error exits. */ + if (tsterr) { + cc3chke_(snames + (isnum - 1) * 13); + s_wsle(&io___75); + e_wsle(); + } +/* Test computations. */ + infoc_1.infot = 0; + infoc_1.ok = TRUE_; + fatal = FALSE_; + switch (isnum) { + case 1: goto L140; + case 2: goto L150; + case 3: goto L150; + case 4: goto L160; + case 5: goto L160; + case 6: goto L170; + case 7: goto L170; + case 8: goto L180; + case 9: goto L180; + } +/* Test CGEMM, 01. */ +L140: + if (corder) { + cchk1_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__0); + } + if (rorder) { + cchk1_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__1); + } + goto L190; +/* Test CHEMM, 02, CSYMM, 03. */ +L150: + if (corder) { + cchk2_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__0); + } + if (rorder) { + cchk2_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__1); + } + goto L190; +/* Test CTRMM, 04, CTRSM, 05. */ +L160: + if (corder) { + cchk3_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, & + c__0); + } + if (rorder) { + cchk3_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, & + c__1); + } + goto L190; +/* Test CHERK, 06, CSYRK, 07. */ +L170: + if (corder) { + cchk4_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__0); + } + if (rorder) { + cchk4_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__1); + } + goto L190; +/* Test CHER2K, 08, CSYR2K, 09. */ +L180: + if (corder) { + cchk5_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, + ct, g, w, &c__0); + } + if (rorder) { + cchk5_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, + ct, g, w, &c__1); + } + goto L190; + +L190: + if (fatal && sfatal) { + goto L210; + } + } +/* L200: */ + } + s_wsfe(&io___82); + e_wsfe(); + goto L230; + +L210: + s_wsfe(&io___83); + e_wsfe(); + goto L230; + +L220: + s_wsfe(&io___84); + e_wsfe(); + +L230: + if (trace) { + cl__1.cerr = 0; + cl__1.cunit = ntra; + cl__1.csta = 0; + f_clos(&cl__1); + } + cl__1.cerr = 0; + cl__1.cunit = 6; + cl__1.csta = 0; + f_clos(&cl__1); + s_stop("", (ftnlen)0); + + +/* End of CBLAT3. */ + + return 0; +} /* MAIN__ */ + +/* Subroutine */ int cchk1_(char *sname, real *eps, real *thresh, integer * + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + integer *nidim, integer *idim, integer *nalf, complex *alf, integer * + nbet, complex *bet, integer *nmax, complex *a, complex *aa, complex * + as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc, + complex *cs, complex *ct, real *g, integer *iorder) +{ + /* Initialized data */ + + static char ich[3] = "NTC"; + + /* Format strings */ + static char fmt_9994[] = "(\002 ****** FATAL ERROR - ERROR-CALL MYEXIT T" + "AKEN ON VALID\002,\002 CALL ******\002)"; + static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" + " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; + static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" + "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " + " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" + "BER:\002)"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4, i__5, i__6, i__7, i__8; + alist al__1; + + /* Local variables */ + complex beta; + integer ldas, ldbs, ldcs; + logical same, null; + integer i__, k, m, n; + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, logical *, + complex *); + complex alpha; + extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, complex *, real *, complex *, + integer *, real *, real *, logical *, integer *, logical *); + logical isame[13], trana, tranb; + integer nargs; + logical reset; + extern /* Subroutine */ int cprcn1_(integer *, integer *, char *, integer + *, char *, char *, integer *, integer *, integer *, complex *, + integer *, integer *, complex *, integer *); + integer ia, ib, ma, mb, na, nb, nc, ik, im, in, ks, ms, ns; + extern logical lceres_(char *, char *, integer *, integer *, complex *, + complex *, integer *); + char tranas[1], tranbs[1], transa[1], transb[1]; + real errmax; + integer ica, icb, laa, lbb, lda, lcc, ldb, ldc; + extern logical lce_(complex *, complex *, integer *); + complex als, bls; + real err; + extern /* Subroutine */ int ccgemm3m_(integer *, char *, char *, integer * + , integer *, integer *, complex *, complex *, integer *, complex * + , integer *, complex *, complex *, integer *); + + /* Fortran I/O blocks */ + static cilist io___128 = { 0, 0, 0, fmt_9994, 0 }; + static cilist io___131 = { 0, 0, 0, fmt_9998, 0 }; + static cilist io___133 = { 0, 0, 0, fmt_10000, 0 }; + static cilist io___134 = { 0, 0, 0, fmt_10001, 0 }; + static cilist io___135 = { 0, 0, 0, fmt_10002, 0 }; + static cilist io___136 = { 0, 0, 0, fmt_10003, 0 }; + static cilist io___137 = { 0, 0, 0, fmt_9996, 0 }; + + + +/* Tests CGEMM. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --idim; + --alf; + --bet; + --g; + --ct; + --cs; + --cc; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --bs; + --bb; + b_dim1 = *nmax; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --as; + --aa; + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + + /* Function Body */ + + nargs = 13; + nc = 0; + reset = TRUE_; + errmax = 0.f; + + i__1 = *nidim; + for (im = 1; im <= i__1; ++im) { + m = idim[im]; + + i__2 = *nidim; + for (in = 1; in <= i__2; ++in) { + n = idim[in]; +/* Set LDC to 1 more than minimum value if room. */ + ldc = m; + if (ldc < *nmax) { + ++ldc; + } +/* Skip tests if not enough room. */ + if (ldc > *nmax) { + goto L100; + } + lcc = ldc * n; + null = n <= 0 || m <= 0; + + i__3 = *nidim; + for (ik = 1; ik <= i__3; ++ik) { + k = idim[ik]; + + for (ica = 1; ica <= 3; ++ica) { + *(unsigned char *)transa = *(unsigned char *)&ich[ica - 1] + ; + trana = *(unsigned char *)transa == 'T' || *(unsigned + char *)transa == 'C'; + + if (trana) { + ma = k; + na = m; + } else { + ma = m; + na = k; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = ma; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L80; + } + laa = lda * na; + +/* Generate the matrix A. */ + + cmake_("ge", " ", " ", &ma, &na, &a[a_offset], nmax, &aa[ + 1], &lda, &reset, &c_b1); + + for (icb = 1; icb <= 3; ++icb) { + *(unsigned char *)transb = *(unsigned char *)&ich[icb + - 1]; + tranb = *(unsigned char *)transb == 'T' || *(unsigned + char *)transb == 'C'; + + if (tranb) { + mb = n; + nb = k; + } else { + mb = k; + nb = n; + } +/* Set LDB to 1 more than minimum value if room. */ + ldb = mb; + if (ldb < *nmax) { + ++ldb; + } +/* Skip tests if not enough room. */ + if (ldb > *nmax) { + goto L70; + } + lbb = ldb * nb; + +/* Generate the matrix B. */ + + cmake_("ge", " ", " ", &mb, &nb, &b[b_offset], nmax, & + bb[1], &ldb, &reset, &c_b1); + + i__4 = *nalf; + for (ia = 1; ia <= i__4; ++ia) { + i__5 = ia; + alpha.r = alf[i__5].r, alpha.i = alf[i__5].i; + + i__5 = *nbet; + for (ib = 1; ib <= i__5; ++ib) { + i__6 = ib; + beta.r = bet[i__6].r, beta.i = bet[i__6].i; + +/* Generate the matrix C. */ + + cmake_("ge", " ", " ", &m, &n, &c__[c_offset], + nmax, &cc[1], &ldc, &reset, &c_b1); + + ++nc; + +/* Save every datum before calling the */ +/* subroutine. */ + + *(unsigned char *)tranas = *(unsigned char *) + transa; + *(unsigned char *)tranbs = *(unsigned char *) + transb; + ms = m; + ns = n; + ks = k; + als.r = alpha.r, als.i = alpha.i; + i__6 = laa; + for (i__ = 1; i__ <= i__6; ++i__) { + i__7 = i__; + i__8 = i__; + as[i__7].r = aa[i__8].r, as[i__7].i = aa[ + i__8].i; +/* L10: */ + } + ldas = lda; + i__6 = lbb; + for (i__ = 1; i__ <= i__6; ++i__) { + i__7 = i__; + i__8 = i__; + bs[i__7].r = bb[i__8].r, bs[i__7].i = bb[ + i__8].i; +/* L20: */ + } + ldbs = ldb; + bls.r = beta.r, bls.i = beta.i; + i__6 = lcc; + for (i__ = 1; i__ <= i__6; ++i__) { + i__7 = i__; + i__8 = i__; + cs[i__7].r = cc[i__8].r, cs[i__7].i = cc[ + i__8].i; +/* L30: */ + } + ldcs = ldc; + +/* Call the subroutine. */ + + if (*trace) { + cprcn1_(ntra, &nc, sname, iorder, transa, + transb, &m, &n, &k, &alpha, &lda, + &ldb, &beta, &ldc); + } + if (*rewi) { + al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1); + } + ccgemm3m_(iorder, transa, transb, &m, &n, &k, + &alpha, &aa[1], &lda, &bb[1], &ldb, & + beta, &cc[1], &ldc); + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + io___128.ciunit = *nout; + s_wsfe(&io___128); + e_wsfe(); + *fatal = TRUE_; + goto L120; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)transa == *( + unsigned char *)tranas; + isame[1] = *(unsigned char *)transb == *( + unsigned char *)tranbs; + isame[2] = ms == m; + isame[3] = ns == n; + isame[4] = ks == k; + isame[5] = als.r == alpha.r && als.i == + alpha.i; + isame[6] = lce_(&as[1], &aa[1], &laa); + isame[7] = ldas == lda; + isame[8] = lce_(&bs[1], &bb[1], &lbb); + isame[9] = ldbs == ldb; + isame[10] = bls.r == beta.r && bls.i == + beta.i; + if (null) { + isame[11] = lce_(&cs[1], &cc[1], &lcc); + } else { + isame[11] = lceres_("ge", " ", &m, &n, & + cs[1], &cc[1], &ldc); + } + isame[12] = ldcs == ldc; + +/* If data was incorrectly changed, report */ +/* and return. */ + + same = TRUE_; + i__6 = nargs; + for (i__ = 1; i__ <= i__6; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + io___131.ciunit = *nout; + s_wsfe(&io___131); + do_fio(&c__1, (char *)&i__, (ftnlen) + sizeof(integer)); + e_wsfe(); + } +/* L40: */ + } + if (! same) { + *fatal = TRUE_; + goto L120; + } + + if (! null) { + +/* Check the result. */ + + cmmch_(transa, transb, &m, &n, &k, &alpha, + &a[a_offset], nmax, &b[b_offset], + nmax, &beta, &c__[c_offset], + nmax, &ct[1], &g[1], &cc[1], &ldc, + eps, &err, fatal, nout, &c_true); + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L120; + } + } + +/* L50: */ + } + +/* L60: */ + } + +L70: + ; + } + +L80: + ; + } + +/* L90: */ + } + +L100: + ; + } + +/* L110: */ + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + io___133.ciunit = *nout; + s_wsfe(&io___133); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + if (*iorder == 1) { + io___134.ciunit = *nout; + s_wsfe(&io___134); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + } else { + if (*iorder == 0) { + io___135.ciunit = *nout; + s_wsfe(&io___135); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); + e_wsfe(); + } + if (*iorder == 1) { + io___136.ciunit = *nout; + s_wsfe(&io___136); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); + e_wsfe(); + } + } + goto L130; + +L120: + io___137.ciunit = *nout; + s_wsfe(&io___137); + do_fio(&c__1, sname, (ftnlen)13); + e_wsfe(); + cprcn1_(nout, &nc, sname, iorder, transa, transb, &m, &n, &k, &alpha, & + lda, &ldb, &beta, &ldc); + +L130: + return 0; + +/* L9995: */ + +/* End of CCHK1. */ + +} /* cchk1_ */ + + +/* Subroutine */ int cprcn1_(integer *nout, integer *nc, char *sname, integer + *iorder, char *transa, char *transb, integer *m, integer *n, integer * + k, complex *alpha, integer *lda, integer *ldb, complex *beta, integer + *ldc) +{ + /* Format strings */ + static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a14,\002,\002," + "a14,\002,\002,a14,\002,\002)"; + static char fmt_9994[] = "(10x,3(i3,\002,\002),\002 (\002,f4.1,\002,\002" + ",f4.1,\002) , A,\002,i3,\002, B,\002,i3,\002, (\002,f4.1,\002" + ",\002,f4.1,\002) , C,\002,i3,\002).\002)"; + + /* Local variables */ + char crc[14], cta[14], ctb[14]; + + /* Fortran I/O blocks */ + static cilist io___141 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___142 = { 0, 0, 0, fmt_9994, 0 }; + + + if (*(unsigned char *)transa == 'N') { + s_copy(cta, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(cta, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cta, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transb == 'N') { + s_copy(ctb, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transb == 'T') { + s_copy(ctb, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ctb, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + io___141.ciunit = *nout; + s_wsfe(&io___141); + do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, crc, (ftnlen)14); + do_fio(&c__1, cta, (ftnlen)14); + do_fio(&c__1, ctb, (ftnlen)14); + e_wsfe(); + io___142.ciunit = *nout; + s_wsfe(&io___142); + do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real)); + do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(real)); + do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); + e_wsfe(); + return 0; +} /* cprcn1_ */ + + +/* Subroutine */ int cchk2_(char *sname, real *eps, real *thresh, integer * + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + integer *nidim, integer *idim, integer *nalf, complex *alf, integer * + nbet, complex *bet, integer *nmax, complex *a, complex *aa, complex * + as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc, + complex *cs, complex *ct, real *g, integer *iorder) +{ + /* Initialized data */ + + static char ichs[2] = "LR"; + static char ichu[2] = "UL"; + + /* Format strings */ + static char fmt_9994[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " + "TAKEN ON VALID\002,\002 CALL *******\002)"; + static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" + " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; + static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" + "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " + " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" + "BER:\002)"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4, i__5, i__6, i__7; + alist al__1; + + /* Local variables */ + complex beta; + integer ldas, ldbs, ldcs; + logical same; + char side[1]; + logical conj, left, null; + char uplo[1]; + integer i__, m, n; + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, logical *, + complex *); + complex alpha; + extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, complex *, real *, complex *, + integer *, real *, real *, logical *, integer *, logical *); + logical isame[13]; + char sides[1]; + integer nargs; + logical reset; + char uplos[1]; + extern /* Subroutine */ int cprcn2_(integer *, integer *, char *, integer + *, char *, char *, integer *, integer *, complex *, integer *, + integer *, complex *, integer *); + integer ia, ib, na, nc, im, in; + extern /* Subroutine */ int cchemm_(integer *, char *, char *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *); + integer ms, ns; + extern logical lceres_(char *, char *, integer *, integer *, complex *, + complex *, integer *); + extern /* Subroutine */ int ccsymm_(integer *, char *, char *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *); + real errmax; + integer laa, lbb, lda, lcc, ldb, ldc; + extern logical lce_(complex *, complex *, integer *); + integer ics; + complex als, bls; + integer icu; + real err; + + /* Fortran I/O blocks */ + static cilist io___181 = { 0, 0, 0, fmt_9994, 0 }; + static cilist io___184 = { 0, 0, 0, fmt_9998, 0 }; + static cilist io___186 = { 0, 0, 0, fmt_10000, 0 }; + static cilist io___187 = { 0, 0, 0, fmt_10001, 0 }; + static cilist io___188 = { 0, 0, 0, fmt_10002, 0 }; + static cilist io___189 = { 0, 0, 0, fmt_10003, 0 }; + static cilist io___190 = { 0, 0, 0, fmt_9996, 0 }; + + + +/* Tests CHEMM and CSYMM. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --idim; + --alf; + --bet; + --g; + --ct; + --cs; + --cc; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --bs; + --bb; + b_dim1 = *nmax; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --as; + --aa; + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + + /* Function Body */ + conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; + + nargs = 12; + nc = 0; + reset = TRUE_; + errmax = 0.f; + + i__1 = *nidim; + for (im = 1; im <= i__1; ++im) { + m = idim[im]; + + i__2 = *nidim; + for (in = 1; in <= i__2; ++in) { + n = idim[in]; +/* Set LDC to 1 more than minimum value if room. */ + ldc = m; + if (ldc < *nmax) { + ++ldc; + } +/* Skip tests if not enough room. */ + if (ldc > *nmax) { + goto L90; + } + lcc = ldc * n; + null = n <= 0 || m <= 0; +/* Set LDB to 1 more than minimum value if room. */ + ldb = m; + if (ldb < *nmax) { + ++ldb; + } +/* Skip tests if not enough room. */ + if (ldb > *nmax) { + goto L90; + } + lbb = ldb * n; + +/* Generate the matrix B. */ + + cmake_("ge", " ", " ", &m, &n, &b[b_offset], nmax, &bb[1], &ldb, & + reset, &c_b1); + + for (ics = 1; ics <= 2; ++ics) { + *(unsigned char *)side = *(unsigned char *)&ichs[ics - 1]; + left = *(unsigned char *)side == 'L'; + + if (left) { + na = m; + } else { + na = n; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = na; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L80; + } + laa = lda * na; + + for (icu = 1; icu <= 2; ++icu) { + *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1]; + +/* Generate the hermitian or symmetric matrix A. */ + + cmake_(sname + 7, uplo, " ", &na, &na, &a[a_offset], nmax, + &aa[1], &lda, &reset, &c_b1); + + i__3 = *nalf; + for (ia = 1; ia <= i__3; ++ia) { + i__4 = ia; + alpha.r = alf[i__4].r, alpha.i = alf[i__4].i; + + i__4 = *nbet; + for (ib = 1; ib <= i__4; ++ib) { + i__5 = ib; + beta.r = bet[i__5].r, beta.i = bet[i__5].i; + +/* Generate the matrix C. */ + + cmake_("ge", " ", " ", &m, &n, &c__[c_offset], + nmax, &cc[1], &ldc, &reset, &c_b1); + + ++nc; + +/* Save every datum before calling the */ +/* subroutine. */ + + *(unsigned char *)sides = *(unsigned char *)side; + *(unsigned char *)uplos = *(unsigned char *)uplo; + ms = m; + ns = n; + als.r = alpha.r, als.i = alpha.i; + i__5 = laa; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7] + .i; +/* L10: */ + } + ldas = lda; + i__5 = lbb; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + bs[i__6].r = bb[i__7].r, bs[i__6].i = bb[i__7] + .i; +/* L20: */ + } + ldbs = ldb; + bls.r = beta.r, bls.i = beta.i; + i__5 = lcc; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7] + .i; +/* L30: */ + } + ldcs = ldc; + +/* Call the subroutine. */ + + if (*trace) { + cprcn2_(ntra, &nc, sname, iorder, side, uplo, + &m, &n, &alpha, &lda, &ldb, &beta, & + ldc) + ; + } + if (*rewi) { + al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1); + } + if (conj) { + cchemm_(iorder, side, uplo, &m, &n, &alpha, & + aa[1], &lda, &bb[1], &ldb, &beta, &cc[ + 1], &ldc); + } else { + ccsymm_(iorder, side, uplo, &m, &n, &alpha, & + aa[1], &lda, &bb[1], &ldb, &beta, &cc[ + 1], &ldc); + } + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + io___181.ciunit = *nout; + s_wsfe(&io___181); + e_wsfe(); + *fatal = TRUE_; + goto L110; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)sides == *(unsigned + char *)side; + isame[1] = *(unsigned char *)uplos == *(unsigned + char *)uplo; + isame[2] = ms == m; + isame[3] = ns == n; + isame[4] = als.r == alpha.r && als.i == alpha.i; + isame[5] = lce_(&as[1], &aa[1], &laa); + isame[6] = ldas == lda; + isame[7] = lce_(&bs[1], &bb[1], &lbb); + isame[8] = ldbs == ldb; + isame[9] = bls.r == beta.r && bls.i == beta.i; + if (null) { + isame[10] = lce_(&cs[1], &cc[1], &lcc); + } else { + isame[10] = lceres_("ge", " ", &m, &n, &cs[1], + &cc[1], &ldc); + } + isame[11] = ldcs == ldc; + +/* If data was incorrectly changed, report and */ +/* return. */ + + same = TRUE_; + i__5 = nargs; + for (i__ = 1; i__ <= i__5; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + io___184.ciunit = *nout; + s_wsfe(&io___184); + do_fio(&c__1, (char *)&i__, (ftnlen) + sizeof(integer)); + e_wsfe(); + } +/* L40: */ + } + if (! same) { + *fatal = TRUE_; + goto L110; + } + + if (! null) { + +/* Check the result. */ + + if (left) { + cmmch_("N", "N", &m, &n, &m, &alpha, &a[ + a_offset], nmax, &b[b_offset], + nmax, &beta, &c__[c_offset], nmax, + &ct[1], &g[1], &cc[1], &ldc, eps, + &err, fatal, nout, &c_true); + } else { + cmmch_("N", "N", &m, &n, &n, &alpha, &b[ + b_offset], nmax, &a[a_offset], + nmax, &beta, &c__[c_offset], nmax, + &ct[1], &g[1], &cc[1], &ldc, eps, + &err, fatal, nout, &c_true); + } + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L110; + } + } + +/* L50: */ + } + +/* L60: */ + } + +/* L70: */ + } + +L80: + ; + } + +L90: + ; + } + +/* L100: */ + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + io___186.ciunit = *nout; + s_wsfe(&io___186); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + if (*iorder == 1) { + io___187.ciunit = *nout; + s_wsfe(&io___187); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + } else { + if (*iorder == 0) { + io___188.ciunit = *nout; + s_wsfe(&io___188); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); + e_wsfe(); + } + if (*iorder == 1) { + io___189.ciunit = *nout; + s_wsfe(&io___189); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); + e_wsfe(); + } + } + goto L120; + +L110: + io___190.ciunit = *nout; + s_wsfe(&io___190); + do_fio(&c__1, sname, (ftnlen)13); + e_wsfe(); + cprcn2_(nout, &nc, sname, iorder, side, uplo, &m, &n, &alpha, &lda, &ldb, + &beta, &ldc); + +L120: + return 0; + +/* L9995: */ + +/* End of CCHK2. */ + +} /* cchk2_ */ + + +/* Subroutine */ int cprcn2_(integer *nout, integer *nc, char *sname, integer + *iorder, char *side, char *uplo, integer *m, integer *n, complex * + alpha, integer *lda, integer *ldb, complex *beta, integer *ldc) +{ + /* Format strings */ + static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a14,\002,\002," + "a14,\002,\002,a14,\002,\002)"; + static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002" + ",f4.1,\002), A,\002,i3,\002, B,\002,i3,\002, (\002,f4.1,\002," + "\002,f4.1,\002), \002,\002C,\002,i3,\002).\002)"; + + /* Local variables */ + char cs[14], cu[14], crc[14]; + + /* Fortran I/O blocks */ + static cilist io___194 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___195 = { 0, 0, 0, fmt_9994, 0 }; + + + if (*(unsigned char *)side == 'L') { + s_copy(cs, " CblasLeft", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cs, " CblasRight", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + io___194.ciunit = *nout; + s_wsfe(&io___194); + do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, crc, (ftnlen)14); + do_fio(&c__1, cs, (ftnlen)14); + do_fio(&c__1, cu, (ftnlen)14); + e_wsfe(); + io___195.ciunit = *nout; + s_wsfe(&io___195); + do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real)); + do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(real)); + do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); + e_wsfe(); + return 0; +} /* cprcn2_ */ + + +/* Subroutine */ int cchk3_(char *sname, real *eps, real *thresh, integer * + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + integer *nidim, integer *idim, integer *nalf, complex *alf, integer * + nmax, complex *a, complex *aa, complex *as, complex *b, complex *bb, + complex *bs, complex *ct, real *g, complex *c__, integer *iorder) +{ + /* Initialized data */ + + static char ichu[2] = "UL"; + static char icht[3] = "NTC"; + static char ichd[2] = "UN"; + static char ichs[2] = "LR"; + + /* Format strings */ + static char fmt_9994[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " + "TAKEN ON VALID\002,\002 CALL *******\002)"; + static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" + " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; + static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" + "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " + " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" + "BER:\002)"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4, i__5, i__6, i__7; + complex q__1; + alist al__1; + + /* Local variables */ + char diag[1]; + integer ldas, ldbs; + logical same; + char side[1]; + logical left, null; + char uplo[1]; + integer i__, j, m, n; + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, logical *, + complex *); + complex alpha; + char diags[1]; + extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, complex *, real *, complex *, + integer *, real *, real *, logical *, integer *, logical *); + logical isame[13]; + char sides[1]; + integer nargs; + logical reset; + char uplos[1]; + extern /* Subroutine */ int cprcn3_(integer *, integer *, char *, integer + *, char *, char *, char *, char *, integer *, integer *, complex * + , integer *, integer *); + integer ia, na, nc, im, in, ms, ns; + extern logical lceres_(char *, char *, integer *, integer *, complex *, + complex *, integer *); + extern /* Subroutine */ int cctrmm_(integer *, char *, char *, char *, + char *, integer *, integer *, complex *, complex *, integer *, + complex *, integer *); + char tranas[1], transa[1]; + extern /* Subroutine */ int cctrsm_(integer *, char *, char *, char *, + char *, integer *, integer *, complex *, complex *, integer *, + complex *, integer *); + real errmax; + integer laa, icd, lbb, lda, ldb; + extern logical lce_(complex *, complex *, integer *); + integer ics; + complex als; + integer ict, icu; + real err; + + /* Fortran I/O blocks */ + static cilist io___236 = { 0, 0, 0, fmt_9994, 0 }; + static cilist io___239 = { 0, 0, 0, fmt_9998, 0 }; + static cilist io___241 = { 0, 0, 0, fmt_10000, 0 }; + static cilist io___242 = { 0, 0, 0, fmt_10001, 0 }; + static cilist io___243 = { 0, 0, 0, fmt_10002, 0 }; + static cilist io___244 = { 0, 0, 0, fmt_10003, 0 }; + static cilist io___245 = { 0, 0, 0, fmt_9996, 0 }; + + + +/* Tests CTRMM and CTRSM. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --idim; + --alf; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --g; + --ct; + --bs; + --bb; + b_dim1 = *nmax; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --as; + --aa; + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + + /* Function Body */ + + nargs = 11; + nc = 0; + reset = TRUE_; + errmax = 0.f; +/* Set up zero matrix for CMMCH. */ + i__1 = *nmax; + for (j = 1; j <= i__1; ++j) { + i__2 = *nmax; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * c_dim1; + c__[i__3].r = 0.f, c__[i__3].i = 0.f; +/* L10: */ + } +/* L20: */ + } + + i__1 = *nidim; + for (im = 1; im <= i__1; ++im) { + m = idim[im]; + + i__2 = *nidim; + for (in = 1; in <= i__2; ++in) { + n = idim[in]; +/* Set LDB to 1 more than minimum value if room. */ + ldb = m; + if (ldb < *nmax) { + ++ldb; + } +/* Skip tests if not enough room. */ + if (ldb > *nmax) { + goto L130; + } + lbb = ldb * n; + null = m <= 0 || n <= 0; + + for (ics = 1; ics <= 2; ++ics) { + *(unsigned char *)side = *(unsigned char *)&ichs[ics - 1]; + left = *(unsigned char *)side == 'L'; + if (left) { + na = m; + } else { + na = n; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = na; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L130; + } + laa = lda * na; + + for (icu = 1; icu <= 2; ++icu) { + *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1]; + + for (ict = 1; ict <= 3; ++ict) { + *(unsigned char *)transa = *(unsigned char *)&icht[ + ict - 1]; + + for (icd = 1; icd <= 2; ++icd) { + *(unsigned char *)diag = *(unsigned char *)&ichd[ + icd - 1]; + + i__3 = *nalf; + for (ia = 1; ia <= i__3; ++ia) { + i__4 = ia; + alpha.r = alf[i__4].r, alpha.i = alf[i__4].i; + +/* Generate the matrix A. */ + + cmake_("tr", uplo, diag, &na, &na, &a[ + a_offset], nmax, &aa[1], &lda, &reset, + &c_b1); + +/* Generate the matrix B. */ + + cmake_("ge", " ", " ", &m, &n, &b[b_offset], + nmax, &bb[1], &ldb, &reset, &c_b1); + + ++nc; + +/* Save every datum before calling the */ +/* subroutine. */ + + *(unsigned char *)sides = *(unsigned char *) + side; + *(unsigned char *)uplos = *(unsigned char *) + uplo; + *(unsigned char *)tranas = *(unsigned char *) + transa; + *(unsigned char *)diags = *(unsigned char *) + diag; + ms = m; + ns = n; + als.r = alpha.r, als.i = alpha.i; + i__4 = laa; + for (i__ = 1; i__ <= i__4; ++i__) { + i__5 = i__; + i__6 = i__; + as[i__5].r = aa[i__6].r, as[i__5].i = aa[ + i__6].i; +/* L30: */ + } + ldas = lda; + i__4 = lbb; + for (i__ = 1; i__ <= i__4; ++i__) { + i__5 = i__; + i__6 = i__; + bs[i__5].r = bb[i__6].r, bs[i__5].i = bb[ + i__6].i; +/* L40: */ + } + ldbs = ldb; + +/* Call the subroutine. */ + + if (s_cmp(sname + 9, "mm", (ftnlen)2, (ftnlen) + 2) == 0) { + if (*trace) { + cprcn3_(ntra, &nc, sname, iorder, + side, uplo, transa, diag, &m, + &n, &alpha, &lda, &ldb, ( + ftnlen)13, (ftnlen)1, (ftnlen) + 1, (ftnlen)1, (ftnlen)1); + } + if (*rewi) { + al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1); + } + cctrmm_(iorder, side, uplo, transa, diag, + &m, &n, &alpha, &aa[1], &lda, &bb[ + 1], &ldb); + } else if (s_cmp(sname + 9, "sm", (ftnlen)2, ( + ftnlen)2) == 0) { + if (*trace) { + cprcn3_(ntra, &nc, sname, iorder, + side, uplo, transa, diag, &m, + &n, &alpha, &lda, &ldb, ( + ftnlen)13, (ftnlen)1, (ftnlen) + 1, (ftnlen)1, (ftnlen)1); + } + if (*rewi) { + al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1); + } + cctrsm_(iorder, side, uplo, transa, diag, + &m, &n, &alpha, &aa[1], &lda, &bb[ + 1], &ldb); + } + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + io___236.ciunit = *nout; + s_wsfe(&io___236); + e_wsfe(); + *fatal = TRUE_; + goto L150; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)sides == *( + unsigned char *)side; + isame[1] = *(unsigned char *)uplos == *( + unsigned char *)uplo; + isame[2] = *(unsigned char *)tranas == *( + unsigned char *)transa; + isame[3] = *(unsigned char *)diags == *( + unsigned char *)diag; + isame[4] = ms == m; + isame[5] = ns == n; + isame[6] = als.r == alpha.r && als.i == + alpha.i; + isame[7] = lce_(&as[1], &aa[1], &laa); + isame[8] = ldas == lda; + if (null) { + isame[9] = lce_(&bs[1], &bb[1], &lbb); + } else { + isame[9] = lceres_("ge", " ", &m, &n, &bs[ + 1], &bb[1], &ldb); + } + isame[10] = ldbs == ldb; + +/* If data was incorrectly changed, report and */ +/* return. */ + + same = TRUE_; + i__4 = nargs; + for (i__ = 1; i__ <= i__4; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + io___239.ciunit = *nout; + s_wsfe(&io___239); + do_fio(&c__1, (char *)&i__, (ftnlen) + sizeof(integer)); + e_wsfe(); + } +/* L50: */ + } + if (! same) { + *fatal = TRUE_; + goto L150; + } + + if (! null) { + if (s_cmp(sname + 9, "mm", (ftnlen)2, ( + ftnlen)2) == 0) { + +/* Check the result. */ + + if (left) { + cmmch_(transa, "N", &m, &n, &m, & + alpha, &a[a_offset], nmax, + &b[b_offset], nmax, & + c_b1, &c__[c_offset], + nmax, &ct[1], &g[1], &bb[ + 1], &ldb, eps, &err, + fatal, nout, &c_true, ( + ftnlen)1, (ftnlen)1); + } else { + cmmch_("N", transa, &m, &n, &n, & + alpha, &b[b_offset], nmax, + &a[a_offset], nmax, & + c_b1, &c__[c_offset], + nmax, &ct[1], &g[1], &bb[ + 1], &ldb, eps, &err, + fatal, nout, &c_true); + } + } else if (s_cmp(sname + 9, "sm", (ftnlen) + 2, (ftnlen)2) == 0) { + +/* Compute approximation to original */ +/* matrix. */ + + i__4 = n; + for (j = 1; j <= i__4; ++j) { + i__5 = m; + for (i__ = 1; i__ <= i__5; ++i__) + { + i__6 = i__ + j * c_dim1; + i__7 = i__ + (j - 1) * ldb; + c__[i__6].r = bb[i__7].r, c__[i__6].i = bb[i__7].i; + i__6 = i__ + (j - 1) * ldb; + i__7 = i__ + j * b_dim1; + q__1.r = alpha.r * b[i__7].r - alpha.i * b[i__7].i, + q__1.i = alpha.r * b[i__7].i + alpha.i * b[ + i__7].r; + bb[i__6].r = q__1.r, bb[i__6].i = q__1.i; +/* L60: */ + } +/* L70: */ + } + + if (left) { + cmmch_(transa, "N", &m, &n, &m, & + c_b2, &a[a_offset], nmax, + &c__[c_offset], nmax, & + c_b1, &b[b_offset], nmax, + &ct[1], &g[1], &bb[1], & + ldb, eps, &err, fatal, + nout, &c_false); + } else { + cmmch_("N", transa, &m, &n, &n, & + c_b2, &c__[c_offset], + nmax, &a[a_offset], nmax, + &c_b1, &b[b_offset], nmax, + &ct[1], &g[1], &bb[1], & + ldb, eps, &err, fatal, + nout, &c_false); + } + } + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L150; + } + } + +/* L80: */ + } + +/* L90: */ + } + +/* L100: */ + } + +/* L110: */ + } + +/* L120: */ + } + +L130: + ; + } + +/* L140: */ + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + io___241.ciunit = *nout; + s_wsfe(&io___241); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + if (*iorder == 1) { + io___242.ciunit = *nout; + s_wsfe(&io___242); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + } else { + if (*iorder == 0) { + io___243.ciunit = *nout; + s_wsfe(&io___243); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); + e_wsfe(); + } + if (*iorder == 1) { + io___244.ciunit = *nout; + s_wsfe(&io___244); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); + e_wsfe(); + } + } + goto L160; + +L150: + io___245.ciunit = *nout; + s_wsfe(&io___245); + do_fio(&c__1, sname, (ftnlen)13); + e_wsfe(); + if (*trace) { + cprcn3_(ntra, &nc, sname, iorder, side, uplo, transa, diag, &m, &n, & + alpha, &lda, &ldb); + } + +L160: + return 0; + +/* L9995: */ + +/* End of CCHK3. */ + +} /* cchk3_ */ + + +/* Subroutine */ int cprcn3_(integer *nout, integer *nc, char *sname, integer + *iorder, char *side, char *uplo, char *transa, char *diag, integer *m, + integer *n, complex *alpha, integer *lda, integer *ldb) +{ + /* Format strings */ + static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a14,\002,\002," + "a14,\002,\002,a14,\002,\002)"; + static char fmt_9994[] = "(10x,2(a14,\002,\002),2(i3,\002,\002),\002 " + "(\002,f4.1,\002,\002,f4.1,\002), A,\002,i3,\002, B,\002,i3,\002)." + "\002)"; + + /* Local variables */ + char ca[14], cd[14], cs[14], cu[14], crc[14]; + + /* Fortran I/O blocks */ + static cilist io___251 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___252 = { 0, 0, 0, fmt_9994, 0 }; + + + if (*(unsigned char *)side == 'L') { + s_copy(cs, " CblasLeft", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cs, " CblasRight", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)diag == 'N') { + s_copy(cd, " CblasNonUnit", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cd, " CblasUnit", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + io___251.ciunit = *nout; + s_wsfe(&io___251); + do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, crc, (ftnlen)14); + do_fio(&c__1, cs, (ftnlen)14); + do_fio(&c__1, cu, (ftnlen)14); + e_wsfe(); + io___252.ciunit = *nout; + s_wsfe(&io___252); + do_fio(&c__1, ca, (ftnlen)14); + do_fio(&c__1, cd, (ftnlen)14); + do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real)); + do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); + e_wsfe(); + return 0; +} /* cprcn3_ */ + + +/* Subroutine */ int cchk4_(char *sname, real *eps, real *thresh, integer * + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + integer *nidim, integer *idim, integer *nalf, complex *alf, integer * + nbet, complex *bet, integer *nmax, complex *a, complex *aa, complex * + as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc, + complex *cs, complex *ct, real *g, integer *iorder) +{ + /* Initialized data */ + + static char icht[2] = "NC"; + static char ichu[2] = "UL"; + + /* Format strings */ + static char fmt_9992[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " + "TAKEN ON VALID\002,\002 CALL *******\002)"; + static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" + " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; + static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" + "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " + " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_9995[] = "(\002 THESE ARE THE RESULTS FOR COLUMN" + " \002,i3)"; + static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" + "BER:\002)"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4, i__5, i__6, i__7; + complex q__1; + alist al__1; + + /* Local variables */ + complex beta; + integer ldas, ldcs; + logical same, conj; + complex bets; + real rals; + logical tran, null; + char uplo[1]; + integer i__, j, k, n; + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, logical *, + complex *); + complex alpha; + extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, complex *, real *, complex *, + integer *, real *, real *, logical *, integer *, logical *); + real rbeta; + logical isame[13]; + integer nargs; + real rbets; + logical reset; + char trans[1]; + logical upper; + char uplos[1]; + extern /* Subroutine */ int cprcn4_(integer *, integer *, char *, integer + *, char *, char *, integer *, integer *, complex *, integer *, + complex *, integer *), cprcn6_(integer *, + integer *, char *, integer *, char *, char *, integer *, integer * + , real *, integer *, real *, integer *); + integer ia, ib, jc, ma, na, nc, ik, in, jj, lj, ks; + extern /* Subroutine */ int ccherk_(integer *, char *, char *, integer *, + integer *, real *, complex *, integer *, real *, complex *, + integer *); + integer ns; + real ralpha; + extern logical lceres_(char *, char *, integer *, integer *, complex *, + complex *, integer *); + real errmax; + extern /* Subroutine */ int ccsyrk_(integer *, char *, char *, integer *, + integer *, complex *, complex *, integer *, complex *, complex *, + integer *); + char transs[1], transt[1]; + integer laa, lda, lcc, ldc; + extern logical lce_(complex *, complex *, integer *); + complex als; + integer ict, icu; + real err; + + /* Fortran I/O blocks */ + static cilist io___294 = { 0, 0, 0, fmt_9992, 0 }; + static cilist io___297 = { 0, 0, 0, fmt_9998, 0 }; + static cilist io___304 = { 0, 0, 0, fmt_10000, 0 }; + static cilist io___305 = { 0, 0, 0, fmt_10001, 0 }; + static cilist io___306 = { 0, 0, 0, fmt_10002, 0 }; + static cilist io___307 = { 0, 0, 0, fmt_10003, 0 }; + static cilist io___308 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___309 = { 0, 0, 0, fmt_9996, 0 }; + + + +/* Tests CHERK and CSYRK. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --idim; + --alf; + --bet; + --g; + --ct; + --cs; + --cc; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --bs; + --bb; + b_dim1 = *nmax; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --as; + --aa; + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + + /* Function Body */ + conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; + + nargs = 10; + nc = 0; + reset = TRUE_; + errmax = 0.f; + + i__1 = *nidim; + for (in = 1; in <= i__1; ++in) { + n = idim[in]; +/* Set LDC to 1 more than minimum value if room. */ + ldc = n; + if (ldc < *nmax) { + ++ldc; + } +/* Skip tests if not enough room. */ + if (ldc > *nmax) { + goto L100; + } + lcc = ldc * n; + + i__2 = *nidim; + for (ik = 1; ik <= i__2; ++ik) { + k = idim[ik]; + + for (ict = 1; ict <= 2; ++ict) { + *(unsigned char *)trans = *(unsigned char *)&icht[ict - 1]; + tran = *(unsigned char *)trans == 'C'; + if (tran && ! conj) { + *(unsigned char *)trans = 'T'; + } + if (tran) { + ma = k; + na = n; + } else { + ma = n; + na = k; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = ma; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L80; + } + laa = lda * na; + +/* Generate the matrix A. */ + + cmake_("ge", " ", " ", &ma, &na, &a[a_offset], nmax, &aa[1], & + lda, &reset, &c_b1); + + for (icu = 1; icu <= 2; ++icu) { + *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1]; + upper = *(unsigned char *)uplo == 'U'; + + i__3 = *nalf; + for (ia = 1; ia <= i__3; ++ia) { + i__4 = ia; + alpha.r = alf[i__4].r, alpha.i = alf[i__4].i; + if (conj) { + ralpha = alpha.r; + q__1.r = ralpha, q__1.i = 0.f; + alpha.r = q__1.r, alpha.i = q__1.i; + } + + i__4 = *nbet; + for (ib = 1; ib <= i__4; ++ib) { + i__5 = ib; + beta.r = bet[i__5].r, beta.i = bet[i__5].i; + if (conj) { + rbeta = beta.r; + q__1.r = rbeta, q__1.i = 0.f; + beta.r = q__1.r, beta.i = q__1.i; + } + null = n <= 0; + if (conj) { + null = null || (k <= 0 || ralpha == 0.f) && + rbeta == 1.f; + } + +/* Generate the matrix C. */ + + cmake_(sname + 7, uplo, " ", &n, &n, &c__[ + c_offset], nmax, &cc[1], &ldc, &reset, & + c_b1); + + ++nc; + +/* Save every datum before calling the subroutine. */ + + *(unsigned char *)uplos = *(unsigned char *)uplo; + *(unsigned char *)transs = *(unsigned char *) + trans; + ns = n; + ks = k; + if (conj) { + rals = ralpha; + } else { + als.r = alpha.r, als.i = alpha.i; + } + i__5 = laa; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7] + .i; +/* L10: */ + } + ldas = lda; + if (conj) { + rbets = rbeta; + } else { + bets.r = beta.r, bets.i = beta.i; + } + i__5 = lcc; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7] + .i; +/* L20: */ + } + ldcs = ldc; + +/* Call the subroutine. */ + + if (conj) { + if (*trace) { + cprcn6_(ntra, &nc, sname, iorder, uplo, + trans, &n, &k, &ralpha, &lda, & + rbeta, &ldc); + } + if (*rewi) { + al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1); + } + ccherk_(iorder, uplo, trans, &n, &k, &ralpha, + &aa[1], &lda, &rbeta, &cc[1], &ldc); + } else { + if (*trace) { + cprcn4_(ntra, &nc, sname, iorder, uplo, + trans, &n, &k, &alpha, &lda, & + beta, &ldc); + } + if (*rewi) { + al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1); + } + ccsyrk_(iorder, uplo, trans, &n, &k, &alpha, & + aa[1], &lda, &beta, &cc[1], &ldc); + } + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + io___294.ciunit = *nout; + s_wsfe(&io___294); + e_wsfe(); + *fatal = TRUE_; + goto L120; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)uplos == *(unsigned + char *)uplo; + isame[1] = *(unsigned char *)transs == *(unsigned + char *)trans; + isame[2] = ns == n; + isame[3] = ks == k; + if (conj) { + isame[4] = rals == ralpha; + } else { + isame[4] = als.r == alpha.r && als.i == + alpha.i; + } + isame[5] = lce_(&as[1], &aa[1], &laa); + isame[6] = ldas == lda; + if (conj) { + isame[7] = rbets == rbeta; + } else { + isame[7] = bets.r == beta.r && bets.i == + beta.i; + } + if (null) { + isame[8] = lce_(&cs[1], &cc[1], &lcc); + } else { + isame[8] = lceres_(sname + 7, uplo, &n, &n, & + cs[1], &cc[1], &ldc); + } + isame[9] = ldcs == ldc; + +/* If data was incorrectly changed, report and */ +/* return. */ + + same = TRUE_; + i__5 = nargs; + for (i__ = 1; i__ <= i__5; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + io___297.ciunit = *nout; + s_wsfe(&io___297); + do_fio(&c__1, (char *)&i__, (ftnlen) + sizeof(integer)); + e_wsfe(); + } +/* L30: */ + } + if (! same) { + *fatal = TRUE_; + goto L120; + } + + if (! null) { + +/* Check the result column by column. */ + + if (conj) { + *(unsigned char *)transt = 'C'; + } else { + *(unsigned char *)transt = 'T'; + } + jc = 1; + i__5 = n; + for (j = 1; j <= i__5; ++j) { + if (upper) { + jj = 1; + lj = j; + } else { + jj = j; + lj = n - j + 1; + } + if (tran) { + cmmch_(transt, "N", &lj, &c__1, &k, & + alpha, &a[jj * a_dim1 + 1], + nmax, &a[j * a_dim1 + 1], + nmax, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, + fatal, nout, &c_true); + } else { + cmmch_("N", transt, &lj, &c__1, &k, & + alpha, &a[jj + a_dim1], nmax, + &a[j + a_dim1], nmax, &beta, & + c__[jj + j * c_dim1], nmax, & + ct[1], &g[1], &cc[jc], &ldc, + eps, &err, fatal, nout, & + c_true); + } + if (upper) { + jc += ldc; + } else { + jc = jc + ldc + 1; + } + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L110; + } +/* L40: */ + } + } + +/* L50: */ + } + +/* L60: */ + } + +/* L70: */ + } + +L80: + ; + } + +/* L90: */ + } + +L100: + ; + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + io___304.ciunit = *nout; + s_wsfe(&io___304); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + if (*iorder == 1) { + io___305.ciunit = *nout; + s_wsfe(&io___305); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + } else { + if (*iorder == 0) { + io___306.ciunit = *nout; + s_wsfe(&io___306); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); + e_wsfe(); + } + if (*iorder == 1) { + io___307.ciunit = *nout; + s_wsfe(&io___307); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); + e_wsfe(); + } + } + goto L130; + +L110: + if (n > 1) { + io___308.ciunit = *nout; + s_wsfe(&io___308); + do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer)); + e_wsfe(); + } + +L120: + io___309.ciunit = *nout; + s_wsfe(&io___309); + do_fio(&c__1, sname, (ftnlen)13); + e_wsfe(); + if (conj) { + cprcn6_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &ralpha, &lda, + &rbeta, &ldc); + } else { + cprcn4_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, & + beta, &ldc); + } + +L130: + return 0; + +/* L9994: */ +/* L9993: */ + +/* End of CCHK4. */ + +} /* cchk4_ */ + + +/* Subroutine */ int cprcn4_(integer *nout, integer *nc, char *sname, integer + *iorder, char *uplo, char *transa, integer *n, integer *k, complex * + alpha, integer *lda, complex *beta, integer *ldc) +{ + /* Format strings */ + static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a14,\002," + "\002))"; + static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002" + ",f4.1,\002), A,\002,i3,\002, (\002,f4.1,\002,\002,f4.1,\002), C" + ",\002,i3,\002).\002)"; + + /* Local variables */ + char ca[14], cu[14], crc[14]; + + /* Fortran I/O blocks */ + static cilist io___313 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___314 = { 0, 0, 0, fmt_9994, 0 }; + + + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + io___313.ciunit = *nout; + s_wsfe(&io___313); + do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, crc, (ftnlen)14); + do_fio(&c__1, cu, (ftnlen)14); + do_fio(&c__1, ca, (ftnlen)14); + e_wsfe(); + io___314.ciunit = *nout; + s_wsfe(&io___314); + do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real)); + do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(real)); + do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); + e_wsfe(); + return 0; +} /* cprcn4_ */ + + + +/* Subroutine */ int cprcn6_(integer *nout, integer *nc, char *sname, integer + *iorder, char *uplo, char *transa, integer *n, integer *k, real * + alpha, integer *lda, real *beta, integer *ldc) +{ + /* Format strings */ + static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a14,\002," + "\002))"; + static char fmt_9994[] = "(10x,2(i3,\002,\002),f4.1,\002, A,\002,i3" + ",\002,\002,f4.1,\002, C,\002,i3,\002).\002)"; + + /* Local variables */ + char ca[14], cu[14], crc[14]; + + /* Fortran I/O blocks */ + static cilist io___318 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___319 = { 0, 0, 0, fmt_9994, 0 }; + + + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + io___318.ciunit = *nout; + s_wsfe(&io___318); + do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, crc, (ftnlen)14); + do_fio(&c__1, cu, (ftnlen)14); + do_fio(&c__1, ca, (ftnlen)14); + e_wsfe(); + io___319.ciunit = *nout; + s_wsfe(&io___319); + do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*alpha), (ftnlen)sizeof(real)); + do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*beta), (ftnlen)sizeof(real)); + do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); + e_wsfe(); + return 0; +} /* cprcn6_ */ + + +/* Subroutine */ int cchk5_(char *sname, real *eps, real *thresh, integer * + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + integer *nidim, integer *idim, integer *nalf, complex *alf, integer * + nbet, complex *bet, integer *nmax, complex *ab, complex *aa, complex * + as, complex *bb, complex *bs, complex *c__, complex *cc, complex *cs, + complex *ct, real *g, complex *w, integer *iorder) +{ + /* Initialized data */ + + static char icht[2] = "NC"; + static char ichu[2] = "UL"; + + /* Format strings */ + static char fmt_9992[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " + "TAKEN ON VALID\002,\002 CALL *******\002)"; + static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" + " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; + static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" + "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " + " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_9995[] = "(\002 THESE ARE THE RESULTS FOR COLUMN" + " \002,i3)"; + static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" + "BER:\002)"; + + /* System generated locals */ + integer c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8; + complex q__1, q__2; + alist al__1; + + /* Local variables */ + integer jjab; + complex beta; + integer ldas, ldbs, ldcs; + logical same, conj; + complex bets; + logical tran, null; + char uplo[1]; + integer i__, j, k, n; + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, logical *, + complex *); + complex alpha; + extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, complex *, real *, complex *, + integer *, real *, real *, logical *, integer *, logical *); + real rbeta; + logical isame[13]; + integer nargs; + real rbets; + logical reset; + char trans[1]; + logical upper; + char uplos[1]; + extern /* Subroutine */ int cprcn5_(integer *, integer *, char *, integer + *, char *, char *, integer *, integer *, complex *, integer *, + integer *, complex *, integer *), cprcn7_( + integer *, integer *, char *, integer *, char *, char *, integer * + , integer *, complex *, integer *, integer *, real *, integer *); + integer ia, ib, jc, ma, na, nc, ik, in, jj, lj, ks, ns; + extern logical lceres_(char *, char *, integer *, integer *, complex *, + complex *, integer *); + real errmax; + char transs[1], transt[1]; + extern /* Subroutine */ int ccher2k_(integer *, char *, char *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + real *, complex *, integer *); + integer laa, lbb, lda, lcc, ldb, ldc; + extern logical lce_(complex *, complex *, integer *); + extern /* Subroutine */ int ccsyr2k_(integer *, char *, char *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *); + complex als; + integer ict, icu; + real err; + + /* Fortran I/O blocks */ + static cilist io___362 = { 0, 0, 0, fmt_9992, 0 }; + static cilist io___365 = { 0, 0, 0, fmt_9998, 0 }; + static cilist io___373 = { 0, 0, 0, fmt_10000, 0 }; + static cilist io___374 = { 0, 0, 0, fmt_10001, 0 }; + static cilist io___375 = { 0, 0, 0, fmt_10002, 0 }; + static cilist io___376 = { 0, 0, 0, fmt_10003, 0 }; + static cilist io___377 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___378 = { 0, 0, 0, fmt_9996, 0 }; + + + +/* Tests CHER2K and CSYR2K. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --idim; + --alf; + --bet; + --w; + --g; + --ct; + --cs; + --cc; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --bs; + --bb; + --as; + --aa; + --ab; + + /* Function Body */ + conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; + + nargs = 12; + nc = 0; + reset = TRUE_; + errmax = 0.f; + + i__1 = *nidim; + for (in = 1; in <= i__1; ++in) { + n = idim[in]; +/* Set LDC to 1 more than minimum value if room. */ + ldc = n; + if (ldc < *nmax) { + ++ldc; + } +/* Skip tests if not enough room. */ + if (ldc > *nmax) { + goto L130; + } + lcc = ldc * n; + + i__2 = *nidim; + for (ik = 1; ik <= i__2; ++ik) { + k = idim[ik]; + + for (ict = 1; ict <= 2; ++ict) { + *(unsigned char *)trans = *(unsigned char *)&icht[ict - 1]; + tran = *(unsigned char *)trans == 'C'; + if (tran && ! conj) { + *(unsigned char *)trans = 'T'; + } + if (tran) { + ma = k; + na = n; + } else { + ma = n; + na = k; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = ma; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L110; + } + laa = lda * na; + +/* Generate the matrix A. */ + + if (tran) { + i__3 = *nmax << 1; + cmake_("ge", " ", " ", &ma, &na, &ab[1], &i__3, &aa[1], & + lda, &reset, &c_b1); + } else { + cmake_("ge", " ", " ", &ma, &na, &ab[1], nmax, &aa[1], & + lda, &reset, &c_b1); + } + +/* Generate the matrix B. */ + + ldb = lda; + lbb = laa; + if (tran) { + i__3 = *nmax << 1; + cmake_("ge", " ", " ", &ma, &na, &ab[k + 1], &i__3, &bb[1] + , &ldb, &reset, &c_b1); + } else { + cmake_("ge", " ", " ", &ma, &na, &ab[k * *nmax + 1], nmax, + &bb[1], &ldb, &reset, &c_b1); + } + + for (icu = 1; icu <= 2; ++icu) { + *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1]; + upper = *(unsigned char *)uplo == 'U'; + + i__3 = *nalf; + for (ia = 1; ia <= i__3; ++ia) { + i__4 = ia; + alpha.r = alf[i__4].r, alpha.i = alf[i__4].i; + + i__4 = *nbet; + for (ib = 1; ib <= i__4; ++ib) { + i__5 = ib; + beta.r = bet[i__5].r, beta.i = bet[i__5].i; + if (conj) { + rbeta = beta.r; + q__1.r = rbeta, q__1.i = 0.f; + beta.r = q__1.r, beta.i = q__1.i; + } + null = n <= 0; + if (conj) { + null = null || (k <= 0 || alpha.r == 0.f && + alpha.i == 0.f) && rbeta == 1.f; + } + +/* Generate the matrix C. */ + + cmake_(sname + 7, uplo, " ", &n, &n, &c__[ + c_offset], nmax, &cc[1], &ldc, &reset, & + c_b1); + + ++nc; + +/* Save every datum before calling the subroutine. */ + + *(unsigned char *)uplos = *(unsigned char *)uplo; + *(unsigned char *)transs = *(unsigned char *) + trans; + ns = n; + ks = k; + als.r = alpha.r, als.i = alpha.i; + i__5 = laa; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7] + .i; +/* L10: */ + } + ldas = lda; + i__5 = lbb; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + bs[i__6].r = bb[i__7].r, bs[i__6].i = bb[i__7] + .i; +/* L20: */ + } + ldbs = ldb; + if (conj) { + rbets = rbeta; + } else { + bets.r = beta.r, bets.i = beta.i; + } + i__5 = lcc; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7] + .i; +/* L30: */ + } + ldcs = ldc; + +/* Call the subroutine. */ + + if (conj) { + if (*trace) { + cprcn7_(ntra, &nc, sname, iorder, uplo, + trans, &n, &k, &alpha, &lda, &ldb, + &rbeta, &ldc); + } + if (*rewi) { + al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1); + } + ccher2k_(iorder, uplo, trans, &n, &k, &alpha, + &aa[1], &lda, &bb[1], &ldb, &rbeta, & + cc[1], &ldc); + } else { + if (*trace) { + cprcn5_(ntra, &nc, sname, iorder, uplo, + trans, &n, &k, &alpha, &lda, &ldb, + &beta, &ldc); + } + if (*rewi) { + al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1); + } + ccsyr2k_(iorder, uplo, trans, &n, &k, &alpha, + &aa[1], &lda, &bb[1], &ldb, &beta, & + cc[1], &ldc); + } + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + io___362.ciunit = *nout; + s_wsfe(&io___362); + e_wsfe(); + *fatal = TRUE_; + goto L150; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)uplos == *(unsigned + char *)uplo; + isame[1] = *(unsigned char *)transs == *(unsigned + char *)trans; + isame[2] = ns == n; + isame[3] = ks == k; + isame[4] = als.r == alpha.r && als.i == alpha.i; + isame[5] = lce_(&as[1], &aa[1], &laa); + isame[6] = ldas == lda; + isame[7] = lce_(&bs[1], &bb[1], &lbb); + isame[8] = ldbs == ldb; + if (conj) { + isame[9] = rbets == rbeta; + } else { + isame[9] = bets.r == beta.r && bets.i == + beta.i; + } + if (null) { + isame[10] = lce_(&cs[1], &cc[1], &lcc); + } else { + isame[10] = lceres_("he", uplo, &n, &n, &cs[1] + , &cc[1], &ldc); + } + isame[11] = ldcs == ldc; + +/* If data was incorrectly changed, report and */ +/* return. */ + + same = TRUE_; + i__5 = nargs; + for (i__ = 1; i__ <= i__5; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + io___365.ciunit = *nout; + s_wsfe(&io___365); + do_fio(&c__1, (char *)&i__, (ftnlen) + sizeof(integer)); + e_wsfe(); + } +/* L40: */ + } + if (! same) { + *fatal = TRUE_; + goto L150; + } + + if (! null) { + +/* Check the result column by column. */ + + if (conj) { + *(unsigned char *)transt = 'C'; + } else { + *(unsigned char *)transt = 'T'; + } + jjab = 1; + jc = 1; + i__5 = n; + for (j = 1; j <= i__5; ++j) { + if (upper) { + jj = 1; + lj = j; + } else { + jj = j; + lj = n - j + 1; + } + if (tran) { + i__6 = k; + for (i__ = 1; i__ <= i__6; ++i__) { + i__7 = i__; + i__8 = (j - 1 << 1) * *nmax + k + + i__; + q__1.r = alpha.r * ab[i__8].r - + alpha.i * ab[i__8].i, + q__1.i = alpha.r * ab[ + i__8].i + alpha.i * ab[ + i__8].r; + w[i__7].r = q__1.r, w[i__7].i = + q__1.i; + if (conj) { + i__7 = k + i__; + r_cnjg(&q__2, &alpha); + i__8 = (j - 1 << 1) * *nmax + i__; + q__1.r = q__2.r * ab[i__8].r - q__2.i * ab[i__8].i, + q__1.i = q__2.r * ab[i__8].i + q__2.i * ab[ + i__8].r; + w[i__7].r = q__1.r, w[i__7].i = q__1.i; + } else { + i__7 = k + i__; + i__8 = (j - 1 << 1) * *nmax + i__; + q__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] + .i, q__1.i = alpha.r * ab[i__8].i + alpha.i + * ab[i__8].r; + w[i__7].r = q__1.r, w[i__7].i = q__1.i; + } +/* L50: */ + } + i__6 = k << 1; + i__7 = *nmax << 1; + i__8 = *nmax << 1; + cmmch_(transt, "N", &lj, &c__1, &i__6, + &c_b2, &ab[jjab], &i__7, &w[ + 1], &i__8, &beta, &c__[jj + j + * c_dim1], nmax, &ct[1], &g[1] + , &cc[jc], &ldc, eps, &err, + fatal, nout, &c_true); + } else { + i__6 = k; + for (i__ = 1; i__ <= i__6; ++i__) { + if (conj) { + i__7 = i__; + r_cnjg(&q__2, &ab[(k + i__ - 1) * *nmax + j]); + q__1.r = alpha.r * q__2.r - alpha.i * q__2.i, + q__1.i = alpha.r * q__2.i + alpha.i * + q__2.r; + w[i__7].r = q__1.r, w[i__7].i = q__1.i; + i__7 = k + i__; + i__8 = (i__ - 1) * *nmax + j; + q__2.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] + .i, q__2.i = alpha.r * ab[i__8].i + alpha.i + * ab[i__8].r; + r_cnjg(&q__1, &q__2); + w[i__7].r = q__1.r, w[i__7].i = q__1.i; + } else { + i__7 = i__; + i__8 = (k + i__ - 1) * *nmax + j; + q__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] + .i, q__1.i = alpha.r * ab[i__8].i + alpha.i + * ab[i__8].r; + w[i__7].r = q__1.r, w[i__7].i = q__1.i; + i__7 = k + i__; + i__8 = (i__ - 1) * *nmax + j; + q__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] + .i, q__1.i = alpha.r * ab[i__8].i + alpha.i + * ab[i__8].r; + w[i__7].r = q__1.r, w[i__7].i = q__1.i; + } +/* L60: */ + } + i__6 = k << 1; + i__7 = *nmax << 1; + cmmch_("N", "N", &lj, &c__1, &i__6, & + c_b2, &ab[jj], nmax, &w[1], & + i__7, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, + fatal, nout, &c_true); + } + if (upper) { + jc += ldc; + } else { + jc = jc + ldc + 1; + if (tran) { + jjab += *nmax << 1; + } + } + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L140; + } +/* L70: */ + } + } + +/* L80: */ + } + +/* L90: */ + } + +/* L100: */ + } + +L110: + ; + } + +/* L120: */ + } + +L130: + ; + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + io___373.ciunit = *nout; + s_wsfe(&io___373); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + if (*iorder == 1) { + io___374.ciunit = *nout; + s_wsfe(&io___374); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + } else { + if (*iorder == 0) { + io___375.ciunit = *nout; + s_wsfe(&io___375); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); + e_wsfe(); + } + if (*iorder == 1) { + io___376.ciunit = *nout; + s_wsfe(&io___376); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); + e_wsfe(); + } + } + goto L160; + +L140: + if (n > 1) { + io___377.ciunit = *nout; + s_wsfe(&io___377); + do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer)); + e_wsfe(); + } + +L150: + io___378.ciunit = *nout; + s_wsfe(&io___378); + do_fio(&c__1, sname, (ftnlen)13); + e_wsfe(); + if (conj) { + cprcn7_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, & + ldb, &rbeta, &ldc); + } else { + cprcn5_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, & + ldb, &beta, &ldc); + } + +L160: + return 0; + +/* L9994: */ +/* L9993: */ + +/* End of CCHK5. */ + +} /* cchk5_ */ + + +/* Subroutine */ int cprcn5_(integer *nout, integer *nc, char *sname, integer + *iorder, char *uplo, char *transa, integer *n, integer *k, complex * + alpha, integer *lda, integer *ldb, complex *beta, integer *ldc) +{ + /* Format strings */ + static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a14,\002," + "\002))"; + static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002" + ",f4.1,\002), A,\002,i3,\002, B\002,i3,\002, (\002,f4.1,\002,\002" + ",f4.1,\002), C,\002,i3,\002).\002)"; + + /* Local variables */ + char ca[14], cu[14], crc[14]; + + /* Fortran I/O blocks */ + static cilist io___382 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___383 = { 0, 0, 0, fmt_9994, 0 }; + + + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + io___382.ciunit = *nout; + s_wsfe(&io___382); + do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, crc, (ftnlen)14); + do_fio(&c__1, cu, (ftnlen)14); + do_fio(&c__1, ca, (ftnlen)14); + e_wsfe(); + io___383.ciunit = *nout; + s_wsfe(&io___383); + do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real)); + do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(real)); + do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); + e_wsfe(); + return 0; +} /* cprcn5_ */ + + + +/* Subroutine */ int cprcn7_(integer *nout, integer *nc, char *sname, integer + *iorder, char *uplo, char *transa, integer *n, integer *k, complex * + alpha, integer *lda, integer *ldb, real *beta, integer *ldc) +{ + /* Format strings */ + static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a14,\002," + "\002))"; + static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002" + ",f4.1,\002), A,\002,i3,\002, B\002,i3,\002,\002,f4.1,\002, C," + "\002,i3,\002).\002)"; + + /* Local variables */ + char ca[14], cu[14], crc[14]; + + /* Fortran I/O blocks */ + static cilist io___387 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___388 = { 0, 0, 0, fmt_9994, 0 }; + + + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + io___387.ciunit = *nout; + s_wsfe(&io___387); + do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, crc, (ftnlen)14); + do_fio(&c__1, cu, (ftnlen)14); + do_fio(&c__1, ca, (ftnlen)14); + e_wsfe(); + io___388.ciunit = *nout; + s_wsfe(&io___388); + do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real)); + do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*beta), (ftnlen)sizeof(real)); + do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); + e_wsfe(); + return 0; +} /* cprcn7_ */ + + +/* Subroutine */ int cmake_(char *type__, char *uplo, char *diag, integer *m, + integer *n, complex *a, integer *nmax, complex *aa, integer *lda, + logical *reset, complex *transl) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3, i__4; + real r__1; + complex q__1, q__2; + + /* Local variables */ + extern /* Complex */ VOID cbeg_(complex *, logical *); + integer ibeg, iend; + logical unit; + integer i__, j; + logical lower, upper; + integer jj; + logical gen, her, tri, sym; + + +/* Generates values for an M by N matrix A. */ +/* Stores the values in the array AA in the data structure required */ +/* by the routine, with unwanted elements set to rogue value. */ + +/* TYPE is 'ge', 'he', 'sy' or 'tr'. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + --aa; + + /* Function Body */ + gen = s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0; + her = s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0; + sym = s_cmp(type__, "sy", (ftnlen)2, (ftnlen)2) == 0; + tri = s_cmp(type__, "tr", (ftnlen)2, (ftnlen)2) == 0; + upper = (her || sym || tri) && *(unsigned char *)uplo == 'U'; + lower = (her || sym || tri) && *(unsigned char *)uplo == 'L'; + unit = tri && *(unsigned char *)diag == 'U'; + +/* Generate data in array A. */ + + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + if (gen || upper && i__ <= j || lower && i__ >= j) { + i__3 = i__ + j * a_dim1; + cbeg_(&q__2, reset); + q__1.r = q__2.r + transl->r, q__1.i = q__2.i + transl->i; + a[i__3].r = q__1.r, a[i__3].i = q__1.i; + if (i__ != j) { +/* Set some elements to zero */ + if (*n > 3 && j == *n / 2) { + i__3 = i__ + j * a_dim1; + a[i__3].r = 0.f, a[i__3].i = 0.f; + } + if (her) { + i__3 = j + i__ * a_dim1; + r_cnjg(&q__1, &a[i__ + j * a_dim1]); + a[i__3].r = q__1.r, a[i__3].i = q__1.i; + } else if (sym) { + i__3 = j + i__ * a_dim1; + i__4 = i__ + j * a_dim1; + a[i__3].r = a[i__4].r, a[i__3].i = a[i__4].i; + } else if (tri) { + i__3 = j + i__ * a_dim1; + a[i__3].r = 0.f, a[i__3].i = 0.f; + } + } + } +/* L10: */ + } + if (her) { + i__2 = j + j * a_dim1; + i__3 = j + j * a_dim1; + r__1 = a[i__3].r; + q__1.r = r__1, q__1.i = 0.f; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; + } + if (tri) { + i__2 = j + j * a_dim1; + i__3 = j + j * a_dim1; + q__1.r = a[i__3].r + 1.f, q__1.i = a[i__3].i + 0.f; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; + } + if (unit) { + i__2 = j + j * a_dim1; + a[i__2].r = 1.f, a[i__2].i = 0.f; + } +/* L20: */ + } + +/* Store elements in array AS in data structure required by routine. */ + + if (s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + i__4 = i__ + j * a_dim1; + aa[i__3].r = a[i__4].r, aa[i__3].i = a[i__4].i; +/* L30: */ + } + i__2 = *lda; + for (i__ = *m + 1; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + aa[i__3].r = -1e10f, aa[i__3].i = 1e10f; +/* L40: */ + } +/* L50: */ + } + } else if (s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__, + "sy", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__, "tr", (ftnlen) + 2, (ftnlen)2) == 0) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (upper) { + ibeg = 1; + if (unit) { + iend = j - 1; + } else { + iend = j; + } + } else { + if (unit) { + ibeg = j + 1; + } else { + ibeg = j; + } + iend = *n; + } + i__2 = ibeg - 1; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + aa[i__3].r = -1e10f, aa[i__3].i = 1e10f; +/* L60: */ + } + i__2 = iend; + for (i__ = ibeg; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + i__4 = i__ + j * a_dim1; + aa[i__3].r = a[i__4].r, aa[i__3].i = a[i__4].i; +/* L70: */ + } + i__2 = *lda; + for (i__ = iend + 1; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + aa[i__3].r = -1e10f, aa[i__3].i = 1e10f; +/* L80: */ + } + if (her) { + jj = j + (j - 1) * *lda; + i__2 = jj; + i__3 = jj; + r__1 = aa[i__3].r; + q__1.r = r__1, q__1.i = -1e10f; + aa[i__2].r = q__1.r, aa[i__2].i = q__1.i; + } +/* L90: */ + } + } + return 0; + +/* End of CMAKE. */ + +} /* cmake_ */ + +/* Subroutine */ int cmmch_(char *transa, char *transb, integer *m, integer * + n, integer *kk, complex *alpha, complex *a, integer *lda, complex *b, + integer *ldb, complex *beta, complex *c__, integer *ldc, complex *ct, + real *g, complex *cc, integer *ldcc, real *eps, real *err, logical * + fatal, integer *nout, logical *mv) +{ + /* Format strings */ + static char fmt_9999[] = "(\002 ******* FATAL ERROR - COMPUTED RESULT IS" + " LESS THAN HAL\002,\002F ACCURATE *******\002,/\002 " + " EXPECTED RE\002,\002SULT COMPUTED R" + "ESULT\002)"; + static char fmt_9998[] = "(1x,i7,2(\002 (\002,g15.6,\002,\002,g15.6," + "\002)\002))"; + static char fmt_9997[] = "(\002 THESE ARE THE RESULTS FOR COLUMN" + " \002,i3)"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, + cc_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; + real r__1, r__2, r__3, r__4, r__5, r__6; + complex q__1, q__2, q__3, q__4; + + /* Local variables */ + real erri; + integer i__, j, k; + logical trana, tranb, ctrana, ctranb; + + /* Fortran I/O blocks */ + static cilist io___409 = { 0, 0, 0, fmt_9999, 0 }; + static cilist io___410 = { 0, 0, 0, fmt_9998, 0 }; + static cilist io___411 = { 0, 0, 0, fmt_9998, 0 }; + static cilist io___412 = { 0, 0, 0, fmt_9997, 0 }; + + + +/* Checks the results of the computational tests. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --ct; + --g; + cc_dim1 = *ldcc; + cc_offset = 1 + cc_dim1 * 1; + cc -= cc_offset; + + /* Function Body */ + trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == + 'C'; + tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == + 'C'; + ctrana = *(unsigned char *)transa == 'C'; + ctranb = *(unsigned char *)transb == 'C'; + +/* Compute expected result, one column at a time, in CT using data */ +/* in A, B and C. */ +/* Compute gauges in G. */ + + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__; + ct[i__3].r = 0.f, ct[i__3].i = 0.f; + g[i__] = 0.f; +/* L10: */ + } + if (! trana && ! tranb) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = i__ + k * a_dim1; + i__7 = k + j * b_dim1; + q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7].i, + q__2.i = a[i__6].r * b[i__7].i + a[i__6].i * b[ + i__7].r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = i__ + k * a_dim1; + i__5 = k + j * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = r_imag( + &a[i__ + k * a_dim1]), abs(r__2))) * ((r__3 = b[ + i__5].r, abs(r__3)) + (r__4 = r_imag(&b[k + j * + b_dim1]), abs(r__4))); +/* L20: */ + } +/* L30: */ + } + } else if (trana && ! tranb) { + if (ctrana) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + r_cnjg(&q__3, &a[k + i__ * a_dim1]); + i__6 = k + j * b_dim1; + q__2.r = q__3.r * b[i__6].r - q__3.i * b[i__6].i, + q__2.i = q__3.r * b[i__6].i + q__3.i * b[i__6] + .r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = k + i__ * a_dim1; + i__5 = k + j * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + r_imag(&a[k + i__ * a_dim1]), abs(r__2))) * (( + r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag( + &b[k + j * b_dim1]), abs(r__4))); +/* L40: */ + } +/* L50: */ + } + } else { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = k + i__ * a_dim1; + i__7 = k + j * b_dim1; + q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7] + .i, q__2.i = a[i__6].r * b[i__7].i + a[i__6] + .i * b[i__7].r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = k + i__ * a_dim1; + i__5 = k + j * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + r_imag(&a[k + i__ * a_dim1]), abs(r__2))) * (( + r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag( + &b[k + j * b_dim1]), abs(r__4))); +/* L60: */ + } +/* L70: */ + } + } + } else if (! trana && tranb) { + if (ctranb) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = i__ + k * a_dim1; + r_cnjg(&q__3, &b[j + k * b_dim1]); + q__2.r = a[i__6].r * q__3.r - a[i__6].i * q__3.i, + q__2.i = a[i__6].r * q__3.i + a[i__6].i * + q__3.r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = i__ + k * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + r_imag(&a[i__ + k * a_dim1]), abs(r__2))) * (( + r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag( + &b[j + k * b_dim1]), abs(r__4))); +/* L80: */ + } +/* L90: */ + } + } else { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = i__ + k * a_dim1; + i__7 = j + k * b_dim1; + q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7] + .i, q__2.i = a[i__6].r * b[i__7].i + a[i__6] + .i * b[i__7].r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = i__ + k * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + r_imag(&a[i__ + k * a_dim1]), abs(r__2))) * (( + r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag( + &b[j + k * b_dim1]), abs(r__4))); +/* L100: */ + } +/* L110: */ + } + } + } else if (trana && tranb) { + if (ctrana) { + if (ctranb) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + r_cnjg(&q__3, &a[k + i__ * a_dim1]); + r_cnjg(&q__4, &b[j + k * b_dim1]); + q__2.r = q__3.r * q__4.r - q__3.i * q__4.i, + q__2.i = q__3.r * q__4.i + q__3.i * + q__4.r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = k + i__ * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + r_imag(&a[k + i__ * a_dim1]), abs(r__2))) + * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 + = r_imag(&b[j + k * b_dim1]), abs(r__4))); +/* L120: */ + } +/* L130: */ + } + } else { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + r_cnjg(&q__3, &a[k + i__ * a_dim1]); + i__6 = j + k * b_dim1; + q__2.r = q__3.r * b[i__6].r - q__3.i * b[i__6].i, + q__2.i = q__3.r * b[i__6].i + q__3.i * b[ + i__6].r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = k + i__ * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + r_imag(&a[k + i__ * a_dim1]), abs(r__2))) + * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 + = r_imag(&b[j + k * b_dim1]), abs(r__4))); +/* L140: */ + } +/* L150: */ + } + } + } else { + if (ctranb) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = k + i__ * a_dim1; + r_cnjg(&q__3, &b[j + k * b_dim1]); + q__2.r = a[i__6].r * q__3.r - a[i__6].i * q__3.i, + q__2.i = a[i__6].r * q__3.i + a[i__6].i * + q__3.r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = k + i__ * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + r_imag(&a[k + i__ * a_dim1]), abs(r__2))) + * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 + = r_imag(&b[j + k * b_dim1]), abs(r__4))); +/* L160: */ + } +/* L170: */ + } + } else { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = k + i__ * a_dim1; + i__7 = j + k * b_dim1; + q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[ + i__7].i, q__2.i = a[i__6].r * b[i__7].i + + a[i__6].i * b[i__7].r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = k + i__ * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + r_imag(&a[k + i__ * a_dim1]), abs(r__2))) + * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 + = r_imag(&b[j + k * b_dim1]), abs(r__4))); +/* L180: */ + } +/* L190: */ + } + } + } + } + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__; + i__4 = i__; + q__2.r = alpha->r * ct[i__4].r - alpha->i * ct[i__4].i, q__2.i = + alpha->r * ct[i__4].i + alpha->i * ct[i__4].r; + i__5 = i__ + j * c_dim1; + q__3.r = beta->r * c__[i__5].r - beta->i * c__[i__5].i, q__3.i = + beta->r * c__[i__5].i + beta->i * c__[i__5].r; + q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i; + ct[i__3].r = q__1.r, ct[i__3].i = q__1.i; + i__3 = i__ + j * c_dim1; + g[i__] = ((r__1 = alpha->r, abs(r__1)) + (r__2 = r_imag(alpha), + abs(r__2))) * g[i__] + ((r__3 = beta->r, abs(r__3)) + ( + r__4 = r_imag(beta), abs(r__4))) * ((r__5 = c__[i__3].r, + abs(r__5)) + (r__6 = r_imag(&c__[i__ + j * c_dim1]), abs( + r__6))); +/* L200: */ + } + +/* Compute the error ratio for this result. */ + + *err = 0.f; + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__; + i__4 = i__ + j * cc_dim1; + q__2.r = ct[i__3].r - cc[i__4].r, q__2.i = ct[i__3].i - cc[i__4] + .i; + q__1.r = q__2.r, q__1.i = q__2.i; + erri = ((r__1 = q__1.r, abs(r__1)) + (r__2 = r_imag(&q__1), abs( + r__2))) / *eps; + if (g[i__] != 0.f) { + erri /= g[i__]; + } + *err = f2cmax(*err,erri); + if (*err * sqrt(*eps) >= 1.f) { + goto L230; + } +/* L210: */ + } + +/* L220: */ + } + +/* If the loop completes, all results are at least half accurate. */ + goto L250; + +/* Report fatal error. */ + +L230: + *fatal = TRUE_; + io___409.ciunit = *nout; + s_wsfe(&io___409); + e_wsfe(); + i__1 = *m; + for (i__ = 1; i__ <= i__1; ++i__) { + if (*mv) { + io___410.ciunit = *nout; + s_wsfe(&io___410); + do_fio(&c__1, (char *)&i__, (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&ct[i__], (ftnlen)sizeof(real)); + do_fio(&c__2, (char *)&cc[i__ + j * cc_dim1], (ftnlen)sizeof(real) + ); + e_wsfe(); + } else { + io___411.ciunit = *nout; + s_wsfe(&io___411); + do_fio(&c__1, (char *)&i__, (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&cc[i__ + j * cc_dim1], (ftnlen)sizeof(real) + ); + do_fio(&c__2, (char *)&ct[i__], (ftnlen)sizeof(real)); + e_wsfe(); + } +/* L240: */ + } + if (*n > 1) { + io___412.ciunit = *nout; + s_wsfe(&io___412); + do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer)); + e_wsfe(); + } + +L250: + return 0; + + +/* End of CMMCH. */ + +} /* cmmch_ */ + +logical lce_(complex *ri, complex *rj, integer *lr) +{ + /* System generated locals */ + integer i__1, i__2, i__3; + logical ret_val; + + /* Local variables */ + integer i__; + + +/* Tests if two arrays are identical. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --rj; + --ri; + + /* Function Body */ + i__1 = *lr; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + i__3 = i__; + if (ri[i__2].r != rj[i__3].r || ri[i__2].i != rj[i__3].i) { + goto L20; + } +/* L10: */ + } + ret_val = TRUE_; + goto L30; +L20: + ret_val = FALSE_; +L30: + return ret_val; + +/* End of LCE. */ + +} /* lce_ */ + +logical lceres_(char *type__, char *uplo, integer *m, integer *n, complex *aa, + complex *as, integer *lda) +{ + /* System generated locals */ + integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2, i__3, i__4; + logical ret_val; + + /* Local variables */ + integer ibeg, iend, i__, j; + logical upper; + + +/* Tests if selected elements in two arrays are equal. */ + +/* TYPE is 'ge' or 'he' or 'sy'. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + as_dim1 = *lda; + as_offset = 1 + as_dim1 * 1; + as -= as_offset; + aa_dim1 = *lda; + aa_offset = 1 + aa_dim1 * 1; + aa -= aa_offset; + + /* Function Body */ + upper = *(unsigned char *)uplo == 'U'; + if (s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *lda; + for (i__ = *m + 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * aa_dim1; + i__4 = i__ + j * as_dim1; + if (aa[i__3].r != as[i__4].r || aa[i__3].i != as[i__4].i) { + goto L70; + } +/* L10: */ + } +/* L20: */ + } + } else if (s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__, + "sy", (ftnlen)2, (ftnlen)2) == 0) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (upper) { + ibeg = 1; + iend = j; + } else { + ibeg = j; + iend = *n; + } + i__2 = ibeg - 1; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * aa_dim1; + i__4 = i__ + j * as_dim1; + if (aa[i__3].r != as[i__4].r || aa[i__3].i != as[i__4].i) { + goto L70; + } +/* L30: */ + } + i__2 = *lda; + for (i__ = iend + 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * aa_dim1; + i__4 = i__ + j * as_dim1; + if (aa[i__3].r != as[i__4].r || aa[i__3].i != as[i__4].i) { + goto L70; + } +/* L40: */ + } +/* L50: */ + } + } + +/* L60: */ + ret_val = TRUE_; + goto L80; +L70: + ret_val = FALSE_; +L80: + return ret_val; + +/* End of LCERES. */ + +} /* lceres_ */ + +/* Complex */ VOID cbeg_(complex * ret_val, logical *reset) +{ + /* System generated locals */ + real r__1, r__2; + complex q__1; + + /* Local variables */ + static integer i__, j, ic, mi, mj; + + +/* Generates complex numbers as pairs of random numbers uniformly */ +/* distributed between -0.5 and 0.5. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + if (*reset) { +/* Initialize local variables. */ + mi = 891; + mj = 457; + i__ = 7; + j = 7; + ic = 0; + *reset = FALSE_; + } + +/* The sequence of values of I or J is bounded between 1 and 999. */ +/* If initial I or J = 1,2,3,6,7 or 9, the period will be 50. */ +/* If initial I or J = 4 or 8, the period will be 25. */ +/* If initial I or J = 5, the period will be 10. */ +/* IC is used to break up the period by skipping 1 value of I or J */ +/* in 6. */ + + ++ic; +L10: + i__ *= mi; + j *= mj; + i__ -= i__ / 1000 * 1000; + j -= j / 1000 * 1000; + if (ic >= 5) { + ic = 0; + goto L10; + } + r__1 = (i__ - 500) / 1001.f; + r__2 = (j - 500) / 1001.f; + q__1.r = r__1, q__1.i = r__2; + ret_val->r = q__1.r, ret_val->i = q__1.i; + return ; + +/* End of CBEG. */ + +} /* cbeg_ */ + +real sdiff_(real *x, real *y) +{ + /* System generated locals */ + real ret_val; + + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + ret_val = *x - *y; + return ret_val; + +/* End of SDIFF. */ + +} /* sdiff_ */ + +/* Main program alias */ int cblat3_ () { MAIN__ (); return 0; } diff --git a/ctest/c_zblat3c_3m.c b/ctest/c_zblat3c_3m.c new file mode 100644 index 000000000..059daccb5 --- /dev/null +++ b/ctest/c_zblat3c_3m.c @@ -0,0 +1,4897 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i= 0; + if (trace) { + o__1.oerr = 0; + o__1.ounit = ntra; + o__1.ofnmlen = 32; + o__1.ofnm = snaps; + o__1.orl = 0; + o__1.osta = "NEW"; + o__1.oacc = 0; + o__1.ofm = 0; + o__1.oblnk = 0; + f_open(&o__1); + } +/* Read the flag that directs rewinding of the snapshot file. */ + s_rsle(&io___7); + do_lio(&c__8, &c__1, (char *)&rewi, (ftnlen)sizeof(logical)); + e_rsle(); + rewi = rewi && trace; +/* Read the flag that directs stopping on any failure. */ + s_rsle(&io___9); + do_lio(&c__8, &c__1, (char *)&sfatal, (ftnlen)sizeof(logical)); + e_rsle(); +/* Read the flag that indicates whether error exits are to be tested. */ + s_rsle(&io___11); + do_lio(&c__8, &c__1, (char *)&tsterr, (ftnlen)sizeof(logical)); + e_rsle(); +/* Read the flag that indicates whether row-major data layout to be tested. */ + s_rsle(&io___13); + do_lio(&c__3, &c__1, (char *)&layout, (ftnlen)sizeof(integer)); + e_rsle(); +/* Read the threshold value of the test ratio */ + s_rsle(&io___15); + do_lio(&c__5, &c__1, (char *)&thresh, (ftnlen)sizeof(doublereal)); + e_rsle(); + +/* Read and check the parameter values for the tests. */ + +/* Values of N */ + s_rsle(&io___17); + do_lio(&c__3, &c__1, (char *)&nidim, (ftnlen)sizeof(integer)); + e_rsle(); + if (nidim < 1 || nidim > 9) { + s_wsfe(&io___19); + do_fio(&c__1, "N", (ftnlen)1); + do_fio(&c__1, (char *)&c__9, (ftnlen)sizeof(integer)); + e_wsfe(); + goto L220; + } + s_rsle(&io___20); + i__1 = nidim; + for (i__ = 1; i__ <= i__1; ++i__) { + do_lio(&c__3, &c__1, (char *)&idim[i__ - 1], (ftnlen)sizeof(integer)); + } + e_rsle(); + i__1 = nidim; + for (i__ = 1; i__ <= i__1; ++i__) { + if (idim[i__ - 1] < 0 || idim[i__ - 1] > 65) { + s_wsfe(&io___23); + do_fio(&c__1, (char *)&c__65, (ftnlen)sizeof(integer)); + e_wsfe(); + goto L220; + } +/* L10: */ + } +/* Values of ALPHA */ + s_rsle(&io___24); + do_lio(&c__3, &c__1, (char *)&nalf, (ftnlen)sizeof(integer)); + e_rsle(); + if (nalf < 1 || nalf > 7) { + s_wsfe(&io___26); + do_fio(&c__1, "ALPHA", (ftnlen)5); + do_fio(&c__1, (char *)&c__7, (ftnlen)sizeof(integer)); + e_wsfe(); + goto L220; + } + s_rsle(&io___27); + i__1 = nalf; + for (i__ = 1; i__ <= i__1; ++i__) { + do_lio(&c__7, &c__1, (char *)&alf[i__ - 1], (ftnlen)sizeof( + doublecomplex)); + } + e_rsle(); +/* Values of BETA */ + s_rsle(&io___29); + do_lio(&c__3, &c__1, (char *)&nbet, (ftnlen)sizeof(integer)); + e_rsle(); + if (nbet < 1 || nbet > 7) { + s_wsfe(&io___31); + do_fio(&c__1, "BETA", (ftnlen)4); + do_fio(&c__1, (char *)&c__7, (ftnlen)sizeof(integer)); + e_wsfe(); + goto L220; + } + s_rsle(&io___32); + i__1 = nbet; + for (i__ = 1; i__ <= i__1; ++i__) { + do_lio(&c__7, &c__1, (char *)&bet[i__ - 1], (ftnlen)sizeof( + doublecomplex)); + } + e_rsle(); + +/* Report values of parameters. */ + + s_wsfe(&io___34); + e_wsfe(); + s_wsfe(&io___35); + i__1 = nidim; + for (i__ = 1; i__ <= i__1; ++i__) { + do_fio(&c__1, (char *)&idim[i__ - 1], (ftnlen)sizeof(integer)); + } + e_wsfe(); + s_wsfe(&io___36); + i__1 = nalf; + for (i__ = 1; i__ <= i__1; ++i__) { + do_fio(&c__2, (char *)&alf[i__ - 1], (ftnlen)sizeof(doublereal)); + } + e_wsfe(); + s_wsfe(&io___37); + i__1 = nbet; + for (i__ = 1; i__ <= i__1; ++i__) { + do_fio(&c__2, (char *)&bet[i__ - 1], (ftnlen)sizeof(doublereal)); + } + e_wsfe(); + if (! tsterr) { + s_wsle(&io___38); + e_wsle(); + s_wsfe(&io___39); + e_wsfe(); + } + s_wsle(&io___40); + e_wsle(); + s_wsfe(&io___41); + do_fio(&c__1, (char *)&thresh, (ftnlen)sizeof(doublereal)); + e_wsfe(); + s_wsle(&io___42); + e_wsle(); + rorder = FALSE_; + corder = FALSE_; + if (layout == 2) { + rorder = TRUE_; + corder = TRUE_; + s_wsfe(&io___45); + e_wsfe(); + } else if (layout == 1) { + rorder = TRUE_; + s_wsfe(&io___46); + e_wsfe(); + } else if (layout == 0) { + corder = TRUE_; + s_wsfe(&io___47); + e_wsfe(); + } + s_wsle(&io___48); + e_wsle(); + +/* Read names of subroutines and flags which indicate */ +/* whether they are to be tested. */ + + for (i__ = 1; i__ <= 9; ++i__) { + ltest[i__ - 1] = FALSE_; +/* L20: */ + } +L30: + i__1 = s_rsfe(&io___50); + if (i__1 != 0) { + goto L60; + } + i__1 = do_fio(&c__1, snamet, (ftnlen)13); + if (i__1 != 0) { + goto L60; + } + i__1 = do_fio(&c__1, (char *)<estt, (ftnlen)sizeof(logical)); + if (i__1 != 0) { + goto L60; + } + i__1 = e_rsfe(); + if (i__1 != 0) { + goto L60; + } + for (i__ = 1; i__ <= 9; ++i__) { + if (s_cmp(snamet, snames + (i__ - 1) * 13, (ftnlen)13, (ftnlen)13) == + 0) { + goto L50; + } +/* L40: */ + } + s_wsfe(&io___53); + do_fio(&c__1, snamet, (ftnlen)13); + e_wsfe(); + s_stop("", (ftnlen)0); +L50: + ltest[i__ - 1] = ltestt; + goto L30; + +L60: + cl__1.cerr = 0; + cl__1.cunit = 5; + cl__1.csta = 0; + f_clos(&cl__1); + +/* Compute EPS (the machine precision). */ + + eps = 1.; +L70: + d__1 = eps + 1.; + if (ddiff_(&d__1, &c_b92) == 0.) { + goto L80; + } + eps *= .5; + goto L70; +L80: + eps += eps; + s_wsfe(&io___55); + do_fio(&c__1, (char *)&eps, (ftnlen)sizeof(doublereal)); + e_wsfe(); + +/* Check the reliability of ZMMCH using exact data. */ + + n = 32; + i__1 = n; + for (j = 1; j <= i__1; ++j) { + i__2 = n; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * 65 - 66; +/* Computing MAX */ + i__5 = i__ - j + 1; + i__4 = f2cmax(i__5,0); + ab[i__3].r = (doublereal) i__4, ab[i__3].i = 0.; +/* L90: */ + } + i__2 = j + 4224; + ab[i__2].r = (doublereal) j, ab[i__2].i = 0.; + i__2 = (j + 65) * 65 - 65; + ab[i__2].r = (doublereal) j, ab[i__2].i = 0.; + i__2 = j - 1; + c__[i__2].r = 0., c__[i__2].i = 0.; +/* L100: */ + } + i__1 = n; + for (j = 1; j <= i__1; ++j) { + i__2 = j - 1; + i__3 = j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 1) / 3; + cc[i__2].r = (doublereal) i__3, cc[i__2].i = 0.; +/* L110: */ + } +/* CC holds the exact result. On exit from ZMMCH CT holds */ +/* the result computed by ZMMCH. */ + *(unsigned char *)transa = 'N'; + *(unsigned char *)transb = 'N'; + zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + &c__6, &c_true); + same = lze_(cc, ct, &n); + if (! same || err != 0.) { + s_wsfe(&io___68); + do_fio(&c__1, transa, (ftnlen)1); + do_fio(&c__1, transb, (ftnlen)1); + do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical)); + do_fio(&c__1, (char *)&err, (ftnlen)sizeof(doublereal)); + e_wsfe(); + s_stop("", (ftnlen)0); + } + *(unsigned char *)transb = 'C'; + zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + &c__6, &c_true); + same = lze_(cc, ct, &n); + if (! same || err != 0.) { + s_wsfe(&io___69); + do_fio(&c__1, transa, (ftnlen)1); + do_fio(&c__1, transb, (ftnlen)1); + do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical)); + do_fio(&c__1, (char *)&err, (ftnlen)sizeof(doublereal)); + e_wsfe(); + s_stop("", (ftnlen)0); + } + i__1 = n; + for (j = 1; j <= i__1; ++j) { + i__2 = j + 4224; + i__3 = n - j + 1; + ab[i__2].r = (doublereal) i__3, ab[i__2].i = 0.; + i__2 = (j + 65) * 65 - 65; + i__3 = n - j + 1; + ab[i__2].r = (doublereal) i__3, ab[i__2].i = 0.; +/* L120: */ + } + i__1 = n; + for (j = 1; j <= i__1; ++j) { + i__2 = n - j; + i__3 = j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 1) / 3; + cc[i__2].r = (doublereal) i__3, cc[i__2].i = 0.; +/* L130: */ + } + *(unsigned char *)transa = 'C'; + *(unsigned char *)transb = 'N'; + zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + &c__6, &c_true); + same = lze_(cc, ct, &n); + if (! same || err != 0.) { + s_wsfe(&io___70); + do_fio(&c__1, transa, (ftnlen)1); + do_fio(&c__1, transb, (ftnlen)1); + do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical)); + do_fio(&c__1, (char *)&err, (ftnlen)sizeof(doublereal)); + e_wsfe(); + s_stop("", (ftnlen)0); + } + *(unsigned char *)transb = 'C'; + zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + &c__6, &c_true); + same = lze_(cc, ct, &n); + if (! same || err != 0.) { + s_wsfe(&io___71); + do_fio(&c__1, transa, (ftnlen)1); + do_fio(&c__1, transb, (ftnlen)1); + do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical)); + do_fio(&c__1, (char *)&err, (ftnlen)sizeof(doublereal)); + e_wsfe(); + s_stop("", (ftnlen)0); + } + +/* Test each subroutine in turn. */ + + for (isnum = 1; isnum <= 9; ++isnum) { + s_wsle(&io___73); + e_wsle(); + if (! ltest[isnum - 1]) { +/* Subprogram is not to be tested. */ + s_wsfe(&io___74); + do_fio(&c__1, snames + (isnum - 1) * 13, (ftnlen)13); + e_wsfe(); + } else { + s_copy(srnamc_1.srnamt, snames + (isnum - 1) * 13, (ftnlen)13, ( + ftnlen)13); +/* Test error exits. */ + if (tsterr) { + cz3chke_(snames + (isnum - 1) * 13); + s_wsle(&io___75); + e_wsle(); + } +/* Test computations. */ + infoc_1.infot = 0; + infoc_1.ok = TRUE_; + fatal = FALSE_; + switch (isnum) { + case 1: goto L140; + case 2: goto L150; + case 3: goto L150; + case 4: goto L160; + case 5: goto L160; + case 6: goto L170; + case 7: goto L170; + case 8: goto L180; + case 9: goto L180; + } +/* Test ZGEMM, 01. */ +L140: + if (corder) { + zchk1_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__0); + } + if (rorder) { + zchk1_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__1); + } + goto L190; +/* Test ZHEMM, 02, ZSYMM, 03. */ +L150: + if (corder) { + zchk2_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__0); + } + if (rorder) { + zchk2_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__1); + } + goto L190; +/* Test ZTRMM, 04, ZTRSM, 05. */ +L160: + if (corder) { + zchk3_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, & + c__0); + } + if (rorder) { + zchk3_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, & + c__1); + } + goto L190; +/* Test ZHERK, 06, ZSYRK, 07. */ +L170: + if (corder) { + zchk4_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__0); + } + if (rorder) { + zchk4_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__1); + } + goto L190; +/* Test ZHER2K, 08, ZSYR2K, 09. */ +L180: + if (corder) { + zchk5_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, + ct, g, w, &c__0); + } + if (rorder) { + zchk5_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, + ct, g, w, &c__1); + } + goto L190; + +L190: + if (fatal && sfatal) { + goto L210; + } + } +/* L200: */ + } + s_wsfe(&io___82); + e_wsfe(); + goto L230; + +L210: + s_wsfe(&io___83); + e_wsfe(); + goto L230; + +L220: + s_wsfe(&io___84); + e_wsfe(); + +L230: + if (trace) { + cl__1.cerr = 0; + cl__1.cunit = ntra; + cl__1.csta = 0; + f_clos(&cl__1); + } + cl__1.cerr = 0; + cl__1.cunit = 6; + cl__1.csta = 0; + f_clos(&cl__1); + s_stop("", (ftnlen)0); + + +/* End of ZBLAT3. */ + + return 0; +} /* MAIN__ */ + +/* Subroutine */ int zchk1_(char *sname, doublereal *eps, doublereal *thresh, + integer *nout, integer *ntra, logical *trace, logical *rewi, logical * + fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * + alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex * + a, doublecomplex *aa, doublecomplex *as, doublecomplex *b, + doublecomplex *bb, doublecomplex *bs, doublecomplex *c__, + doublecomplex *cc, doublecomplex *cs, doublecomplex *ct, doublereal * + g, integer *iorder) +{ + /* Initialized data */ + + static char ich[3] = "NTC"; + + /* Format strings */ + static char fmt_9994[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " + "TAKEN ON VALID\002,\002 CALL *******\002)"; + static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" + " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; + static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" + "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " + " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" + "BER:\002)"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4, i__5, i__6, i__7, i__8; + alist al__1; + + /* Local variables */ + extern /* Subroutine */ int czgemm3m_(integer *, char *, char *, integer * + , integer *, integer *, doublecomplex *, doublecomplex *, integer + *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *); + doublecomplex beta; + integer ldas, ldbs, ldcs; + logical same, null; + integer i__, k, m, n; + doublecomplex alpha; + logical isame[13], trana, tranb; + extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, + integer *, doublecomplex *, integer *, doublecomplex *, integer *, + logical *, doublecomplex *); + integer nargs; + extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *, doublecomplex *, doublereal *, doublecomplex *, + integer *, doublereal *, doublereal *, logical *, integer *, + logical *); + logical reset; + integer ia, ib; + extern /* Subroutine */ int zprcn1_(integer *, integer *, char *, integer + *, char *, char *, integer *, integer *, integer *, doublecomplex + *, integer *, integer *, doublecomplex *, integer *); + integer ma, mb, na, nb, nc, ik, im, in, ks, ms, ns; + char tranas[1], tranbs[1], transa[1], transb[1]; + doublereal errmax; + extern logical lzeres_(char *, char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *); + integer ica, icb, laa, lbb, lda, lcc, ldb, ldc; + doublecomplex als, bls; + doublereal err; + extern logical lze_(doublecomplex *, doublecomplex *, integer *); + + /* Fortran I/O blocks */ + static cilist io___128 = { 0, 0, 0, fmt_9994, 0 }; + static cilist io___131 = { 0, 0, 0, fmt_9998, 0 }; + static cilist io___133 = { 0, 0, 0, fmt_10000, 0 }; + static cilist io___134 = { 0, 0, 0, fmt_10001, 0 }; + static cilist io___135 = { 0, 0, 0, fmt_10002, 0 }; + static cilist io___136 = { 0, 0, 0, fmt_10003, 0 }; + static cilist io___137 = { 0, 0, 0, fmt_9996, 0 }; + + + +/* Tests ZGEMM. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --idim; + --alf; + --bet; + --g; + --ct; + --cs; + --cc; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --bs; + --bb; + b_dim1 = *nmax; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --as; + --aa; + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + + /* Function Body */ + + nargs = 13; + nc = 0; + reset = TRUE_; + errmax = 0.; + + i__1 = *nidim; + for (im = 1; im <= i__1; ++im) { + m = idim[im]; + + i__2 = *nidim; + for (in = 1; in <= i__2; ++in) { + n = idim[in]; +/* Set LDC to 1 more than minimum value if room. */ + ldc = m; + if (ldc < *nmax) { + ++ldc; + } +/* Skip tests if not enough room. */ + if (ldc > *nmax) { + goto L100; + } + lcc = ldc * n; + null = n <= 0 || m <= 0; + + i__3 = *nidim; + for (ik = 1; ik <= i__3; ++ik) { + k = idim[ik]; + + for (ica = 1; ica <= 3; ++ica) { + *(unsigned char *)transa = *(unsigned char *)&ich[ica - 1] + ; + trana = *(unsigned char *)transa == 'T' || *(unsigned + char *)transa == 'C'; + + if (trana) { + ma = k; + na = m; + } else { + ma = m; + na = k; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = ma; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L80; + } + laa = lda * na; + +/* Generate the matrix A. */ + + zmake_("ge", " ", " ", &ma, &na, &a[a_offset], nmax, &aa[ + 1], &lda, &reset, &c_b1); + + for (icb = 1; icb <= 3; ++icb) { + *(unsigned char *)transb = *(unsigned char *)&ich[icb + - 1]; + tranb = *(unsigned char *)transb == 'T' || *(unsigned + char *)transb == 'C'; + + if (tranb) { + mb = n; + nb = k; + } else { + mb = k; + nb = n; + } +/* Set LDB to 1 more than minimum value if room. */ + ldb = mb; + if (ldb < *nmax) { + ++ldb; + } +/* Skip tests if not enough room. */ + if (ldb > *nmax) { + goto L70; + } + lbb = ldb * nb; + +/* Generate the matrix B. */ + + zmake_("ge", " ", " ", &mb, &nb, &b[b_offset], nmax, & + bb[1], &ldb, &reset, &c_b1); + + i__4 = *nalf; + for (ia = 1; ia <= i__4; ++ia) { + i__5 = ia; + alpha.r = alf[i__5].r, alpha.i = alf[i__5].i; + + i__5 = *nbet; + for (ib = 1; ib <= i__5; ++ib) { + i__6 = ib; + beta.r = bet[i__6].r, beta.i = bet[i__6].i; + +/* Generate the matrix C. */ + + zmake_("ge", " ", " ", &m, &n, &c__[c_offset], + nmax, &cc[1], &ldc, &reset, &c_b1); + + ++nc; + +/* Save every datum before calling the */ +/* subroutine. */ + + *(unsigned char *)tranas = *(unsigned char *) + transa; + *(unsigned char *)tranbs = *(unsigned char *) + transb; + ms = m; + ns = n; + ks = k; + als.r = alpha.r, als.i = alpha.i; + i__6 = laa; + for (i__ = 1; i__ <= i__6; ++i__) { + i__7 = i__; + i__8 = i__; + as[i__7].r = aa[i__8].r, as[i__7].i = aa[ + i__8].i; +/* L10: */ + } + ldas = lda; + i__6 = lbb; + for (i__ = 1; i__ <= i__6; ++i__) { + i__7 = i__; + i__8 = i__; + bs[i__7].r = bb[i__8].r, bs[i__7].i = bb[ + i__8].i; +/* L20: */ + } + ldbs = ldb; + bls.r = beta.r, bls.i = beta.i; + i__6 = lcc; + for (i__ = 1; i__ <= i__6; ++i__) { + i__7 = i__; + i__8 = i__; + cs[i__7].r = cc[i__8].r, cs[i__7].i = cc[ + i__8].i; +/* L30: */ + } + ldcs = ldc; + +/* Call the subroutine. */ + + if (*trace) { + zprcn1_(ntra, &nc, sname, iorder, transa, + transb, &m, &n, &k, &alpha, &lda, + &ldb, &beta, &ldc); + } + if (*rewi) { + al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1); + } + czgemm3m_(iorder, transa, transb, &m, &n, &k, + &alpha, &aa[1], &lda, &bb[1], &ldb, & + beta, &cc[1], &ldc); + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + io___128.ciunit = *nout; + s_wsfe(&io___128); + e_wsfe(); + *fatal = TRUE_; + goto L120; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)transa == *( + unsigned char *)tranas; + isame[1] = *(unsigned char *)transb == *( + unsigned char *)tranbs; + isame[2] = ms == m; + isame[3] = ns == n; + isame[4] = ks == k; + isame[5] = als.r == alpha.r && als.i == + alpha.i; + isame[6] = lze_(&as[1], &aa[1], &laa); + isame[7] = ldas == lda; + isame[8] = lze_(&bs[1], &bb[1], &lbb); + isame[9] = ldbs == ldb; + isame[10] = bls.r == beta.r && bls.i == + beta.i; + if (null) { + isame[11] = lze_(&cs[1], &cc[1], &lcc); + } else { + isame[11] = lzeres_("ge", " ", &m, &n, & + cs[1], &cc[1], &ldc); + } + isame[12] = ldcs == ldc; + +/* If data was incorrectly changed, report */ +/* and return. */ + + same = TRUE_; + i__6 = nargs; + for (i__ = 1; i__ <= i__6; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + io___131.ciunit = *nout; + s_wsfe(&io___131); + do_fio(&c__1, (char *)&i__, (ftnlen) + sizeof(integer)); + e_wsfe(); + } +/* L40: */ + } + if (! same) { + *fatal = TRUE_; + goto L120; + } + + if (! null) { + +/* Check the result. */ + + zmmch_(transa, transb, &m, &n, &k, &alpha, + &a[a_offset], nmax, &b[b_offset], + nmax, &beta, &c__[c_offset], + nmax, &ct[1], &g[1], &cc[1], &ldc, + eps, &err, fatal, nout, &c_true); + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L120; + } + } + +/* L50: */ + } + +/* L60: */ + } + +L70: + ; + } + +L80: + ; + } + +/* L90: */ + } + +L100: + ; + } + +/* L110: */ + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + io___133.ciunit = *nout; + s_wsfe(&io___133); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + if (*iorder == 1) { + io___134.ciunit = *nout; + s_wsfe(&io___134); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + } else { + if (*iorder == 0) { + io___135.ciunit = *nout; + s_wsfe(&io___135); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); + e_wsfe(); + } + if (*iorder == 1) { + io___136.ciunit = *nout; + s_wsfe(&io___136); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); + e_wsfe(); + } + } + goto L130; + +L120: + io___137.ciunit = *nout; + s_wsfe(&io___137); + do_fio(&c__1, sname, (ftnlen)13); + e_wsfe(); + zprcn1_(nout, &nc, sname, iorder, transa, transb, &m, &n, &k, &alpha, & + lda, &ldb, &beta, &ldc); + +L130: + return 0; + +/* L9995: */ + +/* End of ZCHK1. */ + +} /* zchk1_ */ + + +/* Subroutine */ int zprcn1_(integer *nout, integer *nc, char *sname, integer + *iorder, char *transa, char *transb, integer *m, integer *n, integer * + k, doublecomplex *alpha, integer *lda, integer *ldb, doublecomplex * + beta, integer *ldc) +{ + /* Format strings */ + static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a15,\002,\002," + "a14,\002,\002,a14,\002,\002)"; + static char fmt_9994[] = "(10x,3(i3,\002,\002),\002 (\002,f4.1,\002,\002" + ",f4.1,\002) , A,\002,i3,\002, B,\002,i3,\002, (\002,f4.1,\002" + ",\002,f4.1,\002) , C,\002,i3,\002).\002)"; + + /* Local variables */ + char crc[14], cta[14], ctb[14]; + + /* Fortran I/O blocks */ + static cilist io___141 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___142 = { 0, 0, 0, fmt_9994, 0 }; + + + if (*(unsigned char *)transa == 'N') { + s_copy(cta, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(cta, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cta, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transb == 'N') { + s_copy(ctb, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transb == 'T') { + s_copy(ctb, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ctb, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + io___141.ciunit = *nout; + s_wsfe(&io___141); + do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, crc, (ftnlen)14); + do_fio(&c__1, cta, (ftnlen)14); + do_fio(&c__1, ctb, (ftnlen)14); + e_wsfe(); + io___142.ciunit = *nout; + s_wsfe(&io___142); + do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal)); + do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(doublereal)); + do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); + e_wsfe(); + return 0; +} /* zprcn1_ */ + + +/* Subroutine */ int zchk2_(char *sname, doublereal *eps, doublereal *thresh, + integer *nout, integer *ntra, logical *trace, logical *rewi, logical * + fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * + alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex * + a, doublecomplex *aa, doublecomplex *as, doublecomplex *b, + doublecomplex *bb, doublecomplex *bs, doublecomplex *c__, + doublecomplex *cc, doublecomplex *cs, doublecomplex *ct, doublereal * + g, integer *iorder) +{ + /* Initialized data */ + + static char ichs[2] = "LR"; + static char ichu[2] = "UL"; + + /* Format strings */ + static char fmt_9994[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " + "TAKEN ON VALID\002,\002 CALL *******\002)"; + static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" + " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; + static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" + "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " + " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" + "BER:\002)"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4, i__5, i__6, i__7; + alist al__1; + + /* Local variables */ + doublecomplex beta; + integer ldas, ldbs, ldcs; + logical same; + char side[1]; + logical conj, left, null; + char uplo[1]; + integer i__, m, n; + doublecomplex alpha; + logical isame[13]; + char sides[1]; + extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, + integer *, doublecomplex *, integer *, doublecomplex *, integer *, + logical *, doublecomplex *); + integer nargs; + extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *, doublecomplex *, doublereal *, doublecomplex *, + integer *, doublereal *, doublereal *, logical *, integer *, + logical *); + logical reset; + char uplos[1]; + integer ia, ib; + extern /* Subroutine */ int zprcn2_(integer *, integer *, char *, integer + *, char *, char *, integer *, integer *, doublecomplex *, integer + *, integer *, doublecomplex *, integer *); + integer na, nc, im, in, ms, ns; + extern /* Subroutine */ int czhemm_(integer *, char *, char *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *); + doublereal errmax; + extern logical lzeres_(char *, char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *); + extern /* Subroutine */ int czsymm_(integer *, char *, char *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *); + integer laa, lbb, lda, lcc, ldb, ldc, ics; + doublecomplex als, bls; + integer icu; + doublereal err; + extern logical lze_(doublecomplex *, doublecomplex *, integer *); + + /* Fortran I/O blocks */ + static cilist io___181 = { 0, 0, 0, fmt_9994, 0 }; + static cilist io___184 = { 0, 0, 0, fmt_9998, 0 }; + static cilist io___186 = { 0, 0, 0, fmt_10000, 0 }; + static cilist io___187 = { 0, 0, 0, fmt_10001, 0 }; + static cilist io___188 = { 0, 0, 0, fmt_10002, 0 }; + static cilist io___189 = { 0, 0, 0, fmt_10003, 0 }; + static cilist io___190 = { 0, 0, 0, fmt_9996, 0 }; + + + +/* Tests ZHEMM and ZSYMM. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --idim; + --alf; + --bet; + --g; + --ct; + --cs; + --cc; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --bs; + --bb; + b_dim1 = *nmax; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --as; + --aa; + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + + /* Function Body */ + conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; + + nargs = 12; + nc = 0; + reset = TRUE_; + errmax = 0.; + + i__1 = *nidim; + for (im = 1; im <= i__1; ++im) { + m = idim[im]; + + i__2 = *nidim; + for (in = 1; in <= i__2; ++in) { + n = idim[in]; +/* Set LDC to 1 more than minimum value if room. */ + ldc = m; + if (ldc < *nmax) { + ++ldc; + } +/* Skip tests if not enough room. */ + if (ldc > *nmax) { + goto L90; + } + lcc = ldc * n; + null = n <= 0 || m <= 0; +/* Set LDB to 1 more than minimum value if room. */ + ldb = m; + if (ldb < *nmax) { + ++ldb; + } +/* Skip tests if not enough room. */ + if (ldb > *nmax) { + goto L90; + } + lbb = ldb * n; + +/* Generate the matrix B. */ + + zmake_("ge", " ", " ", &m, &n, &b[b_offset], nmax, &bb[1], &ldb, & + reset, &c_b1); + + for (ics = 1; ics <= 2; ++ics) { + *(unsigned char *)side = *(unsigned char *)&ichs[ics - 1]; + left = *(unsigned char *)side == 'L'; + + if (left) { + na = m; + } else { + na = n; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = na; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L80; + } + laa = lda * na; + + for (icu = 1; icu <= 2; ++icu) { + *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1]; + +/* Generate the hermitian or symmetric matrix A. */ + + zmake_(sname + 7, uplo, " ", &na, &na, &a[a_offset], nmax, + &aa[1], &lda, &reset, &c_b1); + + i__3 = *nalf; + for (ia = 1; ia <= i__3; ++ia) { + i__4 = ia; + alpha.r = alf[i__4].r, alpha.i = alf[i__4].i; + + i__4 = *nbet; + for (ib = 1; ib <= i__4; ++ib) { + i__5 = ib; + beta.r = bet[i__5].r, beta.i = bet[i__5].i; + +/* Generate the matrix C. */ + + zmake_("ge", " ", " ", &m, &n, &c__[c_offset], + nmax, &cc[1], &ldc, &reset, &c_b1); + + ++nc; + +/* Save every datum before calling the */ +/* subroutine. */ + + *(unsigned char *)sides = *(unsigned char *)side; + *(unsigned char *)uplos = *(unsigned char *)uplo; + ms = m; + ns = n; + als.r = alpha.r, als.i = alpha.i; + i__5 = laa; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7] + .i; +/* L10: */ + } + ldas = lda; + i__5 = lbb; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + bs[i__6].r = bb[i__7].r, bs[i__6].i = bb[i__7] + .i; +/* L20: */ + } + ldbs = ldb; + bls.r = beta.r, bls.i = beta.i; + i__5 = lcc; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7] + .i; +/* L30: */ + } + ldcs = ldc; + +/* Call the subroutine. */ + + if (*trace) { + zprcn2_(ntra, &nc, sname, iorder, side, uplo, + &m, &n, &alpha, &lda, &ldb, &beta, & + ldc) + ; + } + if (*rewi) { + al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1); + } + if (conj) { + czhemm_(iorder, side, uplo, &m, &n, &alpha, & + aa[1], &lda, &bb[1], &ldb, &beta, &cc[ + 1], &ldc); + } else { + czsymm_(iorder, side, uplo, &m, &n, &alpha, & + aa[1], &lda, &bb[1], &ldb, &beta, &cc[ + 1], &ldc); + } + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + io___181.ciunit = *nout; + s_wsfe(&io___181); + e_wsfe(); + *fatal = TRUE_; + goto L110; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)sides == *(unsigned + char *)side; + isame[1] = *(unsigned char *)uplos == *(unsigned + char *)uplo; + isame[2] = ms == m; + isame[3] = ns == n; + isame[4] = als.r == alpha.r && als.i == alpha.i; + isame[5] = lze_(&as[1], &aa[1], &laa); + isame[6] = ldas == lda; + isame[7] = lze_(&bs[1], &bb[1], &lbb); + isame[8] = ldbs == ldb; + isame[9] = bls.r == beta.r && bls.i == beta.i; + if (null) { + isame[10] = lze_(&cs[1], &cc[1], &lcc); + } else { + isame[10] = lzeres_("ge", " ", &m, &n, &cs[1], + &cc[1], &ldc); + } + isame[11] = ldcs == ldc; + +/* If data was incorrectly changed, report and */ +/* return. */ + + same = TRUE_; + i__5 = nargs; + for (i__ = 1; i__ <= i__5; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + io___184.ciunit = *nout; + s_wsfe(&io___184); + do_fio(&c__1, (char *)&i__, (ftnlen) + sizeof(integer)); + e_wsfe(); + } +/* L40: */ + } + if (! same) { + *fatal = TRUE_; + goto L110; + } + + if (! null) { + +/* Check the result. */ + + if (left) { + zmmch_("N", "N", &m, &n, &m, &alpha, &a[ + a_offset], nmax, &b[b_offset], + nmax, &beta, &c__[c_offset], nmax, + &ct[1], &g[1], &cc[1], &ldc, eps, + &err, fatal, nout, &c_true); + } else { + zmmch_("N", "N", &m, &n, &n, &alpha, &b[ + b_offset], nmax, &a[a_offset], + nmax, &beta, &c__[c_offset], nmax, + &ct[1], &g[1], &cc[1], &ldc, eps, + &err, fatal, nout, &c_true); + } + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L110; + } + } + +/* L50: */ + } + +/* L60: */ + } + +/* L70: */ + } + +L80: + ; + } + +L90: + ; + } + +/* L100: */ + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + io___186.ciunit = *nout; + s_wsfe(&io___186); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + if (*iorder == 1) { + io___187.ciunit = *nout; + s_wsfe(&io___187); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + } else { + if (*iorder == 0) { + io___188.ciunit = *nout; + s_wsfe(&io___188); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); + e_wsfe(); + } + if (*iorder == 1) { + io___189.ciunit = *nout; + s_wsfe(&io___189); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); + e_wsfe(); + } + } + goto L120; + +L110: + io___190.ciunit = *nout; + s_wsfe(&io___190); + do_fio(&c__1, sname, (ftnlen)13); + e_wsfe(); + zprcn2_(nout, &nc, sname, iorder, side, uplo, &m, &n, &alpha, &lda, &ldb, + &beta, &ldc); + +L120: + return 0; + +/* L9995: */ + +/* End of ZCHK2. */ + +} /* zchk2_ */ + + +/* Subroutine */ int zprcn2_(integer *nout, integer *nc, char *sname, integer + *iorder, char *side, char *uplo, integer *m, integer *n, + doublecomplex *alpha, integer *lda, integer *ldb, doublecomplex *beta, + integer *ldc) +{ + /* Format strings */ + static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a15,\002,\002," + "a14,\002,\002,a14,\002,\002)"; + static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002" + ",f4.1,\002), A,\002,i3,\002, B,\002,i3,\002, (\002,f4.1,\002," + "\002,f4.1,\002), \002,\002C,\002,i3,\002).\002)"; + + /* Local variables */ + char cs[14], cu[14], crc[14]; + + /* Fortran I/O blocks */ + static cilist io___194 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___195 = { 0, 0, 0, fmt_9994, 0 }; + + + if (*(unsigned char *)side == 'L') { + s_copy(cs, " CblasLeft", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cs, " CblasRight", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + io___194.ciunit = *nout; + s_wsfe(&io___194); + do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, crc, (ftnlen)14); + do_fio(&c__1, cs, (ftnlen)14); + do_fio(&c__1, cu, (ftnlen)14); + e_wsfe(); + io___195.ciunit = *nout; + s_wsfe(&io___195); + do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal)); + do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(doublereal)); + do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); + e_wsfe(); + return 0; +} /* zprcn2_ */ + + +/* Subroutine */ int zchk3_(char *sname, doublereal *eps, doublereal *thresh, + integer *nout, integer *ntra, logical *trace, logical *rewi, logical * + fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * + alf, integer *nmax, doublecomplex *a, doublecomplex *aa, + doublecomplex *as, doublecomplex *b, doublecomplex *bb, doublecomplex + *bs, doublecomplex *ct, doublereal *g, doublecomplex *c__, integer * + iorder) +{ + /* Initialized data */ + + static char ichu[2] = "UL"; + static char icht[3] = "NTC"; + static char ichd[2] = "UN"; + static char ichs[2] = "LR"; + + /* Format strings */ + static char fmt_9994[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " + "TAKEN ON VALID\002,\002 CALL *******\002)"; + static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" + " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; + static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" + "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " + " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" + "BER:\002)"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4, i__5, i__6, i__7; + doublecomplex z__1; + alist al__1; + + /* Local variables */ + char diag[1]; + integer ldas, ldbs; + logical same; + char side[1]; + logical left, null; + char uplo[1]; + integer i__, j, m, n; + doublecomplex alpha; + char diags[1]; + logical isame[13]; + char sides[1]; + extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, + integer *, doublecomplex *, integer *, doublecomplex *, integer *, + logical *, doublecomplex *); + integer nargs; + extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *, doublecomplex *, doublereal *, doublecomplex *, + integer *, doublereal *, doublereal *, logical *, integer *, + logical *); + logical reset; + char uplos[1]; + integer ia, na; + extern /* Subroutine */ int zprcn3_(integer *, integer *, char *, integer + *, char *, char *, char *, char *, integer *, integer *, + doublecomplex *, integer *, integer *); + integer nc, im, in, ms, ns; + char tranas[1], transa[1]; + doublereal errmax; + extern logical lzeres_(char *, char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *); + extern /* Subroutine */ int cztrmm_(integer *, char *, char *, char *, + char *, integer *, integer *, doublecomplex *, doublecomplex *, + integer *, doublecomplex *, integer *), cztrsm_(integer *, char *, char *, char *, char *, + integer *, integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *); + integer laa, icd, lbb, lda, ldb, ics; + doublecomplex als; + integer ict, icu; + doublereal err; + extern logical lze_(doublecomplex *, doublecomplex *, integer *); + + /* Fortran I/O blocks */ + static cilist io___236 = { 0, 0, 0, fmt_9994, 0 }; + static cilist io___239 = { 0, 0, 0, fmt_9998, 0 }; + static cilist io___241 = { 0, 0, 0, fmt_10000, 0 }; + static cilist io___242 = { 0, 0, 0, fmt_10001, 0 }; + static cilist io___243 = { 0, 0, 0, fmt_10002, 0 }; + static cilist io___244 = { 0, 0, 0, fmt_10003, 0 }; + static cilist io___245 = { 0, 0, 0, fmt_9996, 0 }; + + + +/* Tests ZTRMM and ZTRSM. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --idim; + --alf; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --g; + --ct; + --bs; + --bb; + b_dim1 = *nmax; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --as; + --aa; + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + + /* Function Body */ + + nargs = 11; + nc = 0; + reset = TRUE_; + errmax = 0.; +/* Set up zero matrix for ZMMCH. */ + i__1 = *nmax; + for (j = 1; j <= i__1; ++j) { + i__2 = *nmax; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * c_dim1; + c__[i__3].r = 0., c__[i__3].i = 0.; +/* L10: */ + } +/* L20: */ + } + + i__1 = *nidim; + for (im = 1; im <= i__1; ++im) { + m = idim[im]; + + i__2 = *nidim; + for (in = 1; in <= i__2; ++in) { + n = idim[in]; +/* Set LDB to 1 more than minimum value if room. */ + ldb = m; + if (ldb < *nmax) { + ++ldb; + } +/* Skip tests if not enough room. */ + if (ldb > *nmax) { + goto L130; + } + lbb = ldb * n; + null = m <= 0 || n <= 0; + + for (ics = 1; ics <= 2; ++ics) { + *(unsigned char *)side = *(unsigned char *)&ichs[ics - 1]; + left = *(unsigned char *)side == 'L'; + if (left) { + na = m; + } else { + na = n; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = na; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L130; + } + laa = lda * na; + + for (icu = 1; icu <= 2; ++icu) { + *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1]; + + for (ict = 1; ict <= 3; ++ict) { + *(unsigned char *)transa = *(unsigned char *)&icht[ + ict - 1]; + + for (icd = 1; icd <= 2; ++icd) { + *(unsigned char *)diag = *(unsigned char *)&ichd[ + icd - 1]; + + i__3 = *nalf; + for (ia = 1; ia <= i__3; ++ia) { + i__4 = ia; + alpha.r = alf[i__4].r, alpha.i = alf[i__4].i; + +/* Generate the matrix A. */ + + zmake_("tr", uplo, diag, &na, &na, &a[ + a_offset], nmax, &aa[1], &lda, &reset, + &c_b1); + +/* Generate the matrix B. */ + + zmake_("ge", " ", " ", &m, &n, &b[b_offset], + nmax, &bb[1], &ldb, &reset, &c_b1); + + ++nc; + +/* Save every datum before calling the */ +/* subroutine. */ + + *(unsigned char *)sides = *(unsigned char *) + side; + *(unsigned char *)uplos = *(unsigned char *) + uplo; + *(unsigned char *)tranas = *(unsigned char *) + transa; + *(unsigned char *)diags = *(unsigned char *) + diag; + ms = m; + ns = n; + als.r = alpha.r, als.i = alpha.i; + i__4 = laa; + for (i__ = 1; i__ <= i__4; ++i__) { + i__5 = i__; + i__6 = i__; + as[i__5].r = aa[i__6].r, as[i__5].i = aa[ + i__6].i; +/* L30: */ + } + ldas = lda; + i__4 = lbb; + for (i__ = 1; i__ <= i__4; ++i__) { + i__5 = i__; + i__6 = i__; + bs[i__5].r = bb[i__6].r, bs[i__5].i = bb[ + i__6].i; +/* L40: */ + } + ldbs = ldb; + +/* Call the subroutine. */ + + if (s_cmp(sname + 9, "mm", (ftnlen)2, (ftnlen) + 2) == 0) { + if (*trace) { + zprcn3_(ntra, &nc, sname, iorder, + side, uplo, transa, diag, &m, + &n, &alpha, &lda, &ldb, ( + ftnlen)13, (ftnlen)1, (ftnlen) + 1, (ftnlen)1, (ftnlen)1); + } + if (*rewi) { + al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1); + } + cztrmm_(iorder, side, uplo, transa, diag, + &m, &n, &alpha, &aa[1], &lda, &bb[ + 1], &ldb); + } else if (s_cmp(sname + 9, "sm", (ftnlen)2, ( + ftnlen)2) == 0) { + if (*trace) { + zprcn3_(ntra, &nc, sname, iorder, + side, uplo, transa, diag, &m, + &n, &alpha, &lda, &ldb, ( + ftnlen)13, (ftnlen)1, (ftnlen) + 1, (ftnlen)1, (ftnlen)1); + } + if (*rewi) { + al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1); + } + cztrsm_(iorder, side, uplo, transa, diag, + &m, &n, &alpha, &aa[1], &lda, &bb[ + 1], &ldb); + } + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + io___236.ciunit = *nout; + s_wsfe(&io___236); + e_wsfe(); + *fatal = TRUE_; + goto L150; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)sides == *( + unsigned char *)side; + isame[1] = *(unsigned char *)uplos == *( + unsigned char *)uplo; + isame[2] = *(unsigned char *)tranas == *( + unsigned char *)transa; + isame[3] = *(unsigned char *)diags == *( + unsigned char *)diag; + isame[4] = ms == m; + isame[5] = ns == n; + isame[6] = als.r == alpha.r && als.i == + alpha.i; + isame[7] = lze_(&as[1], &aa[1], &laa); + isame[8] = ldas == lda; + if (null) { + isame[9] = lze_(&bs[1], &bb[1], &lbb); + } else { + isame[9] = lzeres_("ge", " ", &m, &n, &bs[ + 1], &bb[1], &ldb); + } + isame[10] = ldbs == ldb; + +/* If data was incorrectly changed, report and */ +/* return. */ + + same = TRUE_; + i__4 = nargs; + for (i__ = 1; i__ <= i__4; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + io___239.ciunit = *nout; + s_wsfe(&io___239); + do_fio(&c__1, (char *)&i__, (ftnlen) + sizeof(integer)); + e_wsfe(); + } +/* L50: */ + } + if (! same) { + *fatal = TRUE_; + goto L150; + } + + if (! null) { + if (s_cmp(sname + 9, "mm", (ftnlen)2, ( + ftnlen)2) == 0) { + +/* Check the result. */ + + if (left) { + zmmch_(transa, "N", &m, &n, &m, & + alpha, &a[a_offset], nmax, + &b[b_offset], nmax, & + c_b1, &c__[c_offset], + nmax, &ct[1], &g[1], &bb[ + 1], &ldb, eps, &err, + fatal, nout, &c_true, ( + ftnlen)1, (ftnlen)1); + } else { + zmmch_("N", transa, &m, &n, &n, & + alpha, &b[b_offset], nmax, + &a[a_offset], nmax, & + c_b1, &c__[c_offset], + nmax, &ct[1], &g[1], &bb[ + 1], &ldb, eps, &err, + fatal, nout, &c_true); + } + } else if (s_cmp(sname + 9, "sm", (ftnlen) + 2, (ftnlen)2) == 0) { + +/* Compute approximation to original */ +/* matrix. */ + + i__4 = n; + for (j = 1; j <= i__4; ++j) { + i__5 = m; + for (i__ = 1; i__ <= i__5; ++i__) + { + i__6 = i__ + j * c_dim1; + i__7 = i__ + (j - 1) * ldb; + c__[i__6].r = bb[i__7].r, c__[i__6].i = bb[i__7].i; + i__6 = i__ + (j - 1) * ldb; + i__7 = i__ + j * b_dim1; + z__1.r = alpha.r * b[i__7].r - alpha.i * b[i__7].i, + z__1.i = alpha.r * b[i__7].i + alpha.i * b[ + i__7].r; + bb[i__6].r = z__1.r, bb[i__6].i = z__1.i; +/* L60: */ + } +/* L70: */ + } + + if (left) { + zmmch_(transa, "N", &m, &n, &m, & + c_b2, &a[a_offset], nmax, + &c__[c_offset], nmax, & + c_b1, &b[b_offset], nmax, + &ct[1], &g[1], &bb[1], & + ldb, eps, &err, fatal, + nout, &c_false); + } else { + zmmch_("N", transa, &m, &n, &n, & + c_b2, &c__[c_offset], + nmax, &a[a_offset], nmax, + &c_b1, &b[b_offset], nmax, + &ct[1], &g[1], &bb[1], & + ldb, eps, &err, fatal, + nout, &c_false); + } + } + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L150; + } + } + +/* L80: */ + } + +/* L90: */ + } + +/* L100: */ + } + +/* L110: */ + } + +/* L120: */ + } + +L130: + ; + } + +/* L140: */ + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + io___241.ciunit = *nout; + s_wsfe(&io___241); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + if (*iorder == 1) { + io___242.ciunit = *nout; + s_wsfe(&io___242); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + } else { + if (*iorder == 0) { + io___243.ciunit = *nout; + s_wsfe(&io___243); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); + e_wsfe(); + } + if (*iorder == 1) { + io___244.ciunit = *nout; + s_wsfe(&io___244); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); + e_wsfe(); + } + } + goto L160; + +L150: + io___245.ciunit = *nout; + s_wsfe(&io___245); + do_fio(&c__1, sname, (ftnlen)13); + e_wsfe(); + if (*trace) { + zprcn3_(ntra, &nc, sname, iorder, side, uplo, transa, diag, &m, &n, & + alpha, &lda, &ldb); + } + +L160: + return 0; + +/* L9995: */ + +/* End of ZCHK3. */ + +} /* zchk3_ */ + + +/* Subroutine */ int zprcn3_(integer *nout, integer *nc, char *sname, integer + *iorder, char *side, char *uplo, char *transa, char *diag, integer *m, + integer *n, doublecomplex *alpha, integer *lda, integer *ldb) +{ + /* Format strings */ + static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a15,\002,\002," + "a14,\002,\002,a14,\002,\002)"; + static char fmt_9994[] = "(10x,2(a15,\002,\002),2(i3,\002,\002),\002 " + "(\002,f4.1,\002,\002,f4.1,\002), A,\002,i3,\002, B,\002,i3,\002)." + "\002)"; + + /* Local variables */ + char ca[14], cd[14], cs[14], cu[14], crc[14]; + + /* Fortran I/O blocks */ + static cilist io___251 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___252 = { 0, 0, 0, fmt_9994, 0 }; + + + if (*(unsigned char *)side == 'L') { + s_copy(cs, " CblasLeft", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cs, " CblasRight", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)diag == 'N') { + s_copy(cd, " CblasNonUnit", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cd, " CblasUnit", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + io___251.ciunit = *nout; + s_wsfe(&io___251); + do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, crc, (ftnlen)14); + do_fio(&c__1, cs, (ftnlen)14); + do_fio(&c__1, cu, (ftnlen)14); + e_wsfe(); + io___252.ciunit = *nout; + s_wsfe(&io___252); + do_fio(&c__1, ca, (ftnlen)14); + do_fio(&c__1, cd, (ftnlen)14); + do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal)); + do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); + e_wsfe(); + return 0; +} /* zprcn3_ */ + + +/* Subroutine */ int zchk4_(char *sname, doublereal *eps, doublereal *thresh, + integer *nout, integer *ntra, logical *trace, logical *rewi, logical * + fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * + alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex * + a, doublecomplex *aa, doublecomplex *as, doublecomplex *b, + doublecomplex *bb, doublecomplex *bs, doublecomplex *c__, + doublecomplex *cc, doublecomplex *cs, doublecomplex *ct, doublereal * + g, integer *iorder) +{ + /* Initialized data */ + + static char icht[2] = "NC"; + static char ichu[2] = "UL"; + + /* Format strings */ + static char fmt_9992[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " + "TAKEN ON VALID\002,\002 CALL *******\002)"; + static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" + " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; + static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" + "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " + " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_9995[] = "(\002 THESE ARE THE RESULTS FOR COLUMN" + " \002,i3)"; + static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" + "BER:\002)"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4, i__5, i__6, i__7; + doublecomplex z__1; + alist al__1; + + /* Local variables */ + doublecomplex beta; + integer ldas, ldcs; + logical same, conj; + doublecomplex bets; + doublereal rals; + logical tran, null; + char uplo[1]; + integer i__, j, k, n; + doublecomplex alpha; + doublereal rbeta; + logical isame[13]; + extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, + integer *, doublecomplex *, integer *, doublecomplex *, integer *, + logical *, doublecomplex *); + integer nargs; + extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *, doublecomplex *, doublereal *, doublecomplex *, + integer *, doublereal *, doublereal *, logical *, integer *, + logical *); + doublereal rbets; + logical reset; + char trans[1]; + logical upper; + char uplos[1]; + integer ia, ib, jc, ma, na; + extern /* Subroutine */ int zprcn4_(integer *, integer *, char *, integer + *, char *, char *, integer *, integer *, doublecomplex *, integer + *, doublecomplex *, integer *); + integer nc; + extern /* Subroutine */ int zprcn6_(integer *, integer *, char *, integer + *, char *, char *, integer *, integer *, doublereal *, integer *, + doublereal *, integer *); + integer ik, in, jj, lj, ks, ns; + doublereal ralpha; + extern /* Subroutine */ int czherk_(integer *, char *, char *, integer *, + integer *, doublereal *, doublecomplex *, integer *, doublereal *, + doublecomplex *, integer *); + doublereal errmax; + extern logical lzeres_(char *, char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *); + char transs[1], transt[1]; + extern /* Subroutine */ int czsyrk_(integer *, char *, char *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, doublecomplex *, integer *); + integer laa, lda, lcc, ldc; + doublecomplex als; + integer ict, icu; + doublereal err; + extern logical lze_(doublecomplex *, doublecomplex *, integer *); + + /* Fortran I/O blocks */ + static cilist io___294 = { 0, 0, 0, fmt_9992, 0 }; + static cilist io___297 = { 0, 0, 0, fmt_9998, 0 }; + static cilist io___304 = { 0, 0, 0, fmt_10000, 0 }; + static cilist io___305 = { 0, 0, 0, fmt_10001, 0 }; + static cilist io___306 = { 0, 0, 0, fmt_10002, 0 }; + static cilist io___307 = { 0, 0, 0, fmt_10003, 0 }; + static cilist io___308 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___309 = { 0, 0, 0, fmt_9996, 0 }; + + + +/* Tests ZHERK and ZSYRK. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --idim; + --alf; + --bet; + --g; + --ct; + --cs; + --cc; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --bs; + --bb; + b_dim1 = *nmax; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --as; + --aa; + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + + /* Function Body */ + conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; + + nargs = 10; + nc = 0; + reset = TRUE_; + errmax = 0.; + + i__1 = *nidim; + for (in = 1; in <= i__1; ++in) { + n = idim[in]; +/* Set LDC to 1 more than minimum value if room. */ + ldc = n; + if (ldc < *nmax) { + ++ldc; + } +/* Skip tests if not enough room. */ + if (ldc > *nmax) { + goto L100; + } + lcc = ldc * n; + + i__2 = *nidim; + for (ik = 1; ik <= i__2; ++ik) { + k = idim[ik]; + + for (ict = 1; ict <= 2; ++ict) { + *(unsigned char *)trans = *(unsigned char *)&icht[ict - 1]; + tran = *(unsigned char *)trans == 'C'; + if (tran && ! conj) { + *(unsigned char *)trans = 'T'; + } + if (tran) { + ma = k; + na = n; + } else { + ma = n; + na = k; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = ma; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L80; + } + laa = lda * na; + +/* Generate the matrix A. */ + + zmake_("ge", " ", " ", &ma, &na, &a[a_offset], nmax, &aa[1], & + lda, &reset, &c_b1); + + for (icu = 1; icu <= 2; ++icu) { + *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1]; + upper = *(unsigned char *)uplo == 'U'; + + i__3 = *nalf; + for (ia = 1; ia <= i__3; ++ia) { + i__4 = ia; + alpha.r = alf[i__4].r, alpha.i = alf[i__4].i; + if (conj) { + ralpha = alpha.r; + z__1.r = ralpha, z__1.i = 0.; + alpha.r = z__1.r, alpha.i = z__1.i; + } + + i__4 = *nbet; + for (ib = 1; ib <= i__4; ++ib) { + i__5 = ib; + beta.r = bet[i__5].r, beta.i = bet[i__5].i; + if (conj) { + rbeta = beta.r; + z__1.r = rbeta, z__1.i = 0.; + beta.r = z__1.r, beta.i = z__1.i; + } + null = n <= 0; + if (conj) { + null = null || (k <= 0 || ralpha == 0.) && + rbeta == 1.; + } + +/* Generate the matrix C. */ + + zmake_(sname + 7, uplo, " ", &n, &n, &c__[ + c_offset], nmax, &cc[1], &ldc, &reset, & + c_b1); + + ++nc; + +/* Save every datum before calling the subroutine. */ + + *(unsigned char *)uplos = *(unsigned char *)uplo; + *(unsigned char *)transs = *(unsigned char *) + trans; + ns = n; + ks = k; + if (conj) { + rals = ralpha; + } else { + als.r = alpha.r, als.i = alpha.i; + } + i__5 = laa; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7] + .i; +/* L10: */ + } + ldas = lda; + if (conj) { + rbets = rbeta; + } else { + bets.r = beta.r, bets.i = beta.i; + } + i__5 = lcc; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7] + .i; +/* L20: */ + } + ldcs = ldc; + +/* Call the subroutine. */ + + if (conj) { + if (*trace) { + zprcn6_(ntra, &nc, sname, iorder, uplo, + trans, &n, &k, &ralpha, &lda, & + rbeta, &ldc); + } + if (*rewi) { + al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1); + } + czherk_(iorder, uplo, trans, &n, &k, &ralpha, + &aa[1], &lda, &rbeta, &cc[1], &ldc); + } else { + if (*trace) { + zprcn4_(ntra, &nc, sname, iorder, uplo, + trans, &n, &k, &alpha, &lda, & + beta, &ldc); + } + if (*rewi) { + al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1); + } + czsyrk_(iorder, uplo, trans, &n, &k, &alpha, & + aa[1], &lda, &beta, &cc[1], &ldc); + } + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + io___294.ciunit = *nout; + s_wsfe(&io___294); + e_wsfe(); + *fatal = TRUE_; + goto L120; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)uplos == *(unsigned + char *)uplo; + isame[1] = *(unsigned char *)transs == *(unsigned + char *)trans; + isame[2] = ns == n; + isame[3] = ks == k; + if (conj) { + isame[4] = rals == ralpha; + } else { + isame[4] = als.r == alpha.r && als.i == + alpha.i; + } + isame[5] = lze_(&as[1], &aa[1], &laa); + isame[6] = ldas == lda; + if (conj) { + isame[7] = rbets == rbeta; + } else { + isame[7] = bets.r == beta.r && bets.i == + beta.i; + } + if (null) { + isame[8] = lze_(&cs[1], &cc[1], &lcc); + } else { + isame[8] = lzeres_(sname + 7, uplo, &n, &n, & + cs[1], &cc[1], &ldc); + } + isame[9] = ldcs == ldc; + +/* If data was incorrectly changed, report and */ +/* return. */ + + same = TRUE_; + i__5 = nargs; + for (i__ = 1; i__ <= i__5; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + io___297.ciunit = *nout; + s_wsfe(&io___297); + do_fio(&c__1, (char *)&i__, (ftnlen) + sizeof(integer)); + e_wsfe(); + } +/* L30: */ + } + if (! same) { + *fatal = TRUE_; + goto L120; + } + + if (! null) { + +/* Check the result column by column. */ + + if (conj) { + *(unsigned char *)transt = 'C'; + } else { + *(unsigned char *)transt = 'T'; + } + jc = 1; + i__5 = n; + for (j = 1; j <= i__5; ++j) { + if (upper) { + jj = 1; + lj = j; + } else { + jj = j; + lj = n - j + 1; + } + if (tran) { + zmmch_(transt, "N", &lj, &c__1, &k, & + alpha, &a[jj * a_dim1 + 1], + nmax, &a[j * a_dim1 + 1], + nmax, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, + fatal, nout, &c_true); + } else { + zmmch_("N", transt, &lj, &c__1, &k, & + alpha, &a[jj + a_dim1], nmax, + &a[j + a_dim1], nmax, &beta, & + c__[jj + j * c_dim1], nmax, & + ct[1], &g[1], &cc[jc], &ldc, + eps, &err, fatal, nout, & + c_true); + } + if (upper) { + jc += ldc; + } else { + jc = jc + ldc + 1; + } + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L110; + } +/* L40: */ + } + } + +/* L50: */ + } + +/* L60: */ + } + +/* L70: */ + } + +L80: + ; + } + +/* L90: */ + } + +L100: + ; + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + io___304.ciunit = *nout; + s_wsfe(&io___304); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + if (*iorder == 1) { + io___305.ciunit = *nout; + s_wsfe(&io___305); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + } else { + if (*iorder == 0) { + io___306.ciunit = *nout; + s_wsfe(&io___306); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); + e_wsfe(); + } + if (*iorder == 1) { + io___307.ciunit = *nout; + s_wsfe(&io___307); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); + e_wsfe(); + } + } + goto L130; + +L110: + if (n > 1) { + io___308.ciunit = *nout; + s_wsfe(&io___308); + do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer)); + e_wsfe(); + } + +L120: + io___309.ciunit = *nout; + s_wsfe(&io___309); + do_fio(&c__1, sname, (ftnlen)13); + e_wsfe(); + if (conj) { + zprcn6_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &ralpha, &lda, + &rbeta, &ldc); + } else { + zprcn4_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, & + beta, &ldc); + } + +L130: + return 0; + +/* L9994: */ +/* L9993: */ + +/* End of CCHK4. */ + +} /* zchk4_ */ + + +/* Subroutine */ int zprcn4_(integer *nout, integer *nc, char *sname, integer + *iorder, char *uplo, char *transa, integer *n, integer *k, + doublecomplex *alpha, integer *lda, doublecomplex *beta, integer *ldc) +{ + /* Format strings */ + static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a15,\002," + "\002))"; + static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002" + ",f4.1,\002), A,\002,i3,\002, (\002,f4.1,\002,\002,f4.1,\002), C" + ",\002,i3,\002).\002)"; + + /* Local variables */ + char ca[14], cu[14], crc[14]; + + /* Fortran I/O blocks */ + static cilist io___313 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___314 = { 0, 0, 0, fmt_9994, 0 }; + + + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + io___313.ciunit = *nout; + s_wsfe(&io___313); + do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, crc, (ftnlen)14); + do_fio(&c__1, cu, (ftnlen)14); + do_fio(&c__1, ca, (ftnlen)14); + e_wsfe(); + io___314.ciunit = *nout; + s_wsfe(&io___314); + do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal)); + do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(doublereal)); + do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); + e_wsfe(); + return 0; +} /* zprcn4_ */ + + + +/* Subroutine */ int zprcn6_(integer *nout, integer *nc, char *sname, integer + *iorder, char *uplo, char *transa, integer *n, integer *k, doublereal + *alpha, integer *lda, doublereal *beta, integer *ldc) +{ + /* Format strings */ + static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a15,\002," + "\002))"; + static char fmt_9994[] = "(10x,2(i3,\002,\002),f4.1,\002, A,\002,i3" + ",\002,\002,f4.1,\002, C,\002,i3,\002).\002)"; + + /* Local variables */ + char ca[14], cu[14], crc[14]; + + /* Fortran I/O blocks */ + static cilist io___318 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___319 = { 0, 0, 0, fmt_9994, 0 }; + + + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + io___318.ciunit = *nout; + s_wsfe(&io___318); + do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, crc, (ftnlen)14); + do_fio(&c__1, cu, (ftnlen)14); + do_fio(&c__1, ca, (ftnlen)14); + e_wsfe(); + io___319.ciunit = *nout; + s_wsfe(&io___319); + do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*alpha), (ftnlen)sizeof(doublereal)); + do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*beta), (ftnlen)sizeof(doublereal)); + do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); + e_wsfe(); + return 0; +} /* zprcn6_ */ + + +/* Subroutine */ int zchk5_(char *sname, doublereal *eps, doublereal *thresh, + integer *nout, integer *ntra, logical *trace, logical *rewi, logical * + fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * + alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex * + ab, doublecomplex *aa, doublecomplex *as, doublecomplex *bb, + doublecomplex *bs, doublecomplex *c__, doublecomplex *cc, + doublecomplex *cs, doublecomplex *ct, doublereal *g, doublecomplex *w, + integer *iorder) +{ + /* Initialized data */ + + static char icht[2] = "NC"; + static char ichu[2] = "UL"; + + /* Format strings */ + static char fmt_9992[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " + "TAKEN ON VALID\002,\002 CALL *******\002)"; + static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" + " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; + static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" + "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " + " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_9995[] = "(\002 THESE ARE THE RESULTS FOR COLUMN" + " \002,i3)"; + static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" + "BER:\002)"; + + /* System generated locals */ + integer c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8; + doublecomplex z__1, z__2; + alist al__1; + + /* Local variables */ + integer jjab; + doublecomplex beta; + integer ldas, ldbs, ldcs; + logical same, conj; + doublecomplex bets; + logical tran, null; + char uplo[1]; + integer i__, j, k, n; + doublecomplex alpha; + doublereal rbeta; + logical isame[13]; + extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, + integer *, doublecomplex *, integer *, doublecomplex *, integer *, + logical *, doublecomplex *); + integer nargs; + extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *, doublecomplex *, doublereal *, doublecomplex *, + integer *, doublereal *, doublereal *, logical *, integer *, + logical *); + doublereal rbets; + logical reset; + char trans[1]; + logical upper; + char uplos[1]; + integer ia, ib, jc, ma, na, nc; + extern /* Subroutine */ int zprcn5_(integer *, integer *, char *, integer + *, char *, char *, integer *, integer *, doublecomplex *, integer + *, integer *, doublecomplex *, integer *), + zprcn7_(integer *, integer *, char *, integer *, char *, char *, + integer *, integer *, doublecomplex *, integer *, integer *, + doublereal *, integer *); + integer ik, in, jj, lj, ks, ns; + doublereal errmax; + extern logical lzeres_(char *, char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *); + char transs[1], transt[1]; + extern /* Subroutine */ int czher2k_(integer *, char *, char *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublereal *, doublecomplex *, + integer *); + integer laa, lbb, lda, lcc, ldb, ldc; + doublecomplex als; + integer ict, icu; + extern /* Subroutine */ int czsyr2k_(integer *, char *, char *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *); + doublereal err; + extern logical lze_(doublecomplex *, doublecomplex *, integer *); + + /* Fortran I/O blocks */ + static cilist io___362 = { 0, 0, 0, fmt_9992, 0 }; + static cilist io___365 = { 0, 0, 0, fmt_9998, 0 }; + static cilist io___373 = { 0, 0, 0, fmt_10000, 0 }; + static cilist io___374 = { 0, 0, 0, fmt_10001, 0 }; + static cilist io___375 = { 0, 0, 0, fmt_10002, 0 }; + static cilist io___376 = { 0, 0, 0, fmt_10003, 0 }; + static cilist io___377 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___378 = { 0, 0, 0, fmt_9996, 0 }; + + + +/* Tests ZHER2K and ZSYR2K. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --idim; + --alf; + --bet; + --w; + --g; + --ct; + --cs; + --cc; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --bs; + --bb; + --as; + --aa; + --ab; + + /* Function Body */ + conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; + + nargs = 12; + nc = 0; + reset = TRUE_; + errmax = 0.; + + i__1 = *nidim; + for (in = 1; in <= i__1; ++in) { + n = idim[in]; +/* Set LDC to 1 more than minimum value if room. */ + ldc = n; + if (ldc < *nmax) { + ++ldc; + } +/* Skip tests if not enough room. */ + if (ldc > *nmax) { + goto L130; + } + lcc = ldc * n; + + i__2 = *nidim; + for (ik = 1; ik <= i__2; ++ik) { + k = idim[ik]; + + for (ict = 1; ict <= 2; ++ict) { + *(unsigned char *)trans = *(unsigned char *)&icht[ict - 1]; + tran = *(unsigned char *)trans == 'C'; + if (tran && ! conj) { + *(unsigned char *)trans = 'T'; + } + if (tran) { + ma = k; + na = n; + } else { + ma = n; + na = k; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = ma; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L110; + } + laa = lda * na; + +/* Generate the matrix A. */ + + if (tran) { + i__3 = *nmax << 1; + zmake_("ge", " ", " ", &ma, &na, &ab[1], &i__3, &aa[1], & + lda, &reset, &c_b1); + } else { + zmake_("ge", " ", " ", &ma, &na, &ab[1], nmax, &aa[1], & + lda, &reset, &c_b1); + } + +/* Generate the matrix B. */ + + ldb = lda; + lbb = laa; + if (tran) { + i__3 = *nmax << 1; + zmake_("ge", " ", " ", &ma, &na, &ab[k + 1], &i__3, &bb[1] + , &ldb, &reset, &c_b1); + } else { + zmake_("ge", " ", " ", &ma, &na, &ab[k * *nmax + 1], nmax, + &bb[1], &ldb, &reset, &c_b1); + } + + for (icu = 1; icu <= 2; ++icu) { + *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1]; + upper = *(unsigned char *)uplo == 'U'; + + i__3 = *nalf; + for (ia = 1; ia <= i__3; ++ia) { + i__4 = ia; + alpha.r = alf[i__4].r, alpha.i = alf[i__4].i; + + i__4 = *nbet; + for (ib = 1; ib <= i__4; ++ib) { + i__5 = ib; + beta.r = bet[i__5].r, beta.i = bet[i__5].i; + if (conj) { + rbeta = beta.r; + z__1.r = rbeta, z__1.i = 0.; + beta.r = z__1.r, beta.i = z__1.i; + } + null = n <= 0; + if (conj) { + null = null || (k <= 0 || alpha.r == 0. && + alpha.i == 0.) && rbeta == 1.; + } + +/* Generate the matrix C. */ + + zmake_(sname + 7, uplo, " ", &n, &n, &c__[ + c_offset], nmax, &cc[1], &ldc, &reset, & + c_b1); + + ++nc; + +/* Save every datum before calling the subroutine. */ + + *(unsigned char *)uplos = *(unsigned char *)uplo; + *(unsigned char *)transs = *(unsigned char *) + trans; + ns = n; + ks = k; + als.r = alpha.r, als.i = alpha.i; + i__5 = laa; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7] + .i; +/* L10: */ + } + ldas = lda; + i__5 = lbb; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + bs[i__6].r = bb[i__7].r, bs[i__6].i = bb[i__7] + .i; +/* L20: */ + } + ldbs = ldb; + if (conj) { + rbets = rbeta; + } else { + bets.r = beta.r, bets.i = beta.i; + } + i__5 = lcc; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7] + .i; +/* L30: */ + } + ldcs = ldc; + +/* Call the subroutine. */ + + if (conj) { + if (*trace) { + zprcn7_(ntra, &nc, sname, iorder, uplo, + trans, &n, &k, &alpha, &lda, &ldb, + &rbeta, &ldc); + } + if (*rewi) { + al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1); + } + czher2k_(iorder, uplo, trans, &n, &k, &alpha, + &aa[1], &lda, &bb[1], &ldb, &rbeta, & + cc[1], &ldc); + } else { + if (*trace) { + zprcn5_(ntra, &nc, sname, iorder, uplo, + trans, &n, &k, &alpha, &lda, &ldb, + &beta, &ldc); + } + if (*rewi) { + al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1); + } + czsyr2k_(iorder, uplo, trans, &n, &k, &alpha, + &aa[1], &lda, &bb[1], &ldb, &beta, & + cc[1], &ldc); + } + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + io___362.ciunit = *nout; + s_wsfe(&io___362); + e_wsfe(); + *fatal = TRUE_; + goto L150; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)uplos == *(unsigned + char *)uplo; + isame[1] = *(unsigned char *)transs == *(unsigned + char *)trans; + isame[2] = ns == n; + isame[3] = ks == k; + isame[4] = als.r == alpha.r && als.i == alpha.i; + isame[5] = lze_(&as[1], &aa[1], &laa); + isame[6] = ldas == lda; + isame[7] = lze_(&bs[1], &bb[1], &lbb); + isame[8] = ldbs == ldb; + if (conj) { + isame[9] = rbets == rbeta; + } else { + isame[9] = bets.r == beta.r && bets.i == + beta.i; + } + if (null) { + isame[10] = lze_(&cs[1], &cc[1], &lcc); + } else { + isame[10] = lzeres_("he", uplo, &n, &n, &cs[1] + , &cc[1], &ldc); + } + isame[11] = ldcs == ldc; + +/* If data was incorrectly changed, report and */ +/* return. */ + + same = TRUE_; + i__5 = nargs; + for (i__ = 1; i__ <= i__5; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + io___365.ciunit = *nout; + s_wsfe(&io___365); + do_fio(&c__1, (char *)&i__, (ftnlen) + sizeof(integer)); + e_wsfe(); + } +/* L40: */ + } + if (! same) { + *fatal = TRUE_; + goto L150; + } + + if (! null) { + +/* Check the result column by column. */ + + if (conj) { + *(unsigned char *)transt = 'C'; + } else { + *(unsigned char *)transt = 'T'; + } + jjab = 1; + jc = 1; + i__5 = n; + for (j = 1; j <= i__5; ++j) { + if (upper) { + jj = 1; + lj = j; + } else { + jj = j; + lj = n - j + 1; + } + if (tran) { + i__6 = k; + for (i__ = 1; i__ <= i__6; ++i__) { + i__7 = i__; + i__8 = (j - 1 << 1) * *nmax + k + + i__; + z__1.r = alpha.r * ab[i__8].r - + alpha.i * ab[i__8].i, + z__1.i = alpha.r * ab[ + i__8].i + alpha.i * ab[ + i__8].r; + w[i__7].r = z__1.r, w[i__7].i = + z__1.i; + if (conj) { + i__7 = k + i__; + d_cnjg(&z__2, &alpha); + i__8 = (j - 1 << 1) * *nmax + i__; + z__1.r = z__2.r * ab[i__8].r - z__2.i * ab[i__8].i, + z__1.i = z__2.r * ab[i__8].i + z__2.i * ab[ + i__8].r; + w[i__7].r = z__1.r, w[i__7].i = z__1.i; + } else { + i__7 = k + i__; + i__8 = (j - 1 << 1) * *nmax + i__; + z__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] + .i, z__1.i = alpha.r * ab[i__8].i + alpha.i + * ab[i__8].r; + w[i__7].r = z__1.r, w[i__7].i = z__1.i; + } +/* L50: */ + } + i__6 = k << 1; + i__7 = *nmax << 1; + i__8 = *nmax << 1; + zmmch_(transt, "N", &lj, &c__1, &i__6, + &c_b2, &ab[jjab], &i__7, &w[ + 1], &i__8, &beta, &c__[jj + j + * c_dim1], nmax, &ct[1], &g[1] + , &cc[jc], &ldc, eps, &err, + fatal, nout, &c_true); + } else { + i__6 = k; + for (i__ = 1; i__ <= i__6; ++i__) { + if (conj) { + i__7 = i__; + d_cnjg(&z__2, &ab[(k + i__ - 1) * *nmax + j]); + z__1.r = alpha.r * z__2.r - alpha.i * z__2.i, + z__1.i = alpha.r * z__2.i + alpha.i * + z__2.r; + w[i__7].r = z__1.r, w[i__7].i = z__1.i; + i__7 = k + i__; + i__8 = (i__ - 1) * *nmax + j; + z__2.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] + .i, z__2.i = alpha.r * ab[i__8].i + alpha.i + * ab[i__8].r; + d_cnjg(&z__1, &z__2); + w[i__7].r = z__1.r, w[i__7].i = z__1.i; + } else { + i__7 = i__; + i__8 = (k + i__ - 1) * *nmax + j; + z__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] + .i, z__1.i = alpha.r * ab[i__8].i + alpha.i + * ab[i__8].r; + w[i__7].r = z__1.r, w[i__7].i = z__1.i; + i__7 = k + i__; + i__8 = (i__ - 1) * *nmax + j; + z__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] + .i, z__1.i = alpha.r * ab[i__8].i + alpha.i + * ab[i__8].r; + w[i__7].r = z__1.r, w[i__7].i = z__1.i; + } +/* L60: */ + } + i__6 = k << 1; + i__7 = *nmax << 1; + zmmch_("N", "N", &lj, &c__1, &i__6, & + c_b2, &ab[jj], nmax, &w[1], & + i__7, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, + fatal, nout, &c_true); + } + if (upper) { + jc += ldc; + } else { + jc = jc + ldc + 1; + if (tran) { + jjab += *nmax << 1; + } + } + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L140; + } +/* L70: */ + } + } + +/* L80: */ + } + +/* L90: */ + } + +/* L100: */ + } + +L110: + ; + } + +/* L120: */ + } + +L130: + ; + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + io___373.ciunit = *nout; + s_wsfe(&io___373); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + if (*iorder == 1) { + io___374.ciunit = *nout; + s_wsfe(&io___374); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + } else { + if (*iorder == 0) { + io___375.ciunit = *nout; + s_wsfe(&io___375); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); + e_wsfe(); + } + if (*iorder == 1) { + io___376.ciunit = *nout; + s_wsfe(&io___376); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); + e_wsfe(); + } + } + goto L160; + +L140: + if (n > 1) { + io___377.ciunit = *nout; + s_wsfe(&io___377); + do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer)); + e_wsfe(); + } + +L150: + io___378.ciunit = *nout; + s_wsfe(&io___378); + do_fio(&c__1, sname, (ftnlen)13); + e_wsfe(); + if (conj) { + zprcn7_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, & + ldb, &rbeta, &ldc); + } else { + zprcn5_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, & + ldb, &beta, &ldc); + } + +L160: + return 0; + +/* L9994: */ +/* L9993: */ + +/* End of ZCHK5. */ + +} /* zchk5_ */ + + +/* Subroutine */ int zprcn5_(integer *nout, integer *nc, char *sname, integer + *iorder, char *uplo, char *transa, integer *n, integer *k, + doublecomplex *alpha, integer *lda, integer *ldb, doublecomplex *beta, + integer *ldc) +{ + /* Format strings */ + static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a15,\002," + "\002))"; + static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002" + ",f4.1,\002), A,\002,i3,\002, B\002,i3,\002, (\002,f4.1,\002,\002" + ",f4.1,\002), C,\002,i3,\002).\002)"; + + /* Local variables */ + char ca[14], cu[14], crc[14]; + + /* Fortran I/O blocks */ + static cilist io___382 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___383 = { 0, 0, 0, fmt_9994, 0 }; + + + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + io___382.ciunit = *nout; + s_wsfe(&io___382); + do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, crc, (ftnlen)14); + do_fio(&c__1, cu, (ftnlen)14); + do_fio(&c__1, ca, (ftnlen)14); + e_wsfe(); + io___383.ciunit = *nout; + s_wsfe(&io___383); + do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal)); + do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(doublereal)); + do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); + e_wsfe(); + return 0; +} /* zprcn5_ */ + + + +/* Subroutine */ int zprcn7_(integer *nout, integer *nc, char *sname, integer + *iorder, char *uplo, char *transa, integer *n, integer *k, + doublecomplex *alpha, integer *lda, integer *ldb, doublereal *beta, + integer *ldc) +{ + /* Format strings */ + static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a15,\002," + "\002))"; + static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002" + ",f4.1,\002), A,\002,i3,\002, B\002,i3,\002,\002,f4.1,\002, C," + "\002,i3,\002).\002)"; + + /* Local variables */ + char ca[14], cu[14], crc[14]; + + /* Fortran I/O blocks */ + static cilist io___387 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___388 = { 0, 0, 0, fmt_9994, 0 }; + + + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + io___387.ciunit = *nout; + s_wsfe(&io___387); + do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, crc, (ftnlen)14); + do_fio(&c__1, cu, (ftnlen)14); + do_fio(&c__1, ca, (ftnlen)14); + e_wsfe(); + io___388.ciunit = *nout; + s_wsfe(&io___388); + do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal)); + do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*beta), (ftnlen)sizeof(doublereal)); + do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); + e_wsfe(); + return 0; +} /* zprcn7_ */ + + +/* Subroutine */ int zmake_(char *type__, char *uplo, char *diag, integer *m, + integer *n, doublecomplex *a, integer *nmax, doublecomplex *aa, + integer *lda, logical *reset, doublecomplex *transl) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3, i__4; + doublereal d__1; + doublecomplex z__1, z__2; + + /* Local variables */ + integer ibeg, iend; + extern /* Double Complex */ VOID zbeg_(doublecomplex *, logical *); + logical unit; + integer i__, j; + logical lower, upper; + integer jj; + logical gen, her, tri, sym; + + +/* Generates values for an M by N matrix A. */ +/* Stores the values in the array AA in the data structure required */ +/* by the routine, with unwanted elements set to rogue value. */ + +/* TYPE is 'ge', 'he', 'sy' or 'tr'. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + --aa; + + /* Function Body */ + gen = s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0; + her = s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0; + sym = s_cmp(type__, "sy", (ftnlen)2, (ftnlen)2) == 0; + tri = s_cmp(type__, "tr", (ftnlen)2, (ftnlen)2) == 0; + upper = (her || sym || tri) && *(unsigned char *)uplo == 'U'; + lower = (her || sym || tri) && *(unsigned char *)uplo == 'L'; + unit = tri && *(unsigned char *)diag == 'U'; + +/* Generate data in array A. */ + + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + if (gen || upper && i__ <= j || lower && i__ >= j) { + i__3 = i__ + j * a_dim1; + zbeg_(&z__2, reset); + z__1.r = z__2.r + transl->r, z__1.i = z__2.i + transl->i; + a[i__3].r = z__1.r, a[i__3].i = z__1.i; + if (i__ != j) { +/* Set some elements to zero */ + if (*n > 3 && j == *n / 2) { + i__3 = i__ + j * a_dim1; + a[i__3].r = 0., a[i__3].i = 0.; + } + if (her) { + i__3 = j + i__ * a_dim1; + d_cnjg(&z__1, &a[i__ + j * a_dim1]); + a[i__3].r = z__1.r, a[i__3].i = z__1.i; + } else if (sym) { + i__3 = j + i__ * a_dim1; + i__4 = i__ + j * a_dim1; + a[i__3].r = a[i__4].r, a[i__3].i = a[i__4].i; + } else if (tri) { + i__3 = j + i__ * a_dim1; + a[i__3].r = 0., a[i__3].i = 0.; + } + } + } +/* L10: */ + } + if (her) { + i__2 = j + j * a_dim1; + i__3 = j + j * a_dim1; + d__1 = a[i__3].r; + z__1.r = d__1, z__1.i = 0.; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; + } + if (tri) { + i__2 = j + j * a_dim1; + i__3 = j + j * a_dim1; + z__1.r = a[i__3].r + 1., z__1.i = a[i__3].i + 0.; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; + } + if (unit) { + i__2 = j + j * a_dim1; + a[i__2].r = 1., a[i__2].i = 0.; + } +/* L20: */ + } + +/* Store elements in array AS in data structure required by routine. */ + + if (s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + i__4 = i__ + j * a_dim1; + aa[i__3].r = a[i__4].r, aa[i__3].i = a[i__4].i; +/* L30: */ + } + i__2 = *lda; + for (i__ = *m + 1; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + aa[i__3].r = -1e10, aa[i__3].i = 1e10; +/* L40: */ + } +/* L50: */ + } + } else if (s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__, + "sy", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__, "tr", (ftnlen) + 2, (ftnlen)2) == 0) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (upper) { + ibeg = 1; + if (unit) { + iend = j - 1; + } else { + iend = j; + } + } else { + if (unit) { + ibeg = j + 1; + } else { + ibeg = j; + } + iend = *n; + } + i__2 = ibeg - 1; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + aa[i__3].r = -1e10, aa[i__3].i = 1e10; +/* L60: */ + } + i__2 = iend; + for (i__ = ibeg; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + i__4 = i__ + j * a_dim1; + aa[i__3].r = a[i__4].r, aa[i__3].i = a[i__4].i; +/* L70: */ + } + i__2 = *lda; + for (i__ = iend + 1; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + aa[i__3].r = -1e10, aa[i__3].i = 1e10; +/* L80: */ + } + if (her) { + jj = j + (j - 1) * *lda; + i__2 = jj; + i__3 = jj; + d__1 = aa[i__3].r; + z__1.r = d__1, z__1.i = -1e10; + aa[i__2].r = z__1.r, aa[i__2].i = z__1.i; + } +/* L90: */ + } + } + return 0; + +/* End of ZMAKE. */ + +} /* zmake_ */ + +/* Subroutine */ int zmmch_(char *transa, char *transb, integer *m, integer * + n, integer *kk, doublecomplex *alpha, doublecomplex *a, integer *lda, + doublecomplex *b, integer *ldb, doublecomplex *beta, doublecomplex * + c__, integer *ldc, doublecomplex *ct, doublereal *g, doublecomplex * + cc, integer *ldcc, doublereal *eps, doublereal *err, logical *fatal, + integer *nout, logical *mv) +{ + /* Format strings */ + static char fmt_9999[] = "(\002 ******* FATAL ERROR - COMPUTED RESULT IS" + " LESS THAN HAL\002,\002F ACCURATE *******\002,/\002 " + " EXPECTED RE\002,\002SULT COMPUTED R" + "ESULT\002)"; + static char fmt_9998[] = "(1x,i7,2(\002 (\002,g15.6,\002,\002,g15.6," + "\002)\002))"; + static char fmt_9997[] = "(\002 THESE ARE THE RESULTS FOR COLUMN" + " \002,i3)"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, + cc_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; + doublereal d__1, d__2, d__3, d__4, d__5, d__6; + doublecomplex z__1, z__2, z__3, z__4; + + /* Local variables */ + doublereal erri; + integer i__, j, k; + logical trana, tranb, ctrana, ctranb; + + /* Fortran I/O blocks */ + static cilist io___409 = { 0, 0, 0, fmt_9999, 0 }; + static cilist io___410 = { 0, 0, 0, fmt_9998, 0 }; + static cilist io___411 = { 0, 0, 0, fmt_9998, 0 }; + static cilist io___412 = { 0, 0, 0, fmt_9997, 0 }; + + + +/* Checks the results of the computational tests. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --ct; + --g; + cc_dim1 = *ldcc; + cc_offset = 1 + cc_dim1 * 1; + cc -= cc_offset; + + /* Function Body */ + trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == + 'C'; + tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == + 'C'; + ctrana = *(unsigned char *)transa == 'C'; + ctranb = *(unsigned char *)transb == 'C'; + +/* Compute expected result, one column at a time, in CT using data */ +/* in A, B and C. */ +/* Compute gauges in G. */ + + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__; + ct[i__3].r = 0., ct[i__3].i = 0.; + g[i__] = 0.; +/* L10: */ + } + if (! trana && ! tranb) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = i__ + k * a_dim1; + i__7 = k + j * b_dim1; + z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7].i, + z__2.i = a[i__6].r * b[i__7].i + a[i__6].i * b[ + i__7].r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = i__ + k * a_dim1; + i__5 = k + j * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = d_imag( + &a[i__ + k * a_dim1]), abs(d__2))) * ((d__3 = b[ + i__5].r, abs(d__3)) + (d__4 = d_imag(&b[k + j * + b_dim1]), abs(d__4))); +/* L20: */ + } +/* L30: */ + } + } else if (trana && ! tranb) { + if (ctrana) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + d_cnjg(&z__3, &a[k + i__ * a_dim1]); + i__6 = k + j * b_dim1; + z__2.r = z__3.r * b[i__6].r - z__3.i * b[i__6].i, + z__2.i = z__3.r * b[i__6].i + z__3.i * b[i__6] + .r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = k + i__ * a_dim1; + i__5 = k + j * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + d_imag(&a[k + i__ * a_dim1]), abs(d__2))) * (( + d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag( + &b[k + j * b_dim1]), abs(d__4))); +/* L40: */ + } +/* L50: */ + } + } else { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = k + i__ * a_dim1; + i__7 = k + j * b_dim1; + z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7] + .i, z__2.i = a[i__6].r * b[i__7].i + a[i__6] + .i * b[i__7].r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = k + i__ * a_dim1; + i__5 = k + j * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + d_imag(&a[k + i__ * a_dim1]), abs(d__2))) * (( + d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag( + &b[k + j * b_dim1]), abs(d__4))); +/* L60: */ + } +/* L70: */ + } + } + } else if (! trana && tranb) { + if (ctranb) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = i__ + k * a_dim1; + d_cnjg(&z__3, &b[j + k * b_dim1]); + z__2.r = a[i__6].r * z__3.r - a[i__6].i * z__3.i, + z__2.i = a[i__6].r * z__3.i + a[i__6].i * + z__3.r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = i__ + k * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + d_imag(&a[i__ + k * a_dim1]), abs(d__2))) * (( + d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag( + &b[j + k * b_dim1]), abs(d__4))); +/* L80: */ + } +/* L90: */ + } + } else { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = i__ + k * a_dim1; + i__7 = j + k * b_dim1; + z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7] + .i, z__2.i = a[i__6].r * b[i__7].i + a[i__6] + .i * b[i__7].r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = i__ + k * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + d_imag(&a[i__ + k * a_dim1]), abs(d__2))) * (( + d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag( + &b[j + k * b_dim1]), abs(d__4))); +/* L100: */ + } +/* L110: */ + } + } + } else if (trana && tranb) { + if (ctrana) { + if (ctranb) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + d_cnjg(&z__3, &a[k + i__ * a_dim1]); + d_cnjg(&z__4, &b[j + k * b_dim1]); + z__2.r = z__3.r * z__4.r - z__3.i * z__4.i, + z__2.i = z__3.r * z__4.i + z__3.i * + z__4.r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = k + i__ * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + d_imag(&a[k + i__ * a_dim1]), abs(d__2))) + * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 + = d_imag(&b[j + k * b_dim1]), abs(d__4))); +/* L120: */ + } +/* L130: */ + } + } else { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + d_cnjg(&z__3, &a[k + i__ * a_dim1]); + i__6 = j + k * b_dim1; + z__2.r = z__3.r * b[i__6].r - z__3.i * b[i__6].i, + z__2.i = z__3.r * b[i__6].i + z__3.i * b[ + i__6].r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = k + i__ * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + d_imag(&a[k + i__ * a_dim1]), abs(d__2))) + * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 + = d_imag(&b[j + k * b_dim1]), abs(d__4))); +/* L140: */ + } +/* L150: */ + } + } + } else { + if (ctranb) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = k + i__ * a_dim1; + d_cnjg(&z__3, &b[j + k * b_dim1]); + z__2.r = a[i__6].r * z__3.r - a[i__6].i * z__3.i, + z__2.i = a[i__6].r * z__3.i + a[i__6].i * + z__3.r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = k + i__ * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + d_imag(&a[k + i__ * a_dim1]), abs(d__2))) + * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 + = d_imag(&b[j + k * b_dim1]), abs(d__4))); +/* L160: */ + } +/* L170: */ + } + } else { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = k + i__ * a_dim1; + i__7 = j + k * b_dim1; + z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[ + i__7].i, z__2.i = a[i__6].r * b[i__7].i + + a[i__6].i * b[i__7].r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = k + i__ * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + d_imag(&a[k + i__ * a_dim1]), abs(d__2))) + * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 + = d_imag(&b[j + k * b_dim1]), abs(d__4))); +/* L180: */ + } +/* L190: */ + } + } + } + } + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__; + i__4 = i__; + z__2.r = alpha->r * ct[i__4].r - alpha->i * ct[i__4].i, z__2.i = + alpha->r * ct[i__4].i + alpha->i * ct[i__4].r; + i__5 = i__ + j * c_dim1; + z__3.r = beta->r * c__[i__5].r - beta->i * c__[i__5].i, z__3.i = + beta->r * c__[i__5].i + beta->i * c__[i__5].r; + z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i; + ct[i__3].r = z__1.r, ct[i__3].i = z__1.i; + i__3 = i__ + j * c_dim1; + g[i__] = ((d__1 = alpha->r, abs(d__1)) + (d__2 = d_imag(alpha), + abs(d__2))) * g[i__] + ((d__3 = beta->r, abs(d__3)) + ( + d__4 = d_imag(beta), abs(d__4))) * ((d__5 = c__[i__3].r, + abs(d__5)) + (d__6 = d_imag(&c__[i__ + j * c_dim1]), abs( + d__6))); +/* L200: */ + } + +/* Compute the error ratio for this result. */ + + *err = 0.; + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__; + i__4 = i__ + j * cc_dim1; + z__2.r = ct[i__3].r - cc[i__4].r, z__2.i = ct[i__3].i - cc[i__4] + .i; + z__1.r = z__2.r, z__1.i = z__2.i; + erri = ((d__1 = z__1.r, abs(d__1)) + (d__2 = d_imag(&z__1), abs( + d__2))) / *eps; + if (g[i__] != 0.) { + erri /= g[i__]; + } + *err = f2cmax(*err,erri); + if (*err * sqrt(*eps) >= 1.) { + goto L230; + } +/* L210: */ + } + +/* L220: */ + } + +/* If the loop completes, all results are at least half accurate. */ + goto L250; + +/* Report fatal error. */ + +L230: + *fatal = TRUE_; + io___409.ciunit = *nout; + s_wsfe(&io___409); + e_wsfe(); + i__1 = *m; + for (i__ = 1; i__ <= i__1; ++i__) { + if (*mv) { + io___410.ciunit = *nout; + s_wsfe(&io___410); + do_fio(&c__1, (char *)&i__, (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&ct[i__], (ftnlen)sizeof(doublereal)); + do_fio(&c__2, (char *)&cc[i__ + j * cc_dim1], (ftnlen)sizeof( + doublereal)); + e_wsfe(); + } else { + io___411.ciunit = *nout; + s_wsfe(&io___411); + do_fio(&c__1, (char *)&i__, (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&cc[i__ + j * cc_dim1], (ftnlen)sizeof( + doublereal)); + do_fio(&c__2, (char *)&ct[i__], (ftnlen)sizeof(doublereal)); + e_wsfe(); + } +/* L240: */ + } + if (*n > 1) { + io___412.ciunit = *nout; + s_wsfe(&io___412); + do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer)); + e_wsfe(); + } + +L250: + return 0; + + +/* End of ZMMCH. */ + +} /* zmmch_ */ + +logical lze_(doublecomplex *ri, doublecomplex *rj, integer *lr) +{ + /* System generated locals */ + integer i__1, i__2, i__3; + logical ret_val; + + /* Local variables */ + integer i__; + + +/* Tests if two arrays are identical. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --rj; + --ri; + + /* Function Body */ + i__1 = *lr; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + i__3 = i__; + if (ri[i__2].r != rj[i__3].r || ri[i__2].i != rj[i__3].i) { + goto L20; + } +/* L10: */ + } + ret_val = TRUE_; + goto L30; +L20: + ret_val = FALSE_; +L30: + return ret_val; + +/* End of LZE. */ + +} /* lze_ */ + +logical lzeres_(char *type__, char *uplo, integer *m, integer *n, + doublecomplex *aa, doublecomplex *as, integer *lda) +{ + /* System generated locals */ + integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2, i__3, i__4; + logical ret_val; + + /* Local variables */ + integer ibeg, iend, i__, j; + logical upper; + + +/* Tests if selected elements in two arrays are equal. */ + +/* TYPE is 'ge' or 'he' or 'sy'. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + as_dim1 = *lda; + as_offset = 1 + as_dim1 * 1; + as -= as_offset; + aa_dim1 = *lda; + aa_offset = 1 + aa_dim1 * 1; + aa -= aa_offset; + + /* Function Body */ + upper = *(unsigned char *)uplo == 'U'; + if (s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *lda; + for (i__ = *m + 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * aa_dim1; + i__4 = i__ + j * as_dim1; + if (aa[i__3].r != as[i__4].r || aa[i__3].i != as[i__4].i) { + goto L70; + } +/* L10: */ + } +/* L20: */ + } + } else if (s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__, + "sy", (ftnlen)2, (ftnlen)2) == 0) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (upper) { + ibeg = 1; + iend = j; + } else { + ibeg = j; + iend = *n; + } + i__2 = ibeg - 1; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * aa_dim1; + i__4 = i__ + j * as_dim1; + if (aa[i__3].r != as[i__4].r || aa[i__3].i != as[i__4].i) { + goto L70; + } +/* L30: */ + } + i__2 = *lda; + for (i__ = iend + 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * aa_dim1; + i__4 = i__ + j * as_dim1; + if (aa[i__3].r != as[i__4].r || aa[i__3].i != as[i__4].i) { + goto L70; + } +/* L40: */ + } +/* L50: */ + } + } + +/* L60: */ + ret_val = TRUE_; + goto L80; +L70: + ret_val = FALSE_; +L80: + return ret_val; + +/* End of LZERES. */ + +} /* lzeres_ */ + +/* Double Complex */ VOID zbeg_(doublecomplex * ret_val, logical *reset) +{ + /* System generated locals */ + doublereal d__1, d__2; + doublecomplex z__1; + + /* Local variables */ + static integer i__, j, ic, mi, mj; + + +/* Generates complex numbers as pairs of random numbers uniformly */ +/* distributed between -0.5 and 0.5. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + if (*reset) { +/* Initialize local variables. */ + mi = 891; + mj = 457; + i__ = 7; + j = 7; + ic = 0; + *reset = FALSE_; + } + +/* The sequence of values of I or J is bounded between 1 and 999. */ +/* If initial I or J = 1,2,3,6,7 or 9, the period will be 50. */ +/* If initial I or J = 4 or 8, the period will be 25. */ +/* If initial I or J = 5, the period will be 10. */ +/* IC is used to break up the period by skipping 1 value of I or J */ +/* in 6. */ + + ++ic; +L10: + i__ *= mi; + j *= mj; + i__ -= i__ / 1000 * 1000; + j -= j / 1000 * 1000; + if (ic >= 5) { + ic = 0; + goto L10; + } + d__1 = (i__ - 500) / 1001.; + d__2 = (j - 500) / 1001.; + z__1.r = d__1, z__1.i = d__2; + ret_val->r = z__1.r, ret_val->i = z__1.i; + return ; + +/* End of ZBEG. */ + +} /* zbeg_ */ + +doublereal ddiff_(doublereal *x, doublereal *y) +{ + /* System generated locals */ + doublereal ret_val; + + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + ret_val = *x - *y; + return ret_val; + +/* End of DDIFF. */ + +} /* ddiff_ */ + +/* Main program alias */ int zblat3_ () { MAIN__ (); return 0; } From 175e357f5d576e493cba9f180ad78800462f0c3f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 26 Feb 2024 14:19:50 +0100 Subject: [PATCH 09/23] run apt-get update before fetching Ubuntu packages --- .github/workflows/dynamic_arch.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index 49721958a..669aa8116 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -42,6 +42,7 @@ jobs: - name: Install Dependencies run: | if [ "$RUNNER_OS" == "Linux" ]; then + sudo apt-get update sudo apt-get install -y gfortran cmake ccache libtinfo5 elif [ "$RUNNER_OS" == "macOS" ]; then # It looks like "gfortran" isn't working correctly unless "gcc" is re-installed. From a1ec94c258ac4151eb69012310e0f694f7067ea2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 26 Feb 2024 17:46:07 +0100 Subject: [PATCH 10/23] Readd proper f2c'd sources for the GEMM3M tests --- ctest/c_cblat3c_3m.c | 1490 +++++-------------------- ctest/c_zblat3c_3m.c | 2454 +++++++++++++----------------------------- 2 files changed, 1043 insertions(+), 2901 deletions(-) diff --git a/ctest/c_cblat3c_3m.c b/ctest/c_cblat3c_3m.c index 9cfa26a41..b5d6bf9cb 100644 --- a/ctest/c_cblat3c_3m.c +++ b/ctest/c_cblat3c_3m.c @@ -10,25 +10,7 @@ #undef I #endif -#if defined(_WIN64) -typedef long long BLASLONG; -typedef unsigned long long BLASULONG; -#else -typedef long BLASLONG; -typedef unsigned long BLASULONG; -#endif - -#ifdef LAPACK_ILP64 -typedef BLASLONG blasint; -#if defined(_WIN64) -#define blasabs(x) llabs(x) -#else -#define blasabs(x) labs(x) -#endif -#else -typedef int blasint; -#define blasabs(x) abs(x) -#endif +#include "common.h" typedef blasint integer; @@ -247,7 +229,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,259 +242,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i= 0; if (trace) { - o__1.oerr = 0; +/* o__1.oerr = 0; o__1.ounit = ntra; o__1.ofnmlen = 32; o__1.ofnm = snaps; @@ -774,147 +411,122 @@ static logical c_false = FALSE_; o__1.oacc = 0; o__1.ofm = 0; o__1.oblnk = 0; - f_open(&o__1); + f_open(&o__1);*/ } /* Read the flag that directs rewinding of the snapshot file. */ - s_rsle(&io___7); - do_lio(&c__8, &c__1, (char *)&rewi, (ftnlen)sizeof(logical)); - e_rsle(); - rewi = rewi && trace; + fgets(line,80,stdin); + sscanf(line,"%d",&rewi); + rewi = rewi && trace; /* Read the flag that directs stopping on any failure. */ - s_rsle(&io___9); - do_lio(&c__8, &c__1, (char *)&sfatal, (ftnlen)sizeof(logical)); - e_rsle(); -/* Read the flag that indicates whether error exits are to be tested. */ - s_rsle(&io___11); - do_lio(&c__8, &c__1, (char *)&tsterr, (ftnlen)sizeof(logical)); - e_rsle(); -/* Read the flag that indicates whether row-major data layout to be tested. */ - s_rsle(&io___13); - do_lio(&c__3, &c__1, (char *)&layout, (ftnlen)sizeof(integer)); - e_rsle(); -/* Read the threshold value of the test ratio */ - s_rsle(&io___15); - do_lio(&c__4, &c__1, (char *)&thresh, (ftnlen)sizeof(real)); - e_rsle(); + fgets(line,80,stdin); + sscanf(line,"%c",&tmpchar); + sfatal=FALSE_; + if (tmpchar=='T')sfatal=TRUE_; + fgets(line,80,stdin); + sscanf(line,"%c",&tmpchar); + tsterr=FALSE_; + if (tmpchar=='T')tsterr=TRUE_; + fgets(line,80,stdin); + sscanf(line,"%d",&layout); + fgets(line,80,stdin); + sscanf(line,"%f",&thresh); + /* Read and check the parameter values for the tests. */ /* Values of N */ - s_rsle(&io___17); - do_lio(&c__3, &c__1, (char *)&nidim, (ftnlen)sizeof(integer)); - e_rsle(); + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%ld",&nidim); +#else + sscanf(line,"%d",&nidim); +#endif + if (nidim < 1 || nidim > 9) { - s_wsfe(&io___19); - do_fio(&c__1, "N", (ftnlen)1); - do_fio(&c__1, (char *)&c__9, (ftnlen)sizeof(integer)); - e_wsfe(); + fprintf(stderr,"NUMBER OF VALUES OF N IS LESS THAN 1 OR GREATER THAN 9"); goto L220; } - s_rsle(&io___20); - i__1 = nidim; - for (i__ = 1; i__ <= i__1; ++i__) { - do_lio(&c__3, &c__1, (char *)&idim[i__ - 1], (ftnlen)sizeof(integer)); - } - e_rsle(); + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%ld %ld %ld %ld %ld %ld %ld %ld %ld",&idim[0],&idim[1],&idim[2], + &idim[3],&idim[4],&idim[5],&idim[6],&idim[7],&idim[8]); +#else + sscanf(line,"%d %d %d %d %d %d %d %d %d",&idim[0],&idim[1],&idim[2], + &idim[3],&idim[4],&idim[5],&idim[6],&idim[7],&idim[8]); +#endif i__1 = nidim; for (i__ = 1; i__ <= i__1; ++i__) { if (idim[i__ - 1] < 0 || idim[i__ - 1] > 65) { - s_wsfe(&io___23); - do_fio(&c__1, (char *)&c__65, (ftnlen)sizeof(integer)); - e_wsfe(); + fprintf(stderr,"VALUE OF N IS LESS THAN 0 OR GREATER THAN 65\n"); goto L220; } /* L10: */ } /* Values of ALPHA */ - s_rsle(&io___24); - do_lio(&c__3, &c__1, (char *)&nalf, (ftnlen)sizeof(integer)); - e_rsle(); + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%ld",&nalf); +#else + sscanf(line,"%d",&nalf); +#endif if (nalf < 1 || nalf > 7) { - s_wsfe(&io___26); - do_fio(&c__1, "ALPHA", (ftnlen)5); - do_fio(&c__1, (char *)&c__7, (ftnlen)sizeof(integer)); - e_wsfe(); + fprintf(stderr,"VALUE OF ALPHA IS LESS THAN 0 OR GREATER THAN 7\n"); goto L220; } - s_rsle(&io___27); - i__1 = nalf; - for (i__ = 1; i__ <= i__1; ++i__) { - do_lio(&c__6, &c__1, (char *)&alf[i__ - 1], (ftnlen)sizeof(complex)); - } - e_rsle(); + fgets(line,80,stdin); + sscanf(line,"(%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f)",&alf[0].r,&alf[0].i,&alf[1].r,&alf[1].i,&alf[2].r,&alf[2].i,&alf[3].r,&alf[3].i, + &alf[4].r,&alf[4].i,&alf[5].r,&alf[5].i,&alf[6].r,&alf[6].i); + +// i__1 = nalf; +// for (i__ = 1; i__ <= i__1; ++i__) { +// do_lio(&c__6, &c__1, (char *)&alf[i__ - 1], (ftnlen)sizeof(complex)); +// } /* Values of BETA */ - s_rsle(&io___29); - do_lio(&c__3, &c__1, (char *)&nbet, (ftnlen)sizeof(integer)); - e_rsle(); - if (nbet < 1 || nbet > 7) { - s_wsfe(&io___31); - do_fio(&c__1, "BETA", (ftnlen)4); - do_fio(&c__1, (char *)&c__7, (ftnlen)sizeof(integer)); - e_wsfe(); + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%ld",&nbet); +#else + sscanf(line,"%d",&nbet); +#endif + if (nalf < 1 || nbet > 7) { + fprintf(stderr,"VALUE OF BETA IS LESS THAN 0 OR GREATER THAN 7\n"); goto L220; } - s_rsle(&io___32); - i__1 = nbet; - for (i__ = 1; i__ <= i__1; ++i__) { - do_lio(&c__6, &c__1, (char *)&bet[i__ - 1], (ftnlen)sizeof(complex)); - } - e_rsle(); + fgets(line,80,stdin); + sscanf(line,"(%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f)",&bet[0].r,&bet[0].i,&bet[1].r,&bet[1].i,&bet[2].r,&bet[2].i,&bet[3].r,&bet[3].i, + &bet[4].r,&bet[4].i,&bet[5].r,&bet[5].i,&bet[6].r,&bet[6].i); + /* Report values of parameters. */ - s_wsfe(&io___34); - e_wsfe(); - s_wsfe(&io___35); - i__1 = nidim; - for (i__ = 1; i__ <= i__1; ++i__) { - do_fio(&c__1, (char *)&idim[i__ - 1], (ftnlen)sizeof(integer)); - } - e_wsfe(); - s_wsfe(&io___36); - i__1 = nalf; - for (i__ = 1; i__ <= i__1; ++i__) { - do_fio(&c__2, (char *)&alf[i__ - 1], (ftnlen)sizeof(real)); - } - e_wsfe(); - s_wsfe(&io___37); - i__1 = nbet; - for (i__ = 1; i__ <= i__1; ++i__) { - do_fio(&c__2, (char *)&bet[i__ - 1], (ftnlen)sizeof(real)); - } - e_wsfe(); + printf("TESTS OF THE COMPLEX LEVEL 3 BLAS\nTHE FOLLOWING PARAMETER VALUES WILL BE USED:\n"); + printf(" FOR N"); + for (i__ =1; i__ <=nidim;++i__) printf(" %d",idim[i__-1]); + printf("\n"); + printf(" FOR ALPHA"); + for (i__ =1; i__ <=nalf;++i__) printf(" (%f,%f)",alf[i__-1].r,alf[i__-1].i); + printf("\n"); + printf(" FOR BETA"); + for (i__ =1; i__ <=nbet;++i__) printf(" (%f,%f)",bet[i__-1].r,bet[i__-1].i); + printf("\n"); + if (! tsterr) { - s_wsle(&io___38); - e_wsle(); - s_wsfe(&io___39); - e_wsfe(); + printf(" ERROR-EXITS WILL NOT BE TESTED\n"); } - s_wsle(&io___40); - e_wsle(); - s_wsfe(&io___41); - do_fio(&c__1, (char *)&thresh, (ftnlen)sizeof(real)); - e_wsfe(); - s_wsle(&io___42); - e_wsle(); + printf("ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LESS THAN %f\n",thresh); rorder = FALSE_; corder = FALSE_; if (layout == 2) { rorder = TRUE_; corder = TRUE_; - s_wsfe(&io___45); - e_wsfe(); + printf("COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED\n"); } else if (layout == 1) { rorder = TRUE_; - s_wsfe(&io___46); - e_wsfe(); + printf("ROW-MAJOR DATA LAYOUT IS TESTED\n"); } else if (layout == 0) { corder = TRUE_; - s_wsfe(&io___47); - e_wsfe(); + printf("COLUMN-MAJOR DATA LAYOUT IS TESTED\n"); } - s_wsle(&io___48); - e_wsle(); /* Read names of subroutines and flags which indicate */ /* whether they are to be tested. */ @@ -924,42 +536,33 @@ static logical c_false = FALSE_; /* L20: */ } L30: - i__1 = s_rsfe(&io___50); - if (i__1 != 0) { + if (! fgets(line,80,stdin)) { goto L60; } - i__1 = do_fio(&c__1, snamet, (ftnlen)13); - if (i__1 != 0) { - goto L60; - } - i__1 = do_fio(&c__1, (char *)<estt, (ftnlen)sizeof(logical)); - if (i__1 != 0) { - goto L60; - } - i__1 = e_rsfe(); - if (i__1 != 0) { + i__1 = sscanf(line,"%12c %c",snamet,&tmpchar); + ltestt=FALSE_; + if (tmpchar=='T')ltestt=TRUE_; + if (i__1 < 2) { goto L60; } for (i__ = 1; i__ <= 9; ++i__) { - if (s_cmp(snamet, snames + (i__ - 1) * 13, (ftnlen)13, (ftnlen)13) == + if (s_cmp(snamet, snames[i__ - 1] , (ftnlen)12, (ftnlen)12) == 0) { goto L50; } /* L40: */ } - s_wsfe(&io___53); - do_fio(&c__1, snamet, (ftnlen)13); - e_wsfe(); - s_stop("", (ftnlen)0); + printf("SUBPROGRAM NAME %s NOT RECOGNIZED\n****** TESTS ABANDONED ******\n",snamet); + exit(1); L50: ltest[i__ - 1] = ltestt; goto L30; L60: - cl__1.cerr = 0; +/* cl__1.cerr = 0; cl__1.cunit = 5; cl__1.csta = 0; - f_clos(&cl__1); + f_clos(&cl__1);*/ /* Compute EPS (the machine precision). */ @@ -973,9 +576,7 @@ L70: goto L70; L80: eps += eps; - s_wsfe(&io___55); - do_fio(&c__1, (char *)&eps, (ftnlen)sizeof(real)); - e_wsfe(); + printf("RELATIVE MACHINE PRECISION IS TAKEN TO BE %9.1g\n",eps); /* Check the reliability of CMMCH using exact data. */ @@ -1015,13 +616,12 @@ L80: &c__6, &c_true); same = lce_(cc, ct, &n); if (! same || err != 0.f) { - s_wsfe(&io___68); - do_fio(&c__1, transa, (ftnlen)1); - do_fio(&c__1, transb, (ftnlen)1); - do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical)); - do_fio(&c__1, (char *)&err, (ftnlen)sizeof(real)); - e_wsfe(); - s_stop("", (ftnlen)0); + printf("ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n"); + printf("CMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb); + printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err); + printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n"); + printf("****** TESTS ABANDONED ******\n"); + exit(1); } *(unsigned char *)transb = 'C'; cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & @@ -1029,13 +629,12 @@ L80: &c__6, &c_true); same = lce_(cc, ct, &n); if (! same || err != 0.f) { - s_wsfe(&io___69); - do_fio(&c__1, transa, (ftnlen)1); - do_fio(&c__1, transb, (ftnlen)1); - do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical)); - do_fio(&c__1, (char *)&err, (ftnlen)sizeof(real)); - e_wsfe(); - s_stop("", (ftnlen)0); + printf("ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n"); + printf("CMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb); + printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err); + printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n"); + printf("****** TESTS ABANDONED ******\n"); + exit(1); } i__1 = n; for (j = 1; j <= i__1; ++j) { @@ -1061,13 +660,12 @@ L80: &c__6, &c_true); same = lce_(cc, ct, &n); if (! same || err != 0.f) { - s_wsfe(&io___70); - do_fio(&c__1, transa, (ftnlen)1); - do_fio(&c__1, transb, (ftnlen)1); - do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical)); - do_fio(&c__1, (char *)&err, (ftnlen)sizeof(real)); - e_wsfe(); - s_stop("", (ftnlen)0); + printf("ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n"); + printf("CMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb); + printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err); + printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n"); + printf("****** TESTS ABANDONED ******\n"); + exit(1); } *(unsigned char *)transb = 'C'; cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & @@ -1075,33 +673,26 @@ L80: &c__6, &c_true); same = lce_(cc, ct, &n); if (! same || err != 0.f) { - s_wsfe(&io___71); - do_fio(&c__1, transa, (ftnlen)1); - do_fio(&c__1, transb, (ftnlen)1); - do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical)); - do_fio(&c__1, (char *)&err, (ftnlen)sizeof(real)); - e_wsfe(); - s_stop("", (ftnlen)0); + printf("ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n"); + printf("CMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb); + printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err); + printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n"); + printf("****** TESTS ABANDONED ******\n"); + exit(1); } /* Test each subroutine in turn. */ for (isnum = 1; isnum <= 9; ++isnum) { - s_wsle(&io___73); - e_wsle(); if (! ltest[isnum - 1]) { /* Subprogram is not to be tested. */ - s_wsfe(&io___74); - do_fio(&c__1, snames + (isnum - 1) * 13, (ftnlen)13); - e_wsfe(); + printf("%12s WAS NOT TESTED\n",snames[isnum-1]); } else { - s_copy(srnamc_1.srnamt, snames + (isnum - 1) * 13, (ftnlen)13, ( - ftnlen)13); + s_copy(srnamc_1.srnamt, snames[isnum - 1], (ftnlen)12, ( + ftnlen)12); /* Test error exits. */ if (tsterr) { - cc3chke_(snames + (isnum - 1) * 13); - s_wsle(&io___75); - e_wsle(); + cc3chke_(snames[isnum - 1]); } /* Test computations. */ infoc_1.infot = 0; @@ -1121,13 +712,13 @@ L80: /* Test CGEMM, 01. */ L140: if (corder) { - cchk1_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + cchk1_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, &c__0); } if (rorder) { - cchk1_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + cchk1_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, &c__1); @@ -1136,13 +727,13 @@ L140: /* Test CHEMM, 02, CSYMM, 03. */ L150: if (corder) { - cchk2_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + cchk2_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, &c__0); } if (rorder) { - cchk2_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + cchk2_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, &c__1); @@ -1151,13 +742,13 @@ L150: /* Test CTRMM, 04, CTRSM, 05. */ L160: if (corder) { - cchk3_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + cchk3_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, & c__0); } if (rorder) { - cchk3_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + cchk3_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, & c__1); @@ -1166,13 +757,13 @@ L160: /* Test CHERK, 06, CSYRK, 07. */ L170: if (corder) { - cchk4_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + cchk4_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, &c__0); } if (rorder) { - cchk4_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + cchk4_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, &c__1); @@ -1181,13 +772,13 @@ L170: /* Test CHER2K, 08, CSYR2K, 09. */ L180: if (corder) { - cchk5_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + cchk5_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, ct, g, w, &c__0); } if (rorder) { - cchk5_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + cchk5_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, ct, g, w, &c__1); @@ -1201,32 +792,29 @@ L190: } /* L200: */ } - s_wsfe(&io___82); - e_wsfe(); + printf("\nEND OF TESTS\n"); goto L230; L210: - s_wsfe(&io___83); - e_wsfe(); + printf("\n****** FATAL ERROR - TESTS ABANDONED ******\n"); goto L230; L220: - s_wsfe(&io___84); - e_wsfe(); - + printf("AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM\n"); + printf("****** TESTS ABANDONED ******\n"); L230: if (trace) { - cl__1.cerr = 0; +/* cl__1.cerr = 0; cl__1.cunit = ntra; cl__1.csta = 0; - f_clos(&cl__1); + f_clos(&cl__1);*/ } - cl__1.cerr = 0; +/* cl__1.cerr = 0; cl__1.cunit = 6; cl__1.csta = 0; f_clos(&cl__1); - s_stop("", (ftnlen)0); - + s_stop("", (ftnlen)0);*/ + exit(0); /* End of CBLAT3. */ @@ -1244,30 +832,9 @@ L230: static char ich[3] = "NTC"; - /* Format strings */ - static char fmt_9994[] = "(\002 ****** FATAL ERROR - ERROR-CALL MYEXIT T" - "AKEN ON VALID\002,\002 CALL ******\002)"; - static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" - " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; - static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" - "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " - " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" - "BER:\002)"; - /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8; - alist al__1; /* Local variables */ complex beta; @@ -1288,7 +855,11 @@ L230: extern /* Subroutine */ int cprcn1_(integer *, integer *, char *, integer *, char *, char *, integer *, integer *, integer *, complex *, integer *, integer *, complex *, integer *); - integer ia, ib, ma, mb, na, nb, nc, ik, im, in, ks, ms, ns; + integer ia, ib, ma, mb, na, nb, nc, ik, im, in; + extern /* Subroutine */ int ccgemm3m_(integer *, char *, char *, integer *, + integer *, integer *, complex *, complex *, integer *, complex *, + integer *, complex *, complex *, integer *); + integer ks, ms, ns; extern logical lceres_(char *, char *, integer *, integer *, complex *, complex *, integer *); char tranas[1], tranbs[1], transa[1], transb[1]; @@ -1297,20 +868,6 @@ L230: extern logical lce_(complex *, complex *, integer *); complex als, bls; real err; - extern /* Subroutine */ int ccgemm3m_(integer *, char *, char *, integer * - , integer *, integer *, complex *, complex *, integer *, complex * - , integer *, complex *, complex *, integer *); - - /* Fortran I/O blocks */ - static cilist io___128 = { 0, 0, 0, fmt_9994, 0 }; - static cilist io___131 = { 0, 0, 0, fmt_9998, 0 }; - static cilist io___133 = { 0, 0, 0, fmt_10000, 0 }; - static cilist io___134 = { 0, 0, 0, fmt_10001, 0 }; - static cilist io___135 = { 0, 0, 0, fmt_10002, 0 }; - static cilist io___136 = { 0, 0, 0, fmt_10003, 0 }; - static cilist io___137 = { 0, 0, 0, fmt_9996, 0 }; - - /* Tests CGEMM. */ @@ -1497,20 +1054,21 @@ L230: &ldb, &beta, &ldc); } if (*rewi) { - al__1.aerr = 0; +/* al__1.aerr = 0; al__1.aunit = *ntra; - f_rew(&al__1); + f_rew(&al__1); */ } - ccgemm3m_(iorder, transa, transb, &m, &n, &k, - &alpha, &aa[1], &lda, &bb[1], &ldb, & + ccgemm3m_(iorder, transa, transb, &m, &n, &k, & + alpha, &aa[1], &lda, &bb[1], &ldb, & beta, &cc[1], &ldc); /* Check if error-exit was taken incorrectly. */ if (! infoc_1.ok) { - io___128.ciunit = *nout; - s_wsfe(&io___128); - e_wsfe(); +// io___128.ciunit = *nout; +// s_wsfe(&io___128); +// e_wsfe(); + printf(" *** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); *fatal = TRUE_; goto L120; } @@ -1548,11 +1106,7 @@ L230: for (i__ = 1; i__ <= i__6; ++i__) { same = same && isame[i__ - 1]; if (! isame[i__ - 1]) { - io___131.ciunit = *nout; - s_wsfe(&io___131); - do_fio(&c__1, (char *)&i__, (ftnlen) - sizeof(integer)); - e_wsfe(); + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__);; } /* L40: */ } @@ -1606,51 +1160,34 @@ L100: if (errmax < *thresh) { if (*iorder == 0) { - io___133.ciunit = *nout; - s_wsfe(&io___133); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } if (*iorder == 1) { - io___134.ciunit = *nout; - s_wsfe(&io___134); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } } else { if (*iorder == 0) { - io___135.ciunit = *nout; - s_wsfe(&io___135); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); - e_wsfe(); + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } if (*iorder == 1) { - io___136.ciunit = *nout; - s_wsfe(&io___136); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); - e_wsfe(); + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } } goto L130; L120: - io___137.ciunit = *nout; - s_wsfe(&io___137); - do_fio(&c__1, sname, (ftnlen)13); - e_wsfe(); + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); cprcn1_(nout, &nc, sname, iorder, transa, transb, &m, &n, &k, &alpha, & lda, &ldb, &beta, &ldc); L130: return 0; -/* L9995: */ +/* 9995 FORMAT( 1X, I6, ': ', A12,'(''', A1, ''',''', A1, ''',', */ +/* $ 3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, */ +/* $ ',(', F4.1, ',', F4.1, '), C,', I3, ').' ) */ /* End of CCHK1. */ @@ -1662,21 +1199,9 @@ L130: k, complex *alpha, integer *lda, integer *ldb, complex *beta, integer *ldc) { - /* Format strings */ - static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a14,\002,\002," - "a14,\002,\002,a14,\002,\002)"; - static char fmt_9994[] = "(10x,3(i3,\002,\002),\002 (\002,f4.1,\002,\002" - ",f4.1,\002) , A,\002,i3,\002, B,\002,i3,\002, (\002,f4.1,\002" - ",\002,f4.1,\002) , C,\002,i3,\002).\002)"; - /* Local variables */ char crc[14], cta[14], ctb[14]; - /* Fortran I/O blocks */ - static cilist io___141 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___142 = { 0, 0, 0, fmt_9994, 0 }; - - if (*(unsigned char *)transa == 'N') { s_copy(cta, " CblasNoTrans", (ftnlen)14, (ftnlen)14); } else if (*(unsigned char *)transa == 'T') { @@ -1696,25 +1221,8 @@ L130: } else { s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); } - io___141.ciunit = *nout; - s_wsfe(&io___141); - do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, crc, (ftnlen)14); - do_fio(&c__1, cta, (ftnlen)14); - do_fio(&c__1, ctb, (ftnlen)14); - e_wsfe(); - io___142.ciunit = *nout; - s_wsfe(&io___142); - do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real)); - do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(real)); - do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cta,ctb); + printf("%d %d %d (%4.1f,%4.1f) , A, %d, B, %d, (%4.1f,%4.1f) , C, %d.\n",*m,*n,*k,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc); return 0; } /* cprcn1_ */ @@ -1731,30 +1239,9 @@ L130: static char ichs[2] = "LR"; static char ichu[2] = "UL"; - /* Format strings */ - static char fmt_9994[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " - "TAKEN ON VALID\002,\002 CALL *******\002)"; - static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" - " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; - static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" - "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " - " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" - "BER:\002)"; - /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; - alist al__1; /* Local variables */ complex beta; @@ -1798,17 +1285,6 @@ L130: integer icu; real err; - /* Fortran I/O blocks */ - static cilist io___181 = { 0, 0, 0, fmt_9994, 0 }; - static cilist io___184 = { 0, 0, 0, fmt_9998, 0 }; - static cilist io___186 = { 0, 0, 0, fmt_10000, 0 }; - static cilist io___187 = { 0, 0, 0, fmt_10001, 0 }; - static cilist io___188 = { 0, 0, 0, fmt_10002, 0 }; - static cilist io___189 = { 0, 0, 0, fmt_10003, 0 }; - static cilist io___190 = { 0, 0, 0, fmt_9996, 0 }; - - - /* Tests CHEMM and CSYMM. */ /* Auxiliary routine for test program for Level 3 Blas. */ @@ -1974,9 +1450,9 @@ L130: ; } if (*rewi) { - al__1.aerr = 0; +/* al__1.aerr = 0; al__1.aunit = *ntra; - f_rew(&al__1); + f_rew(&al__1);*/ } if (conj) { cchemm_(iorder, side, uplo, &m, &n, &alpha, & @@ -1991,9 +1467,7 @@ L130: /* Check if error-exit was taken incorrectly. */ if (! infoc_1.ok) { - io___181.ciunit = *nout; - s_wsfe(&io___181); - e_wsfe(); + printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); *fatal = TRUE_; goto L110; } @@ -2028,11 +1502,7 @@ L130: for (i__ = 1; i__ <= i__5; ++i__) { same = same && isame[i__ - 1]; if (! isame[i__ - 1]) { - io___184.ciunit = *nout; - s_wsfe(&io___184); - do_fio(&c__1, (char *)&i__, (ftnlen) - sizeof(integer)); - e_wsfe(); + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); } /* L40: */ } @@ -2090,51 +1560,34 @@ L90: if (errmax < *thresh) { if (*iorder == 0) { - io___186.ciunit = *nout; - s_wsfe(&io___186); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } if (*iorder == 1) { - io___187.ciunit = *nout; - s_wsfe(&io___187); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } } else { if (*iorder == 0) { - io___188.ciunit = *nout; - s_wsfe(&io___188); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); - e_wsfe(); + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } if (*iorder == 1) { - io___189.ciunit = *nout; - s_wsfe(&io___189); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); - e_wsfe(); + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } } goto L120; L110: - io___190.ciunit = *nout; - s_wsfe(&io___190); - do_fio(&c__1, sname, (ftnlen)13); - e_wsfe(); + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); cprcn2_(nout, &nc, sname, iorder, side, uplo, &m, &n, &alpha, &lda, &ldb, &beta, &ldc); L120: return 0; -/* L9995: */ +/* 9995 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, */ +/* $ ',', F4.1, '), C,', I3, ') .' ) */ /* End of CCHK2. */ @@ -2145,21 +1598,9 @@ L120: *iorder, char *side, char *uplo, integer *m, integer *n, complex * alpha, integer *lda, integer *ldb, complex *beta, integer *ldc) { - /* Format strings */ - static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a14,\002,\002," - "a14,\002,\002,a14,\002,\002)"; - static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002" - ",f4.1,\002), A,\002,i3,\002, B,\002,i3,\002, (\002,f4.1,\002," - "\002,f4.1,\002), \002,\002C,\002,i3,\002).\002)"; - /* Local variables */ char cs[14], cu[14], crc[14]; - /* Fortran I/O blocks */ - static cilist io___194 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___195 = { 0, 0, 0, fmt_9994, 0 }; - - if (*(unsigned char *)side == 'L') { s_copy(cs, " CblasLeft", (ftnlen)14, (ftnlen)14); } else { @@ -2175,24 +1616,8 @@ L120: } else { s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); } - io___194.ciunit = *nout; - s_wsfe(&io___194); - do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, crc, (ftnlen)14); - do_fio(&c__1, cs, (ftnlen)14); - do_fio(&c__1, cu, (ftnlen)14); - e_wsfe(); - io___195.ciunit = *nout; - s_wsfe(&io___195); - do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real)); - do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(real)); - do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cs,cu); + printf("%d %d (%4.1f,%4.1f) , A, %d, B, %d, (%4.1f,%4.1f) , C, %d.\n",*m,*n,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc); return 0; } /* cprcn2_ */ @@ -2210,31 +1635,10 @@ L120: static char ichd[2] = "UN"; static char ichs[2] = "LR"; - /* Format strings */ - static char fmt_9994[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " - "TAKEN ON VALID\002,\002 CALL *******\002)"; - static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" - " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; - static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" - "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " - " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" - "BER:\002)"; - /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; complex q__1; - alist al__1; /* Local variables */ char diag[1]; @@ -2279,17 +1683,6 @@ L120: integer ict, icu; real err; - /* Fortran I/O blocks */ - static cilist io___236 = { 0, 0, 0, fmt_9994, 0 }; - static cilist io___239 = { 0, 0, 0, fmt_9998, 0 }; - static cilist io___241 = { 0, 0, 0, fmt_10000, 0 }; - static cilist io___242 = { 0, 0, 0, fmt_10001, 0 }; - static cilist io___243 = { 0, 0, 0, fmt_10002, 0 }; - static cilist io___244 = { 0, 0, 0, fmt_10003, 0 }; - static cilist io___245 = { 0, 0, 0, fmt_9996, 0 }; - - - /* Tests CTRMM and CTRSM. */ /* Auxiliary routine for test program for Level 3 Blas. */ @@ -2444,14 +1837,14 @@ L120: if (*trace) { cprcn3_(ntra, &nc, sname, iorder, side, uplo, transa, diag, &m, - &n, &alpha, &lda, &ldb, ( - ftnlen)13, (ftnlen)1, (ftnlen) - 1, (ftnlen)1, (ftnlen)1); + &n, &alpha, &lda, &ldb/*, ( + ftnlen)12, (ftnlen)1, (ftnlen) + 1, (ftnlen)1, (ftnlen)1*/); } if (*rewi) { - al__1.aerr = 0; +/* al__1.aerr = 0; al__1.aunit = *ntra; - f_rew(&al__1); + f_rew(&al__1);*/ } cctrmm_(iorder, side, uplo, transa, diag, &m, &n, &alpha, &aa[1], &lda, &bb[ @@ -2461,14 +1854,14 @@ L120: if (*trace) { cprcn3_(ntra, &nc, sname, iorder, side, uplo, transa, diag, &m, - &n, &alpha, &lda, &ldb, ( - ftnlen)13, (ftnlen)1, (ftnlen) - 1, (ftnlen)1, (ftnlen)1); + &n, &alpha, &lda, &ldb/*, ( + ftnlen)12, (ftnlen)1, (ftnlen) + 1, (ftnlen)1, (ftnlen)1*/); } if (*rewi) { - al__1.aerr = 0; +/* al__1.aerr = 0; al__1.aunit = *ntra; - f_rew(&al__1); + f_rew(&al__1);*/ } cctrsm_(iorder, side, uplo, transa, diag, &m, &n, &alpha, &aa[1], &lda, &bb[ @@ -2478,9 +1871,7 @@ L120: /* Check if error-exit was taken incorrectly. */ if (! infoc_1.ok) { - io___236.ciunit = *nout; - s_wsfe(&io___236); - e_wsfe(); + printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); *fatal = TRUE_; goto L150; } @@ -2517,11 +1908,7 @@ L120: for (i__ = 1; i__ <= i__4; ++i__) { same = same && isame[i__ - 1]; if (! isame[i__ - 1]) { - io___239.ciunit = *nout; - s_wsfe(&io___239); - do_fio(&c__1, (char *)&i__, (ftnlen) - sizeof(integer)); - e_wsfe(); + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); } /* L50: */ } @@ -2543,8 +1930,8 @@ L120: c_b1, &c__[c_offset], nmax, &ct[1], &g[1], &bb[ 1], &ldb, eps, &err, - fatal, nout, &c_true, ( - ftnlen)1, (ftnlen)1); + fatal, nout, &c_true/*, ( + ftnlen)1, (ftnlen)1*/); } else { cmmch_("N", transa, &m, &n, &n, & alpha, &b[b_offset], nmax, @@ -2631,44 +2018,25 @@ L130: if (errmax < *thresh) { if (*iorder == 0) { - io___241.ciunit = *nout; - s_wsfe(&io___241); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } if (*iorder == 1) { - io___242.ciunit = *nout; - s_wsfe(&io___242); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } } else { if (*iorder == 0) { - io___243.ciunit = *nout; - s_wsfe(&io___243); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); - e_wsfe(); + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } if (*iorder == 1) { - io___244.ciunit = *nout; - s_wsfe(&io___244); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); - e_wsfe(); + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } } goto L160; L150: - io___245.ciunit = *nout; - s_wsfe(&io___245); - do_fio(&c__1, sname, (ftnlen)13); - e_wsfe(); + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); if (*trace) { cprcn3_(ntra, &nc, sname, iorder, side, uplo, transa, diag, &m, &n, & alpha, &lda, &ldb); @@ -2677,7 +2045,9 @@ L150: L160: return 0; -/* L9995: */ +/* 9995 FORMAT(1X, I6, ': ', A12,'(', 4( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ', */ +/* $ ' .' ) */ /* End of CCHK3. */ @@ -2688,21 +2058,9 @@ L160: *iorder, char *side, char *uplo, char *transa, char *diag, integer *m, integer *n, complex *alpha, integer *lda, integer *ldb) { - /* Format strings */ - static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a14,\002,\002," - "a14,\002,\002,a14,\002,\002)"; - static char fmt_9994[] = "(10x,2(a14,\002,\002),2(i3,\002,\002),\002 " - "(\002,f4.1,\002,\002,f4.1,\002), A,\002,i3,\002, B,\002,i3,\002)." - "\002)"; - /* Local variables */ char ca[14], cd[14], cs[14], cu[14], crc[14]; - /* Fortran I/O blocks */ - static cilist io___251 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___252 = { 0, 0, 0, fmt_9994, 0 }; - - if (*(unsigned char *)side == 'L') { s_copy(cs, " CblasLeft", (ftnlen)14, (ftnlen)14); } else { @@ -2730,24 +2088,9 @@ L160: } else { s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); } - io___251.ciunit = *nout; - s_wsfe(&io___251); - do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, crc, (ftnlen)14); - do_fio(&c__1, cs, (ftnlen)14); - do_fio(&c__1, cu, (ftnlen)14); - e_wsfe(); - io___252.ciunit = *nout; - s_wsfe(&io___252); - do_fio(&c__1, ca, (ftnlen)14); - do_fio(&c__1, cd, (ftnlen)14); - do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real)); - do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cs,cu); + printf(" %s %s %d %d (%4.1f,%4.1f) A %d B %d\n",ca,cd,*m,*n,alpha->r,alpha->i,*lda,*ldb); + return 0; } /* cprcn3_ */ @@ -2764,33 +2107,10 @@ L160: static char icht[2] = "NC"; static char ichu[2] = "UL"; - /* Format strings */ - static char fmt_9992[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " - "TAKEN ON VALID\002,\002 CALL *******\002)"; - static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" - " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; - static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" - "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " - " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_9995[] = "(\002 THESE ARE THE RESULTS FOR COLUMN" - " \002,i3)"; - static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" - "BER:\002)"; - /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; complex q__1; - alist al__1; /* Local variables */ complex beta; @@ -2841,18 +2161,6 @@ L160: integer ict, icu; real err; - /* Fortran I/O blocks */ - static cilist io___294 = { 0, 0, 0, fmt_9992, 0 }; - static cilist io___297 = { 0, 0, 0, fmt_9998, 0 }; - static cilist io___304 = { 0, 0, 0, fmt_10000, 0 }; - static cilist io___305 = { 0, 0, 0, fmt_10001, 0 }; - static cilist io___306 = { 0, 0, 0, fmt_10002, 0 }; - static cilist io___307 = { 0, 0, 0, fmt_10003, 0 }; - static cilist io___308 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___309 = { 0, 0, 0, fmt_9996, 0 }; - - - /* Tests CHERK and CSYRK. */ /* Auxiliary routine for test program for Level 3 Blas. */ @@ -2892,6 +2200,8 @@ L160: nc = 0; reset = TRUE_; errmax = 0.f; + rals = 1.f; + rbets = 1.f; i__1 = *nidim; for (in = 1; in <= i__1; ++in) { @@ -2965,8 +2275,8 @@ L160: } null = n <= 0; if (conj) { - null = null || (k <= 0 || ralpha == 0.f) && - rbeta == 1.f; + null = null || ((k <= 0 || ralpha == 0.f) && + rbeta == 1.f); } /* Generate the matrix C. */ @@ -3022,9 +2332,9 @@ L160: rbeta, &ldc); } if (*rewi) { - al__1.aerr = 0; +/* al__1.aerr = 0; al__1.aunit = *ntra; - f_rew(&al__1); + f_rew(&al__1);*/ } ccherk_(iorder, uplo, trans, &n, &k, &ralpha, &aa[1], &lda, &rbeta, &cc[1], &ldc); @@ -3035,9 +2345,9 @@ L160: beta, &ldc); } if (*rewi) { - al__1.aerr = 0; +/* al__1.aerr = 0; al__1.aunit = *ntra; - f_rew(&al__1); + f_rew(&al__1);*/ } ccsyrk_(iorder, uplo, trans, &n, &k, &alpha, & aa[1], &lda, &beta, &cc[1], &ldc); @@ -3046,9 +2356,7 @@ L160: /* Check if error-exit was taken incorrectly. */ if (! infoc_1.ok) { - io___294.ciunit = *nout; - s_wsfe(&io___294); - e_wsfe(); + printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); *fatal = TRUE_; goto L120; } @@ -3091,11 +2399,7 @@ L160: for (i__ = 1; i__ <= i__5; ++i__) { same = same && isame[i__ - 1]; if (! isame[i__ - 1]) { - io___297.ciunit = *nout; - s_wsfe(&io___297); - do_fio(&c__1, (char *)&i__, (ftnlen) - sizeof(integer)); - e_wsfe(); + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); } /* L30: */ } @@ -3179,52 +2483,30 @@ L100: if (errmax < *thresh) { if (*iorder == 0) { - io___304.ciunit = *nout; - s_wsfe(&io___304); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } if (*iorder == 1) { - io___305.ciunit = *nout; - s_wsfe(&io___305); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } } else { if (*iorder == 0) { - io___306.ciunit = *nout; - s_wsfe(&io___306); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); - e_wsfe(); + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } if (*iorder == 1) { - io___307.ciunit = *nout; - s_wsfe(&io___307); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); - e_wsfe(); + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } } goto L130; L110: if (n > 1) { - io___308.ciunit = *nout; - s_wsfe(&io___308); - do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer)); - e_wsfe(); + printf(" THESE ARE THE RESULTS FOR COLUMN %d:\n",j); } L120: - io___309.ciunit = *nout; - s_wsfe(&io___309); - do_fio(&c__1, sname, (ftnlen)13); - e_wsfe(); + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); if (conj) { cprcn6_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &ralpha, &lda, &rbeta, &ldc); @@ -3236,8 +2518,12 @@ L120: L130: return 0; -/* L9994: */ -/* L9993: */ +/* 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') ', */ +/* $ ' .' ) */ +/* 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1, */ +/* $ '), C,', I3, ') .' ) */ /* End of CCHK4. */ @@ -3248,21 +2534,9 @@ L130: *iorder, char *uplo, char *transa, integer *n, integer *k, complex * alpha, integer *lda, complex *beta, integer *ldc) { - /* Format strings */ - static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a14,\002," - "\002))"; - static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002" - ",f4.1,\002), A,\002,i3,\002, (\002,f4.1,\002,\002,f4.1,\002), C" - ",\002,i3,\002).\002)"; - /* Local variables */ char ca[14], cu[14], crc[14]; - /* Fortran I/O blocks */ - static cilist io___313 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___314 = { 0, 0, 0, fmt_9994, 0 }; - - if (*(unsigned char *)uplo == 'U') { s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); } else { @@ -3280,23 +2554,8 @@ L130: } else { s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); } - io___313.ciunit = *nout; - s_wsfe(&io___313); - do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, crc, (ftnlen)14); - do_fio(&c__1, cu, (ftnlen)14); - do_fio(&c__1, ca, (ftnlen)14); - e_wsfe(); - io___314.ciunit = *nout; - s_wsfe(&io___314); - do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real)); - do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(real)); - do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca); + printf("( %d %d (%4.1f,%4.1f) A %d (%4.1f,%4.1f) C %d\n",*n,*k,alpha->r,alpha->i,*lda,beta->r,beta->i,*ldc); return 0; } /* cprcn4_ */ @@ -3306,20 +2565,9 @@ L130: *iorder, char *uplo, char *transa, integer *n, integer *k, real * alpha, integer *lda, real *beta, integer *ldc) { - /* Format strings */ - static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a14,\002," - "\002))"; - static char fmt_9994[] = "(10x,2(i3,\002,\002),f4.1,\002, A,\002,i3" - ",\002,\002,f4.1,\002, C,\002,i3,\002).\002)"; - /* Local variables */ char ca[14], cu[14], crc[14]; - /* Fortran I/O blocks */ - static cilist io___318 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___319 = { 0, 0, 0, fmt_9994, 0 }; - - if (*(unsigned char *)uplo == 'U') { s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); } else { @@ -3337,23 +2585,8 @@ L130: } else { s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); } - io___318.ciunit = *nout; - s_wsfe(&io___318); - do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, crc, (ftnlen)14); - do_fio(&c__1, cu, (ftnlen)14); - do_fio(&c__1, ca, (ftnlen)14); - e_wsfe(); - io___319.ciunit = *nout; - s_wsfe(&io___319); - do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*alpha), (ftnlen)sizeof(real)); - do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*beta), (ftnlen)sizeof(real)); - do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca); + printf("( %d %d %4.1f A %d %4.1f C %d\n",*n,*k,*alpha,*lda,*beta,*ldc); return 0; } /* cprcn6_ */ @@ -3370,32 +2603,10 @@ L130: static char icht[2] = "NC"; static char ichu[2] = "UL"; - /* Format strings */ - static char fmt_9992[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " - "TAKEN ON VALID\002,\002 CALL *******\002)"; - static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" - " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; - static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" - "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " - " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_9995[] = "(\002 THESE ARE THE RESULTS FOR COLUMN" - " \002,i3)"; - static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" - "BER:\002)"; /* System generated locals */ integer c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8; complex q__1, q__2; - alist al__1; /* Local variables */ integer jjab; @@ -3444,18 +2655,6 @@ L130: integer ict, icu; real err; - /* Fortran I/O blocks */ - static cilist io___362 = { 0, 0, 0, fmt_9992, 0 }; - static cilist io___365 = { 0, 0, 0, fmt_9998, 0 }; - static cilist io___373 = { 0, 0, 0, fmt_10000, 0 }; - static cilist io___374 = { 0, 0, 0, fmt_10001, 0 }; - static cilist io___375 = { 0, 0, 0, fmt_10002, 0 }; - static cilist io___376 = { 0, 0, 0, fmt_10003, 0 }; - static cilist io___377 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___378 = { 0, 0, 0, fmt_9996, 0 }; - - - /* Tests CHER2K and CSYR2K. */ /* Auxiliary routine for test program for Level 3 Blas. */ @@ -3578,8 +2777,8 @@ L130: } null = n <= 0; if (conj) { - null = null || (k <= 0 || alpha.r == 0.f && - alpha.i == 0.f) && rbeta == 1.f; + null = null || ((k <= 0 || (alpha.r == 0.f && + alpha.i == 0.f)) && rbeta == 1.f); } /* Generate the matrix C. */ @@ -3640,9 +2839,9 @@ L130: &rbeta, &ldc); } if (*rewi) { - al__1.aerr = 0; +/* al__1.aerr = 0; al__1.aunit = *ntra; - f_rew(&al__1); + f_rew(&al__1);*/ } ccher2k_(iorder, uplo, trans, &n, &k, &alpha, &aa[1], &lda, &bb[1], &ldb, &rbeta, & @@ -3654,9 +2853,9 @@ L130: &beta, &ldc); } if (*rewi) { - al__1.aerr = 0; +/* al__1.aerr = 0; al__1.aunit = *ntra; - f_rew(&al__1); + f_rew(&al__1);*/ } ccsyr2k_(iorder, uplo, trans, &n, &k, &alpha, &aa[1], &lda, &bb[1], &ldb, &beta, & @@ -3666,9 +2865,7 @@ L130: /* Check if error-exit was taken incorrectly. */ if (! infoc_1.ok) { - io___362.ciunit = *nout; - s_wsfe(&io___362); - e_wsfe(); + printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); *fatal = TRUE_; goto L150; } @@ -3708,11 +2905,7 @@ L130: for (i__ = 1; i__ <= i__5; ++i__) { same = same && isame[i__ - 1]; if (! isame[i__ - 1]) { - io___365.ciunit = *nout; - s_wsfe(&io___365); - do_fio(&c__1, (char *)&i__, (ftnlen) - sizeof(integer)); - e_wsfe(); + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); } /* L40: */ } @@ -3745,7 +2938,7 @@ L130: i__6 = k; for (i__ = 1; i__ <= i__6; ++i__) { i__7 = i__; - i__8 = (j - 1 << 1) * *nmax + k + + i__8 = ((j - 1) << 1) * *nmax + k + i__; q__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8].i, @@ -3757,14 +2950,14 @@ L130: if (conj) { i__7 = k + i__; r_cnjg(&q__2, &alpha); - i__8 = (j - 1 << 1) * *nmax + i__; + i__8 = ((j - 1) << 1) * *nmax + i__; q__1.r = q__2.r * ab[i__8].r - q__2.i * ab[i__8].i, q__1.i = q__2.r * ab[i__8].i + q__2.i * ab[ i__8].r; w[i__7].r = q__1.r, w[i__7].i = q__1.i; } else { i__7 = k + i__; - i__8 = (j - 1 << 1) * *nmax + i__; + i__8 = ((j - 1) << 1) * *nmax + i__; q__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] .i, q__1.i = alpha.r * ab[i__8].i + alpha.i * ab[i__8].r; @@ -3865,52 +3058,30 @@ L130: if (errmax < *thresh) { if (*iorder == 0) { - io___373.ciunit = *nout; - s_wsfe(&io___373); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } if (*iorder == 1) { - io___374.ciunit = *nout; - s_wsfe(&io___374); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } } else { if (*iorder == 0) { - io___375.ciunit = *nout; - s_wsfe(&io___375); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); - e_wsfe(); + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } if (*iorder == 1) { - io___376.ciunit = *nout; - s_wsfe(&io___376); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); - e_wsfe(); + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } } goto L160; L140: if (n > 1) { - io___377.ciunit = *nout; - s_wsfe(&io___377); - do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer)); - e_wsfe(); + printf(" THESE ARE THE RESULTS FOR COLUMN %d:\n",j); } L150: - io___378.ciunit = *nout; - s_wsfe(&io___378); - do_fio(&c__1, sname, (ftnlen)13); - e_wsfe(); + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); if (conj) { cprcn7_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, & ldb, &rbeta, &ldc); @@ -3922,8 +3093,12 @@ L150: L160: return 0; -/* L9994: */ -/* L9993: */ +/* 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, */ +/* $ ', C,', I3, ') .' ) */ +/* 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, */ +/* $ ',', F4.1, '), C,', I3, ') .' ) */ /* End of CCHK5. */ @@ -3934,21 +3109,10 @@ L160: *iorder, char *uplo, char *transa, integer *n, integer *k, complex * alpha, integer *lda, integer *ldb, complex *beta, integer *ldc) { - /* Format strings */ - static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a14,\002," - "\002))"; - static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002" - ",f4.1,\002), A,\002,i3,\002, B\002,i3,\002, (\002,f4.1,\002,\002" - ",f4.1,\002), C,\002,i3,\002).\002)"; /* Local variables */ char ca[14], cu[14], crc[14]; - /* Fortran I/O blocks */ - static cilist io___382 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___383 = { 0, 0, 0, fmt_9994, 0 }; - - if (*(unsigned char *)uplo == 'U') { s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); } else { @@ -3966,24 +3130,8 @@ L160: } else { s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); } - io___382.ciunit = *nout; - s_wsfe(&io___382); - do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, crc, (ftnlen)14); - do_fio(&c__1, cu, (ftnlen)14); - do_fio(&c__1, ca, (ftnlen)14); - e_wsfe(); - io___383.ciunit = *nout; - s_wsfe(&io___383); - do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real)); - do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(real)); - do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca); + printf("%d %d (%4.1f,%4.1f) , A, %d, B, %d, (%4.1f,%4.1f) , C, %d.\n",*n,*k,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc); return 0; } /* cprcn5_ */ @@ -3993,21 +3141,10 @@ L160: *iorder, char *uplo, char *transa, integer *n, integer *k, complex * alpha, integer *lda, integer *ldb, real *beta, integer *ldc) { - /* Format strings */ - static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a14,\002," - "\002))"; - static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002" - ",f4.1,\002), A,\002,i3,\002, B\002,i3,\002,\002,f4.1,\002, C," - "\002,i3,\002).\002)"; /* Local variables */ char ca[14], cu[14], crc[14]; - /* Fortran I/O blocks */ - static cilist io___387 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___388 = { 0, 0, 0, fmt_9994, 0 }; - - if (*(unsigned char *)uplo == 'U') { s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); } else { @@ -4025,24 +3162,8 @@ L160: } else { s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); } - io___387.ciunit = *nout; - s_wsfe(&io___387); - do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, crc, (ftnlen)14); - do_fio(&c__1, cu, (ftnlen)14); - do_fio(&c__1, ca, (ftnlen)14); - e_wsfe(); - io___388.ciunit = *nout; - s_wsfe(&io___388); - do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real)); - do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*beta), (ftnlen)sizeof(real)); - do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca); + printf("%d %d (%4.1f,%4.1f), A, %d, B, %d, %4.1f, C, %d.\n",*n,*k,alpha->r,alpha->i,*lda,*ldb,*beta,*ldc); return 0; } /* cprcn7_ */ @@ -4101,7 +3222,7 @@ L160: for (j = 1; j <= i__1; ++j) { i__2 = *m; for (i__ = 1; i__ <= i__2; ++i__) { - if (gen || upper && i__ <= j || lower && i__ >= j) { + if (gen || (upper && i__ <= j) || (lower && i__ >= j)) { i__3 = i__ + j * a_dim1; cbeg_(&q__2, reset); q__1.r = q__2.r + transl->r, q__1.i = q__2.i + transl->i; @@ -4230,15 +3351,6 @@ L160: real *g, complex *cc, integer *ldcc, real *eps, real *err, logical * fatal, integer *nout, logical *mv) { - /* Format strings */ - static char fmt_9999[] = "(\002 ******* FATAL ERROR - COMPUTED RESULT IS" - " LESS THAN HAL\002,\002F ACCURATE *******\002,/\002 " - " EXPECTED RE\002,\002SULT COMPUTED R" - "ESULT\002)"; - static char fmt_9998[] = "(1x,i7,2(\002 (\002,g15.6,\002,\002,g15.6," - "\002)\002))"; - static char fmt_9997[] = "(\002 THESE ARE THE RESULTS FOR COLUMN" - " \002,i3)"; /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, @@ -4251,14 +3363,6 @@ L160: integer i__, j, k; logical trana, tranb, ctrana, ctranb; - /* Fortran I/O blocks */ - static cilist io___409 = { 0, 0, 0, fmt_9999, 0 }; - static cilist io___410 = { 0, 0, 0, fmt_9998, 0 }; - static cilist io___411 = { 0, 0, 0, fmt_9998, 0 }; - static cilist io___412 = { 0, 0, 0, fmt_9997, 0 }; - - - /* Checks the results of the computational tests. */ /* Auxiliary routine for test program for Level 3 Blas. */ @@ -4595,35 +3699,19 @@ L160: L230: *fatal = TRUE_; - io___409.ciunit = *nout; - s_wsfe(&io___409); - e_wsfe(); + printf(" ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HALF ACCURATE *******\n"); + printf(" EXPECTED RESULT COMPUTED RESULT\n"); i__1 = *m; for (i__ = 1; i__ <= i__1; ++i__) { if (*mv) { - io___410.ciunit = *nout; - s_wsfe(&io___410); - do_fio(&c__1, (char *)&i__, (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&ct[i__], (ftnlen)sizeof(real)); - do_fio(&c__2, (char *)&cc[i__ + j * cc_dim1], (ftnlen)sizeof(real) - ); - e_wsfe(); + printf("%7d (%15.6g,%15.6g) (%15.6g,%15.6g)\n",i__,ct[i__].r,ct[i__].i,cc[i__+j*cc_dim1].r,cc[i__+j*cc_dim1].i); } else { - io___411.ciunit = *nout; - s_wsfe(&io___411); - do_fio(&c__1, (char *)&i__, (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&cc[i__ + j * cc_dim1], (ftnlen)sizeof(real) - ); - do_fio(&c__2, (char *)&ct[i__], (ftnlen)sizeof(real)); - e_wsfe(); + printf("%7d (%15.6g,%15.6g) (%15.6g,%15.6g)\n",i__,cc[i__+j*cc_dim1].r,cc[i__+j*cc_dim1].i,ct[i__].r,ct[i__].i); } /* L240: */ } if (*n > 1) { - io___412.ciunit = *nout; - s_wsfe(&io___412); - do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer)); - e_wsfe(); + printf(" THESE ARE THE RESULTS FOR COLUMN %d\n",j); } L250: @@ -4760,7 +3848,7 @@ logical lceres_(char *type__, char *uplo, integer *m, integer *n, complex *aa, } } -/* L60: */ +/* 60 CONTINUE */ ret_val = TRUE_; goto L80; L70: @@ -4851,4 +3939,4 @@ real sdiff_(real *x, real *y) } /* sdiff_ */ -/* Main program alias */ int cblat3_ () { MAIN__ (); return 0; } +/* Main program alias */ /*int cblat3_ () { MAIN__ (); return 0; }*/ diff --git a/ctest/c_zblat3c_3m.c b/ctest/c_zblat3c_3m.c index 059daccb5..0c76f11e7 100644 --- a/ctest/c_zblat3c_3m.c +++ b/ctest/c_zblat3c_3m.c @@ -10,25 +10,7 @@ #undef I #endif -#if defined(_WIN64) -typedef long long BLASLONG; -typedef unsigned long long BLASULONG; -#else -typedef long BLASLONG; -typedef unsigned long BLASULONG; -#endif - -#ifdef LAPACK_ILP64 -typedef BLASLONG blasint; -#if defined(_WIN64) -#define blasabs(x) llabs(x) -#else -#define blasabs(x) labs(x) -#endif -#else -typedef int blasint; -#define blasabs(x) abs(x) -#endif +#include "common.h" typedef blasint integer; @@ -40,14 +22,11 @@ typedef double doublereal; typedef struct { real r, i; } complex; typedef struct { doublereal r, i; } doublecomplex; #ifdef _MSC_VER -static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} -static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} #else static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} -static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} #endif #define pCf(z) (*_pCf(z)) @@ -247,7 +226,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,259 +239,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i= 0; if (trace) { - o__1.oerr = 0; +/* o__1.oerr = 0; o__1.ounit = ntra; o__1.ofnmlen = 32; o__1.ofnm = snaps; @@ -783,149 +401,119 @@ static logical c_false = FALSE_; o__1.oacc = 0; o__1.ofm = 0; o__1.oblnk = 0; - f_open(&o__1); + f_open(&o__1);*/ } /* Read the flag that directs rewinding of the snapshot file. */ - s_rsle(&io___7); - do_lio(&c__8, &c__1, (char *)&rewi, (ftnlen)sizeof(logical)); - e_rsle(); - rewi = rewi && trace; + fgets(line,80,stdin); + sscanf(line,"%d",&rewi); + rewi = rewi && trace; /* Read the flag that directs stopping on any failure. */ - s_rsle(&io___9); - do_lio(&c__8, &c__1, (char *)&sfatal, (ftnlen)sizeof(logical)); - e_rsle(); + fgets(line,80,stdin); + sscanf(line,"%c",&tmpchar); + sfatal=FALSE_; + if (tmpchar=='T')sfatal=TRUE_; /* Read the flag that indicates whether error exits are to be tested. */ - s_rsle(&io___11); - do_lio(&c__8, &c__1, (char *)&tsterr, (ftnlen)sizeof(logical)); - e_rsle(); + fgets(line,80,stdin); + sscanf(line,"%c",&tmpchar); + tsterr=FALSE_; + if (tmpchar=='T')tsterr=TRUE_; /* Read the flag that indicates whether row-major data layout to be tested. */ - s_rsle(&io___13); - do_lio(&c__3, &c__1, (char *)&layout, (ftnlen)sizeof(integer)); - e_rsle(); + fgets(line,80,stdin); + sscanf(line,"%d",&layout); /* Read the threshold value of the test ratio */ - s_rsle(&io___15); - do_lio(&c__5, &c__1, (char *)&thresh, (ftnlen)sizeof(doublereal)); - e_rsle(); + fgets(line,80,stdin); + sscanf(line,"%lf",&thresh); /* Read and check the parameter values for the tests. */ /* Values of N */ - s_rsle(&io___17); - do_lio(&c__3, &c__1, (char *)&nidim, (ftnlen)sizeof(integer)); - e_rsle(); + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%d",&nidim); +#else + sscanf(line,"%d",&nidim); +#endif if (nidim < 1 || nidim > 9) { - s_wsfe(&io___19); - do_fio(&c__1, "N", (ftnlen)1); - do_fio(&c__1, (char *)&c__9, (ftnlen)sizeof(integer)); - e_wsfe(); - goto L220; + fprintf(stderr,"NUMBER OF VALUES OF N IS LESS THAN 1 OR GREATER THAN 9"); + goto L220; } - s_rsle(&io___20); + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%ld %ld %ld %ld %ld %ld %ld %ld %ld",&idim[0],&idim[1],&idim[2], + &idim[3],&idim[4],&idim[5],&idim[6],&idim[7],&idim[8]); +#else + sscanf(line,"%d %d %d %d %d %d %d %d %d",&idim[0],&idim[1],&idim[2], + &idim[3],&idim[4],&idim[5],&idim[6],&idim[7],&idim[8]); +#endif i__1 = nidim; for (i__ = 1; i__ <= i__1; ++i__) { - do_lio(&c__3, &c__1, (char *)&idim[i__ - 1], (ftnlen)sizeof(integer)); - } - e_rsle(); - i__1 = nidim; - for (i__ = 1; i__ <= i__1; ++i__) { - if (idim[i__ - 1] < 0 || idim[i__ - 1] > 65) { - s_wsfe(&io___23); - do_fio(&c__1, (char *)&c__65, (ftnlen)sizeof(integer)); - e_wsfe(); - goto L220; - } + if (idim[i__ - 1] < 0 || idim[i__ - 1] > 65) { + fprintf(stderr,"VALUE OF N IS LESS THAN 0 OR GREATER THAN 65\n"); + goto L220; + } /* L10: */ } /* Values of ALPHA */ - s_rsle(&io___24); - do_lio(&c__3, &c__1, (char *)&nalf, (ftnlen)sizeof(integer)); - e_rsle(); + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%ld",&nalf); +#else + sscanf(line,"%d",&nalf); +#endif if (nalf < 1 || nalf > 7) { - s_wsfe(&io___26); - do_fio(&c__1, "ALPHA", (ftnlen)5); - do_fio(&c__1, (char *)&c__7, (ftnlen)sizeof(integer)); - e_wsfe(); - goto L220; + fprintf(stderr,"VALUE OF ALPHA IS LESS THAN 0 OR GREATER THAN 7\n"); + goto L220; } - s_rsle(&io___27); - i__1 = nalf; - for (i__ = 1; i__ <= i__1; ++i__) { - do_lio(&c__7, &c__1, (char *)&alf[i__ - 1], (ftnlen)sizeof( - doublecomplex)); - } - e_rsle(); + fgets(line,80,stdin); + sscanf(line,"(%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf)",&alf[0].r,&alf[0].i,&alf[1].r,&alf[1].i,&alf[2].r,&alf[2].i,&alf[3].r,&alf[3].i, + &alf[4].r,&alf[4].i,&alf[5].r,&alf[5].i,&alf[6].r,&alf[6].i); + /* Values of BETA */ - s_rsle(&io___29); - do_lio(&c__3, &c__1, (char *)&nbet, (ftnlen)sizeof(integer)); - e_rsle(); - if (nbet < 1 || nbet > 7) { - s_wsfe(&io___31); - do_fio(&c__1, "BETA", (ftnlen)4); - do_fio(&c__1, (char *)&c__7, (ftnlen)sizeof(integer)); - e_wsfe(); - goto L220; + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%ld",&nbet); +#else + sscanf(line,"%d",&nbet); +#endif + if (nalf < 1 || nbet > 7) { + fprintf(stderr,"VALUE OF BETA IS LESS THAN 0 OR GREATER THAN 7\n"); + goto L220; } - s_rsle(&io___32); - i__1 = nbet; - for (i__ = 1; i__ <= i__1; ++i__) { - do_lio(&c__7, &c__1, (char *)&bet[i__ - 1], (ftnlen)sizeof( - doublecomplex)); - } - e_rsle(); + fgets(line,80,stdin); + sscanf(line,"(%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf)",&bet[0].r,&bet[0].i,&bet[1].r,&bet[1].i,&bet[2].r,&bet[2].i,&bet[3].r,&bet[3].i, + &bet[4].r,&bet[4].i,&bet[5].r,&bet[5].i,&bet[6].r,&bet[6].i); /* Report values of parameters. */ - s_wsfe(&io___34); - e_wsfe(); - s_wsfe(&io___35); - i__1 = nidim; - for (i__ = 1; i__ <= i__1; ++i__) { - do_fio(&c__1, (char *)&idim[i__ - 1], (ftnlen)sizeof(integer)); - } - e_wsfe(); - s_wsfe(&io___36); - i__1 = nalf; - for (i__ = 1; i__ <= i__1; ++i__) { - do_fio(&c__2, (char *)&alf[i__ - 1], (ftnlen)sizeof(doublereal)); - } - e_wsfe(); - s_wsfe(&io___37); - i__1 = nbet; - for (i__ = 1; i__ <= i__1; ++i__) { - do_fio(&c__2, (char *)&bet[i__ - 1], (ftnlen)sizeof(doublereal)); - } - e_wsfe(); + printf("TESTS OF THE DOUBLE PRECISION COMPLEX LEVEL 3 BLAS\nTHE FOLLOWING PARAMETER VALUES WILL BE USED:\n"); + printf(" FOR N"); + for (i__ =1; i__ <=nidim;++i__) printf(" %d",idim[i__-1]); + printf("\n"); + printf(" FOR ALPHA"); + for (i__ =1; i__ <=nalf;++i__) printf(" (%lf,%lf)",alf[i__-1].r,alf[i__-1].i); + printf("\n"); + printf(" FOR BETA"); + for (i__ =1; i__ <=nbet;++i__) printf(" (%lf,%lf)",bet[i__-1].r,bet[i__-1].i); + printf("\n"); + if (! tsterr) { - s_wsle(&io___38); - e_wsle(); - s_wsfe(&io___39); - e_wsfe(); + printf(" ERROR-EXITS WILL NOT BE TESTED\n"); } - s_wsle(&io___40); - e_wsle(); - s_wsfe(&io___41); - do_fio(&c__1, (char *)&thresh, (ftnlen)sizeof(doublereal)); - e_wsfe(); - s_wsle(&io___42); - e_wsle(); + + printf("ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LESS THAN %lf\n",thresh); rorder = FALSE_; corder = FALSE_; if (layout == 2) { rorder = TRUE_; corder = TRUE_; - s_wsfe(&io___45); - e_wsfe(); + printf("COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED\n"); } else if (layout == 1) { rorder = TRUE_; - s_wsfe(&io___46); - e_wsfe(); + printf("ROW-MAJOR DATA LAYOUT IS TESTED\n"); } else if (layout == 0) { corder = TRUE_; - s_wsfe(&io___47); - e_wsfe(); + printf("COLUMN-MAJOR DATA LAYOUT IS TESTED\n"); } - s_wsle(&io___48); - e_wsle(); /* Read names of subroutines and flags which indicate */ /* whether they are to be tested. */ @@ -935,42 +523,33 @@ static logical c_false = FALSE_; /* L20: */ } L30: - i__1 = s_rsfe(&io___50); - if (i__1 != 0) { - goto L60; + if (! fgets(line,80,stdin)) { + goto L60; } - i__1 = do_fio(&c__1, snamet, (ftnlen)13); - if (i__1 != 0) { - goto L60; - } - i__1 = do_fio(&c__1, (char *)<estt, (ftnlen)sizeof(logical)); - if (i__1 != 0) { - goto L60; - } - i__1 = e_rsfe(); - if (i__1 != 0) { - goto L60; + i__1 = sscanf(line,"%12c %c",snamet,&tmpchar); + ltestt=FALSE_; + if (tmpchar=='T')ltestt=TRUE_; + if (i__1 < 2) { + goto L60; } for (i__ = 1; i__ <= 9; ++i__) { - if (s_cmp(snamet, snames + (i__ - 1) * 13, (ftnlen)13, (ftnlen)13) == - 0) { - goto L50; - } + if (s_cmp(snamet, snames[i__ - 1] , (ftnlen)12, (ftnlen)12) == + 0) { + goto L50; + } /* L40: */ } - s_wsfe(&io___53); - do_fio(&c__1, snamet, (ftnlen)13); - e_wsfe(); - s_stop("", (ftnlen)0); + printf("SUBPROGRAM NAME %s NOT RECOGNIZED\n****** TESTS ABANDONED ******\n",snamet); + exit(1); L50: ltest[i__ - 1] = ltestt; goto L30; L60: - cl__1.cerr = 0; +/* cl__1.cerr = 0; cl__1.cunit = 5; cl__1.csta = 0; - f_clos(&cl__1); + f_clos(&cl__1);*/ /* Compute EPS (the machine precision). */ @@ -984,9 +563,7 @@ L70: goto L70; L80: eps += eps; - s_wsfe(&io___55); - do_fio(&c__1, (char *)&eps, (ftnlen)sizeof(doublereal)); - e_wsfe(); + printf("RELATIVE MACHINE PRECISION IS TAKEN TO BE %9.1g\n",eps); /* Check the reliability of ZMMCH using exact data. */ @@ -1023,30 +600,28 @@ L80: *(unsigned char *)transb = 'N'; zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, - &c__6, &c_true); + &c__6, &c_true, (ftnlen)1, (ftnlen)1); same = lze_(cc, ct, &n); if (! same || err != 0.) { - s_wsfe(&io___68); - do_fio(&c__1, transa, (ftnlen)1); - do_fio(&c__1, transb, (ftnlen)1); - do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical)); - do_fio(&c__1, (char *)&err, (ftnlen)sizeof(doublereal)); - e_wsfe(); - s_stop("", (ftnlen)0); + printf("ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n"); + printf("ZMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb); + printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err); + printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n"); + printf("****** TESTS ABANDONED ******\n"); + exit(1); } *(unsigned char *)transb = 'C'; zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, - &c__6, &c_true); + &c__6, &c_true, (ftnlen)1, (ftnlen)1); same = lze_(cc, ct, &n); if (! same || err != 0.) { - s_wsfe(&io___69); - do_fio(&c__1, transa, (ftnlen)1); - do_fio(&c__1, transb, (ftnlen)1); - do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical)); - do_fio(&c__1, (char *)&err, (ftnlen)sizeof(doublereal)); - e_wsfe(); - s_stop("", (ftnlen)0); + printf("ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n"); + printf("ZMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb); + printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err); + printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n"); + printf("****** TESTS ABANDONED ******\n"); + exit(1); } i__1 = n; for (j = 1; j <= i__1; ++j) { @@ -1069,56 +644,48 @@ L80: *(unsigned char *)transb = 'N'; zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, - &c__6, &c_true); + &c__6, &c_true, (ftnlen)1, (ftnlen)1); same = lze_(cc, ct, &n); if (! same || err != 0.) { - s_wsfe(&io___70); - do_fio(&c__1, transa, (ftnlen)1); - do_fio(&c__1, transb, (ftnlen)1); - do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical)); - do_fio(&c__1, (char *)&err, (ftnlen)sizeof(doublereal)); - e_wsfe(); - s_stop("", (ftnlen)0); + printf("ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n"); + printf("ZMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb); + printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err); + printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n"); + printf("****** TESTS ABANDONED ******\n"); + exit(1); } *(unsigned char *)transb = 'C'; zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, - &c__6, &c_true); + &c__6, &c_true, (ftnlen)1, (ftnlen)1); same = lze_(cc, ct, &n); if (! same || err != 0.) { - s_wsfe(&io___71); - do_fio(&c__1, transa, (ftnlen)1); - do_fio(&c__1, transb, (ftnlen)1); - do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical)); - do_fio(&c__1, (char *)&err, (ftnlen)sizeof(doublereal)); - e_wsfe(); - s_stop("", (ftnlen)0); + printf("ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n"); + printf("ZMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb); + printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err); + printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n"); + printf("****** TESTS ABANDONED ******\n"); + exit(1); } /* Test each subroutine in turn. */ for (isnum = 1; isnum <= 9; ++isnum) { - s_wsle(&io___73); - e_wsle(); if (! ltest[isnum - 1]) { /* Subprogram is not to be tested. */ - s_wsfe(&io___74); - do_fio(&c__1, snames + (isnum - 1) * 13, (ftnlen)13); - e_wsfe(); + printf("%12s WAS NOT TESTED\n",snames[isnum-1]); } else { - s_copy(srnamc_1.srnamt, snames + (isnum - 1) * 13, (ftnlen)13, ( - ftnlen)13); + s_copy(srnamc_1.srnamt, snames[isnum - 1], (ftnlen)12, ( + ftnlen)12); /* Test error exits. */ if (tsterr) { - cz3chke_(snames + (isnum - 1) * 13); - s_wsle(&io___75); - e_wsle(); + cz3chke_(snames[isnum - 1], (ftnlen)12); } /* Test computations. */ infoc_1.infot = 0; infoc_1.ok = TRUE_; fatal = FALSE_; - switch (isnum) { + switch ((int)isnum) { case 1: goto L140; case 2: goto L150; case 3: goto L150; @@ -1132,76 +699,76 @@ L80: /* Test ZGEMM, 01. */ L140: if (corder) { - zchk1_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + zchk1_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, - cc, cs, ct, g, &c__0); + cc, cs, ct, g, &c__0, (ftnlen)12); } if (rorder) { - zchk1_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + zchk1_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, - cc, cs, ct, g, &c__1); + cc, cs, ct, g, &c__1, (ftnlen)12); } goto L190; /* Test ZHEMM, 02, ZSYMM, 03. */ L150: if (corder) { - zchk2_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + zchk2_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, - cc, cs, ct, g, &c__0); + cc, cs, ct, g, &c__0, (ftnlen)12); } if (rorder) { - zchk2_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + zchk2_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, - cc, cs, ct, g, &c__1); + cc, cs, ct, g, &c__1, (ftnlen)12); } goto L190; /* Test ZTRMM, 04, ZTRSM, 05. */ L160: if (corder) { - zchk3_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + zchk3_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, & - c__0); + c__0, (ftnlen)12); } if (rorder) { - zchk3_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + zchk3_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, & - c__1); + c__1, (ftnlen)12); } goto L190; /* Test ZHERK, 06, ZSYRK, 07. */ L170: if (corder) { - zchk4_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + zchk4_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, - cc, cs, ct, g, &c__0); + cc, cs, ct, g, &c__0, (ftnlen)12); } if (rorder) { - zchk4_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + zchk4_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, - cc, cs, ct, g, &c__1); + cc, cs, ct, g, &c__1, (ftnlen)12); } goto L190; /* Test ZHER2K, 08, ZSYR2K, 09. */ L180: if (corder) { - zchk5_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + zchk5_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, - ct, g, w, &c__0); + ct, g, w, &c__0, (ftnlen)12); } if (rorder) { - zchk5_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + zchk5_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, - ct, g, w, &c__1); + ct, g, w, &c__1, (ftnlen)12); } goto L190; @@ -1212,122 +779,66 @@ L190: } /* L200: */ } - s_wsfe(&io___82); - e_wsfe(); + printf("\nEND OF TESTS\n"); goto L230; L210: - s_wsfe(&io___83); - e_wsfe(); + printf("\n****** FATAL ERROR - TESTS ABANDONED ******\n"); goto L230; L220: - s_wsfe(&io___84); - e_wsfe(); + printf("AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM\n"); + printf("****** TESTS ABANDONED ******\n"); L230: if (trace) { - cl__1.cerr = 0; +/* cl__1.cerr = 0; cl__1.cunit = ntra; cl__1.csta = 0; - f_clos(&cl__1); + f_clos(&cl__1);*/ } - cl__1.cerr = 0; +/* cl__1.cerr = 0; cl__1.cunit = 6; cl__1.csta = 0; - f_clos(&cl__1); - s_stop("", (ftnlen)0); - + f_clos(&cl__1);*/ + exit(0); /* End of ZBLAT3. */ - return 0; } /* MAIN__ */ -/* Subroutine */ int zchk1_(char *sname, doublereal *eps, doublereal *thresh, - integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * - alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex * - a, doublecomplex *aa, doublecomplex *as, doublecomplex *b, - doublecomplex *bb, doublecomplex *bs, doublecomplex *c__, - doublecomplex *cc, doublecomplex *cs, doublecomplex *ct, doublereal * - g, integer *iorder) +/* Subroutine */ int zchk1_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ - static char ich[3] = "NTC"; - - /* Format strings */ - static char fmt_9994[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " - "TAKEN ON VALID\002,\002 CALL *******\002)"; - static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" - " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; - static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" - "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " - " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" - "BER:\002)"; + static char ich[3+1] = "NTC"; /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8; - alist al__1; /* Local variables */ - extern /* Subroutine */ int czgemm3m_(integer *, char *, char *, integer * - , integer *, integer *, doublecomplex *, doublecomplex *, integer - *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *); - doublecomplex beta; - integer ldas, ldbs, ldcs; - logical same, null; - integer i__, k, m, n; - doublecomplex alpha; - logical isame[13], trana, tranb; - extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, - integer *, doublecomplex *, integer *, doublecomplex *, integer *, - logical *, doublecomplex *); - integer nargs; - extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, doublecomplex *, doublereal *, doublecomplex *, - integer *, doublereal *, doublereal *, logical *, integer *, - logical *); - logical reset; - integer ia, ib; - extern /* Subroutine */ int zprcn1_(integer *, integer *, char *, integer - *, char *, char *, integer *, integer *, integer *, doublecomplex - *, integer *, integer *, doublecomplex *, integer *); - integer ma, mb, na, nb, nc, ik, im, in, ks, ms, ns; - char tranas[1], tranbs[1], transa[1], transb[1]; - doublereal errmax; - extern logical lzeres_(char *, char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *); - integer ica, icb, laa, lbb, lda, lcc, ldb, ldc; - doublecomplex als, bls; - doublereal err; - extern logical lze_(doublecomplex *, doublecomplex *, integer *); - - /* Fortran I/O blocks */ - static cilist io___128 = { 0, 0, 0, fmt_9994, 0 }; - static cilist io___131 = { 0, 0, 0, fmt_9998, 0 }; - static cilist io___133 = { 0, 0, 0, fmt_10000, 0 }; - static cilist io___134 = { 0, 0, 0, fmt_10001, 0 }; - static cilist io___135 = { 0, 0, 0, fmt_10002, 0 }; - static cilist io___136 = { 0, 0, 0, fmt_10003, 0 }; - static cilist io___137 = { 0, 0, 0, fmt_9996, 0 }; - - + static doublecomplex beta; + static integer ldas, ldbs, ldcs; + static logical same, null; + static integer i__, k, m, n; + static doublecomplex alpha; + static logical isame[13], trana, tranb; + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); + static integer nargs; + extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); + static logical reset; + static integer ia, ib; + extern /* Subroutine */ int zprcn1_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, integer*, doublecomplex*, integer*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); + static integer ma, mb, na, nb, nc, ik, im, in, ks, ms, ns; + extern /* Subroutine */ void czgemm3m_(integer*, char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + static char tranas[1], tranbs[1], transa[1], transb[1]; + static doublereal errmax; + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + static integer ica, icb, laa, lbb, lda, lcc, ldb, ldc; + static doublecomplex als, bls; + static doublereal err; + extern logical lze_(doublecomplex*, doublecomplex*, integer*); /* Tests ZGEMM. */ @@ -1339,6 +850,17 @@ L230: /* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ /* Sven Hammarling, Numerical Algorithms Group Ltd. */ +/* .. Parameters .. */ +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. Local Arrays .. */ +/* .. External Functions .. */ +/* .. External Subroutines .. */ +/* .. Intrinsic Functions .. */ +/* .. Scalars in Common .. */ +/* .. Common blocks .. */ +/* .. Data statements .. */ /* Parameter adjustments */ --idim; --alf; @@ -1362,6 +884,7 @@ L230: a -= a_offset; /* Function Body */ +/* .. Executable Statements .. */ nargs = 13; nc = 0; @@ -1418,7 +941,8 @@ L230: /* Generate the matrix A. */ zmake_("ge", " ", " ", &ma, &na, &a[a_offset], nmax, &aa[ - 1], &lda, &reset, &c_b1); + 1], &lda, &reset, &c_b1, (ftnlen)2, (ftnlen)1, ( + ftnlen)1); for (icb = 1; icb <= 3; ++icb) { *(unsigned char *)transb = *(unsigned char *)&ich[icb @@ -1447,7 +971,8 @@ L230: /* Generate the matrix B. */ zmake_("ge", " ", " ", &mb, &nb, &b[b_offset], nmax, & - bb[1], &ldb, &reset, &c_b1); + bb[1], &ldb, &reset, &c_b1, (ftnlen)2, ( + ftnlen)1, (ftnlen)1); i__4 = *nalf; for (ia = 1; ia <= i__4; ++ia) { @@ -1462,7 +987,8 @@ L230: /* Generate the matrix C. */ zmake_("ge", " ", " ", &m, &n, &c__[c_offset], - nmax, &cc[1], &ldc, &reset, &c_b1); + nmax, &cc[1], &ldc, &reset, &c_b1, ( + ftnlen)2, (ftnlen)1, (ftnlen)1); ++nc; @@ -1511,23 +1037,23 @@ L230: if (*trace) { zprcn1_(ntra, &nc, sname, iorder, transa, transb, &m, &n, &k, &alpha, &lda, - &ldb, &beta, &ldc); + &ldb, &beta, &ldc, (ftnlen)12, ( + ftnlen)1, (ftnlen)1); } if (*rewi) { - al__1.aerr = 0; +/* al__1.aerr = 0; al__1.aunit = *ntra; - f_rew(&al__1); + f_rew(&al__1);*/ } - czgemm3m_(iorder, transa, transb, &m, &n, &k, - &alpha, &aa[1], &lda, &bb[1], &ldb, & - beta, &cc[1], &ldc); + czgemm3m_(iorder, transa, transb, &m, &n, &k, & + alpha, &aa[1], &lda, &bb[1], &ldb, & + beta, &cc[1], &ldc, (ftnlen)1, ( + ftnlen)1); /* Check if error-exit was taken incorrectly. */ if (! infoc_1.ok) { - io___128.ciunit = *nout; - s_wsfe(&io___128); - e_wsfe(); + printf(" *** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); *fatal = TRUE_; goto L120; } @@ -1553,7 +1079,8 @@ L230: isame[11] = lze_(&cs[1], &cc[1], &lcc); } else { isame[11] = lzeres_("ge", " ", &m, &n, & - cs[1], &cc[1], &ldc); + cs[1], &cc[1], &ldc, (ftnlen)2, ( + ftnlen)1); } isame[12] = ldcs == ldc; @@ -1565,11 +1092,7 @@ L230: for (i__ = 1; i__ <= i__6; ++i__) { same = same && isame[i__ - 1]; if (! isame[i__ - 1]) { - io___131.ciunit = *nout; - s_wsfe(&io___131); - do_fio(&c__1, (char *)&i__, (ftnlen) - sizeof(integer)); - e_wsfe(); + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); } /* L40: */ } @@ -1586,7 +1109,8 @@ L230: &a[a_offset], nmax, &b[b_offset], nmax, &beta, &c__[c_offset], nmax, &ct[1], &g[1], &cc[1], &ldc, - eps, &err, fatal, nout, &c_true); + eps, &err, fatal, nout, &c_true, + (ftnlen)1, (ftnlen)1); errmax = f2cmax(errmax,err); /* If got really bad answer, report and */ /* return. */ @@ -1623,76 +1147,44 @@ L100: if (errmax < *thresh) { if (*iorder == 0) { - io___133.ciunit = *nout; - s_wsfe(&io___133); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } if (*iorder == 1) { - io___134.ciunit = *nout; - s_wsfe(&io___134); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } } else { if (*iorder == 0) { - io___135.ciunit = *nout; - s_wsfe(&io___135); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); - e_wsfe(); + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } if (*iorder == 1) { - io___136.ciunit = *nout; - s_wsfe(&io___136); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); - e_wsfe(); + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } } goto L130; L120: - io___137.ciunit = *nout; - s_wsfe(&io___137); - do_fio(&c__1, sname, (ftnlen)13); - e_wsfe(); + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); zprcn1_(nout, &nc, sname, iorder, transa, transb, &m, &n, &k, &alpha, & - lda, &ldb, &beta, &ldc); + lda, &ldb, &beta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1); L130: return 0; -/* L9995: */ +/* 9995 FORMAT( 1X, I6, ': ', A12,'(''', A1, ''',''', A1, ''',', */ +/* $ 3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, */ +/* $ ',(', F4.1, ',', F4.1, '), C,', I3, ').' ) */ /* End of ZCHK1. */ } /* zchk1_ */ -/* Subroutine */ int zprcn1_(integer *nout, integer *nc, char *sname, integer - *iorder, char *transa, char *transb, integer *m, integer *n, integer * - k, doublecomplex *alpha, integer *lda, integer *ldb, doublecomplex * - beta, integer *ldc) +/* Subroutine */ int zprcn1_(integer* nout, integer* nc, char* sname, integer* iorder, char* transa, char* transb, integer* m, integer* n, integer* k, doublecomplex* alpha, integer* lda, integer* ldb, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen transa_len, ftnlen transb_len) { - /* Format strings */ - static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a15,\002,\002," - "a14,\002,\002,a14,\002,\002)"; - static char fmt_9994[] = "(10x,3(i3,\002,\002),\002 (\002,f4.1,\002,\002" - ",f4.1,\002) , A,\002,i3,\002, B,\002,i3,\002, (\002,f4.1,\002" - ",\002,f4.1,\002) , C,\002,i3,\002).\002)"; - /* Local variables */ - char crc[14], cta[14], ctb[14]; - - /* Fortran I/O blocks */ - static cilist io___141 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___142 = { 0, 0, 0, fmt_9994, 0 }; - + static char crc[14], cta[14], ctb[14]; if (*(unsigned char *)transa == 'N') { s_copy(cta, " CblasNoTrans", (ftnlen)14, (ftnlen)14); @@ -1713,123 +1205,52 @@ L130: } else { s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); } - io___141.ciunit = *nout; - s_wsfe(&io___141); - do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, crc, (ftnlen)14); - do_fio(&c__1, cta, (ftnlen)14); - do_fio(&c__1, ctb, (ftnlen)14); - e_wsfe(); - io___142.ciunit = *nout; - s_wsfe(&io___142); - do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal)); - do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(doublereal)); - do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); - e_wsfe(); - return 0; + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cta,ctb); + printf("%d %d %d (%4.1lf,%4.1lf) , A, %d, B, %d, (%4.1lf,%4.1lf) , C, %d.\n",*m,*n,*k,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc); + +return 0; } /* zprcn1_ */ -/* Subroutine */ int zchk2_(char *sname, doublereal *eps, doublereal *thresh, - integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * - alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex * - a, doublecomplex *aa, doublecomplex *as, doublecomplex *b, - doublecomplex *bb, doublecomplex *bs, doublecomplex *c__, - doublecomplex *cc, doublecomplex *cs, doublecomplex *ct, doublereal * - g, integer *iorder) +/* Subroutine */ int zchk2_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ - static char ichs[2] = "LR"; - static char ichu[2] = "UL"; - - /* Format strings */ - static char fmt_9994[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " - "TAKEN ON VALID\002,\002 CALL *******\002)"; - static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" - " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; - static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" - "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " - " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" - "BER:\002)"; + static char ichs[2+1] = "LR"; + static char ichu[2+1] = "UL"; /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; - alist al__1; /* Local variables */ - doublecomplex beta; - integer ldas, ldbs, ldcs; - logical same; - char side[1]; - logical conj, left, null; - char uplo[1]; - integer i__, m, n; - doublecomplex alpha; - logical isame[13]; - char sides[1]; - extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, - integer *, doublecomplex *, integer *, doublecomplex *, integer *, - logical *, doublecomplex *); - integer nargs; - extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, doublecomplex *, doublereal *, doublecomplex *, - integer *, doublereal *, doublereal *, logical *, integer *, - logical *); - logical reset; - char uplos[1]; - integer ia, ib; - extern /* Subroutine */ int zprcn2_(integer *, integer *, char *, integer - *, char *, char *, integer *, integer *, doublecomplex *, integer - *, integer *, doublecomplex *, integer *); - integer na, nc, im, in, ms, ns; - extern /* Subroutine */ int czhemm_(integer *, char *, char *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *); - doublereal errmax; - extern logical lzeres_(char *, char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *); - extern /* Subroutine */ int czsymm_(integer *, char *, char *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *); - integer laa, lbb, lda, lcc, ldb, ldc, ics; - doublecomplex als, bls; - integer icu; - doublereal err; - extern logical lze_(doublecomplex *, doublecomplex *, integer *); - - /* Fortran I/O blocks */ - static cilist io___181 = { 0, 0, 0, fmt_9994, 0 }; - static cilist io___184 = { 0, 0, 0, fmt_9998, 0 }; - static cilist io___186 = { 0, 0, 0, fmt_10000, 0 }; - static cilist io___187 = { 0, 0, 0, fmt_10001, 0 }; - static cilist io___188 = { 0, 0, 0, fmt_10002, 0 }; - static cilist io___189 = { 0, 0, 0, fmt_10003, 0 }; - static cilist io___190 = { 0, 0, 0, fmt_9996, 0 }; - - + static doublecomplex beta; + static integer ldas, ldbs, ldcs; + static logical same; + static char side[1]; + static logical isconj, left, null; + static char uplo[1]; + static integer i__, m, n; + static doublecomplex alpha; + static logical isame[13]; + static char sides[1]; + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); + static integer nargs; + extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); + static logical reset; + static char uplos[1]; + static integer ia, ib; + extern /* Subroutine */ int zprcn2_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); + static integer na, nc, im, in, ms, ns; + extern /* Subroutine */ void czhemm_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + static doublereal errmax; + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + extern /* Subroutine */ void czsymm_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + static integer laa, lbb, lda, lcc, ldb, ldc, ics; + static doublecomplex als, bls; + static integer icu; + static doublereal err; + extern logical lze_(doublecomplex*, doublecomplex*, integer*); /* Tests ZHEMM and ZSYMM. */ @@ -1841,6 +1262,17 @@ L130: /* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ /* Sven Hammarling, Numerical Algorithms Group Ltd. */ +/* .. Parameters .. */ +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. Local Arrays .. */ +/* .. External Functions .. */ +/* .. External Subroutines .. */ +/* .. Intrinsic Functions .. */ +/* .. Scalars in Common .. */ +/* .. Common blocks .. */ +/* .. Data statements .. */ /* Parameter adjustments */ --idim; --alf; @@ -1864,7 +1296,8 @@ L130: a -= a_offset; /* Function Body */ - conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; +/* .. Executable Statements .. */ + isconj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; nargs = 12; nc = 0; @@ -1903,7 +1336,7 @@ L130: /* Generate the matrix B. */ zmake_("ge", " ", " ", &m, &n, &b[b_offset], nmax, &bb[1], &ldb, & - reset, &c_b1); + reset, &c_b1, (ftnlen)2, (ftnlen)1, (ftnlen)1); for (ics = 1; ics <= 2; ++ics) { *(unsigned char *)side = *(unsigned char *)&ichs[ics - 1]; @@ -1931,7 +1364,8 @@ L130: /* Generate the hermitian or symmetric matrix A. */ zmake_(sname + 7, uplo, " ", &na, &na, &a[a_offset], nmax, - &aa[1], &lda, &reset, &c_b1); + &aa[1], &lda, &reset, &c_b1, (ftnlen)2, (ftnlen) + 1, (ftnlen)1); i__3 = *nalf; for (ia = 1; ia <= i__3; ++ia) { @@ -1946,7 +1380,8 @@ L130: /* Generate the matrix C. */ zmake_("ge", " ", " ", &m, &n, &c__[c_offset], - nmax, &cc[1], &ldc, &reset, &c_b1); + nmax, &cc[1], &ldc, &reset, &c_b1, ( + ftnlen)2, (ftnlen)1, (ftnlen)1); ++nc; @@ -1992,30 +1427,28 @@ L130: if (*trace) { zprcn2_(ntra, &nc, sname, iorder, side, uplo, &m, &n, &alpha, &lda, &ldb, &beta, & - ldc) + ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1) ; } if (*rewi) { - al__1.aerr = 0; +/* al__1.aerr = 0; al__1.aunit = *ntra; - f_rew(&al__1); + f_rew(&al__1);*/ } - if (conj) { + if (isconj) { czhemm_(iorder, side, uplo, &m, &n, &alpha, & aa[1], &lda, &bb[1], &ldb, &beta, &cc[ - 1], &ldc); + 1], &ldc, (ftnlen)1, (ftnlen)1); } else { czsymm_(iorder, side, uplo, &m, &n, &alpha, & aa[1], &lda, &bb[1], &ldb, &beta, &cc[ - 1], &ldc); + 1], &ldc, (ftnlen)1, (ftnlen)1); } /* Check if error-exit was taken incorrectly. */ if (! infoc_1.ok) { - io___181.ciunit = *nout; - s_wsfe(&io___181); - e_wsfe(); + printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); *fatal = TRUE_; goto L110; } @@ -2038,7 +1471,7 @@ L130: isame[10] = lze_(&cs[1], &cc[1], &lcc); } else { isame[10] = lzeres_("ge", " ", &m, &n, &cs[1], - &cc[1], &ldc); + &cc[1], &ldc, (ftnlen)2, (ftnlen)1); } isame[11] = ldcs == ldc; @@ -2050,11 +1483,7 @@ L130: for (i__ = 1; i__ <= i__5; ++i__) { same = same && isame[i__ - 1]; if (! isame[i__ - 1]) { - io___184.ciunit = *nout; - s_wsfe(&io___184); - do_fio(&c__1, (char *)&i__, (ftnlen) - sizeof(integer)); - e_wsfe(); + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); } /* L40: */ } @@ -2072,13 +1501,15 @@ L130: a_offset], nmax, &b[b_offset], nmax, &beta, &c__[c_offset], nmax, &ct[1], &g[1], &cc[1], &ldc, eps, - &err, fatal, nout, &c_true); + &err, fatal, nout, &c_true, ( + ftnlen)1, (ftnlen)1); } else { zmmch_("N", "N", &m, &n, &n, &alpha, &b[ b_offset], nmax, &a[a_offset], nmax, &beta, &c__[c_offset], nmax, &ct[1], &g[1], &cc[1], &ldc, eps, - &err, fatal, nout, &c_true); + &err, fatal, nout, &c_true, ( + ftnlen)1, (ftnlen)1); } errmax = f2cmax(errmax,err); /* If got really bad answer, report and */ @@ -2112,76 +1543,44 @@ L90: if (errmax < *thresh) { if (*iorder == 0) { - io___186.ciunit = *nout; - s_wsfe(&io___186); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } if (*iorder == 1) { - io___187.ciunit = *nout; - s_wsfe(&io___187); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } } else { if (*iorder == 0) { - io___188.ciunit = *nout; - s_wsfe(&io___188); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); - e_wsfe(); + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } if (*iorder == 1) { - io___189.ciunit = *nout; - s_wsfe(&io___189); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); - e_wsfe(); + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } } goto L120; L110: - io___190.ciunit = *nout; - s_wsfe(&io___190); - do_fio(&c__1, sname, (ftnlen)13); - e_wsfe(); + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); zprcn2_(nout, &nc, sname, iorder, side, uplo, &m, &n, &alpha, &lda, &ldb, - &beta, &ldc); + &beta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1); L120: return 0; -/* L9995: */ +/* 9995 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, */ +/* $ ',', F4.1, '), C,', I3, ') .' ) */ /* End of ZCHK2. */ } /* zchk2_ */ -/* Subroutine */ int zprcn2_(integer *nout, integer *nc, char *sname, integer - *iorder, char *side, char *uplo, integer *m, integer *n, - doublecomplex *alpha, integer *lda, integer *ldb, doublecomplex *beta, - integer *ldc) +/* Subroutine */ int zprcn2_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, integer* m, integer* n, doublecomplex* alpha, integer* lda, integer* ldb, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len) { - /* Format strings */ - static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a15,\002,\002," - "a14,\002,\002,a14,\002,\002)"; - static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002" - ",f4.1,\002), A,\002,i3,\002, B,\002,i3,\002, (\002,f4.1,\002," - "\002,f4.1,\002), \002,\002C,\002,i3,\002).\002)"; - /* Local variables */ - char cs[14], cu[14], crc[14]; - - /* Fortran I/O blocks */ - static cilist io___194 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___195 = { 0, 0, 0, fmt_9994, 0 }; - + static char cs[14], cu[14], crc[14]; if (*(unsigned char *)side == 'L') { s_copy(cs, " CblasLeft", (ftnlen)14, (ftnlen)14); @@ -2198,123 +1597,57 @@ L120: } else { s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); } - io___194.ciunit = *nout; - s_wsfe(&io___194); - do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, crc, (ftnlen)14); - do_fio(&c__1, cs, (ftnlen)14); - do_fio(&c__1, cu, (ftnlen)14); - e_wsfe(); - io___195.ciunit = *nout; - s_wsfe(&io___195); - do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal)); - do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(doublereal)); - do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); - e_wsfe(); - return 0; + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cs,cu); + printf("%d %d (%4.1lf,%4.1lf) , A, %d, B, %d, (%4.1lf,%4.1lf) , C, %d.\n",*m,*n,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc); + +return 0; } /* zprcn2_ */ -/* Subroutine */ int zchk3_(char *sname, doublereal *eps, doublereal *thresh, - integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * - alf, integer *nmax, doublecomplex *a, doublecomplex *aa, - doublecomplex *as, doublecomplex *b, doublecomplex *bb, doublecomplex - *bs, doublecomplex *ct, doublereal *g, doublecomplex *c__, integer * - iorder) +/* Subroutine */ int zchk3_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* ct, doublereal* g, doublecomplex* c__, integer* iorder, ftnlen sname_len) { /* Initialized data */ - static char ichu[2] = "UL"; - static char icht[3] = "NTC"; - static char ichd[2] = "UN"; - static char ichs[2] = "LR"; - - /* Format strings */ - static char fmt_9994[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " - "TAKEN ON VALID\002,\002 CALL *******\002)"; - static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" - " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; - static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" - "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " - " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" - "BER:\002)"; + static char ichu[2+1] = "UL"; + static char icht[3+1] = "NTC"; + static char ichd[2+1] = "UN"; + static char ichs[2+1] = "LR"; /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; doublecomplex z__1; - alist al__1; /* Local variables */ - char diag[1]; - integer ldas, ldbs; - logical same; - char side[1]; - logical left, null; - char uplo[1]; - integer i__, j, m, n; - doublecomplex alpha; - char diags[1]; - logical isame[13]; - char sides[1]; - extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, - integer *, doublecomplex *, integer *, doublecomplex *, integer *, - logical *, doublecomplex *); - integer nargs; - extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, doublecomplex *, doublereal *, doublecomplex *, - integer *, doublereal *, doublereal *, logical *, integer *, - logical *); - logical reset; - char uplos[1]; - integer ia, na; - extern /* Subroutine */ int zprcn3_(integer *, integer *, char *, integer - *, char *, char *, char *, char *, integer *, integer *, - doublecomplex *, integer *, integer *); - integer nc, im, in, ms, ns; - char tranas[1], transa[1]; - doublereal errmax; - extern logical lzeres_(char *, char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *); - extern /* Subroutine */ int cztrmm_(integer *, char *, char *, char *, - char *, integer *, integer *, doublecomplex *, doublecomplex *, - integer *, doublecomplex *, integer *), cztrsm_(integer *, char *, char *, char *, char *, - integer *, integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *); - integer laa, icd, lbb, lda, ldb, ics; - doublecomplex als; - integer ict, icu; - doublereal err; - extern logical lze_(doublecomplex *, doublecomplex *, integer *); - - /* Fortran I/O blocks */ - static cilist io___236 = { 0, 0, 0, fmt_9994, 0 }; - static cilist io___239 = { 0, 0, 0, fmt_9998, 0 }; - static cilist io___241 = { 0, 0, 0, fmt_10000, 0 }; - static cilist io___242 = { 0, 0, 0, fmt_10001, 0 }; - static cilist io___243 = { 0, 0, 0, fmt_10002, 0 }; - static cilist io___244 = { 0, 0, 0, fmt_10003, 0 }; - static cilist io___245 = { 0, 0, 0, fmt_9996, 0 }; - - + static char diag[1]; + static integer ldas, ldbs; + static logical same; + static char side[1]; + static logical left, null; + static char uplo[1]; + static integer i__, j, m, n; + static doublecomplex alpha; + static char diags[1]; + static logical isame[13]; + static char sides[1]; + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); + static integer nargs; + extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); + static logical reset; + static char uplos[1]; + static integer ia, na; + extern /* Subroutine */ int zprcn3_(integer*, integer*, char*, integer*, char*, char*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, ftnlen, ftnlen, ftnlen, ftnlen, ftnlen); + static integer nc, im, in, ms, ns; + static char tranas[1], transa[1]; + static doublereal errmax; + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + extern /* Subroutine */ void cztrmm_(integer*, char*, char*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ void cztrsm_(integer*, char*, char*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen, ftnlen); + static integer laa, icd, lbb, lda, ldb, ics; + static doublecomplex als; + static integer ict, icu; + static doublereal err; + extern logical lze_(doublecomplex*, doublecomplex*, integer*); /* Tests ZTRMM and ZTRSM. */ @@ -2326,6 +1659,17 @@ L120: /* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ /* Sven Hammarling, Numerical Algorithms Group Ltd. */ +/* .. Parameters .. */ +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. Local Arrays .. */ +/* .. External Functions .. */ +/* .. External Subroutines .. */ +/* .. Intrinsic Functions .. */ +/* .. Scalars in Common .. */ +/* .. Common blocks .. */ +/* .. Data statements .. */ /* Parameter adjustments */ --idim; --alf; @@ -2346,6 +1690,7 @@ L120: a -= a_offset; /* Function Body */ +/* .. Executable Statements .. */ nargs = 11; nc = 0; @@ -2421,12 +1766,14 @@ L120: zmake_("tr", uplo, diag, &na, &na, &a[ a_offset], nmax, &aa[1], &lda, &reset, - &c_b1); + &c_b1, (ftnlen)2, (ftnlen)1, (ftnlen) + 1); /* Generate the matrix B. */ zmake_("ge", " ", " ", &m, &n, &b[b_offset], - nmax, &bb[1], &ldb, &reset, &c_b1); + nmax, &bb[1], &ldb, &reset, &c_b1, ( + ftnlen)2, (ftnlen)1, (ftnlen)1); ++nc; @@ -2471,42 +1818,42 @@ L120: zprcn3_(ntra, &nc, sname, iorder, side, uplo, transa, diag, &m, &n, &alpha, &lda, &ldb, ( - ftnlen)13, (ftnlen)1, (ftnlen) + ftnlen)12, (ftnlen)1, (ftnlen) 1, (ftnlen)1, (ftnlen)1); } if (*rewi) { - al__1.aerr = 0; +/* al__1.aerr = 0; al__1.aunit = *ntra; - f_rew(&al__1); + f_rew(&al__1);*/ } cztrmm_(iorder, side, uplo, transa, diag, &m, &n, &alpha, &aa[1], &lda, &bb[ - 1], &ldb); + 1], &ldb, (ftnlen)1, (ftnlen)1, ( + ftnlen)1, (ftnlen)1); } else if (s_cmp(sname + 9, "sm", (ftnlen)2, ( ftnlen)2) == 0) { if (*trace) { zprcn3_(ntra, &nc, sname, iorder, side, uplo, transa, diag, &m, &n, &alpha, &lda, &ldb, ( - ftnlen)13, (ftnlen)1, (ftnlen) + ftnlen)12, (ftnlen)1, (ftnlen) 1, (ftnlen)1, (ftnlen)1); } if (*rewi) { - al__1.aerr = 0; +/* al__1.aerr = 0; al__1.aunit = *ntra; - f_rew(&al__1); + f_rew(&al__1);*/ } cztrsm_(iorder, side, uplo, transa, diag, &m, &n, &alpha, &aa[1], &lda, &bb[ - 1], &ldb); + 1], &ldb, (ftnlen)1, (ftnlen)1, ( + ftnlen)1, (ftnlen)1); } /* Check if error-exit was taken incorrectly. */ if (! infoc_1.ok) { - io___236.ciunit = *nout; - s_wsfe(&io___236); - e_wsfe(); + printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); *fatal = TRUE_; goto L150; } @@ -2531,7 +1878,8 @@ L120: isame[9] = lze_(&bs[1], &bb[1], &lbb); } else { isame[9] = lzeres_("ge", " ", &m, &n, &bs[ - 1], &bb[1], &ldb); + 1], &bb[1], &ldb, (ftnlen)2, ( + ftnlen)1); } isame[10] = ldbs == ldb; @@ -2543,11 +1891,7 @@ L120: for (i__ = 1; i__ <= i__4; ++i__) { same = same && isame[i__ - 1]; if (! isame[i__ - 1]) { - io___239.ciunit = *nout; - s_wsfe(&io___239); - do_fio(&c__1, (char *)&i__, (ftnlen) - sizeof(integer)); - e_wsfe(); + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); } /* L50: */ } @@ -2578,7 +1922,8 @@ L120: c_b1, &c__[c_offset], nmax, &ct[1], &g[1], &bb[ 1], &ldb, eps, &err, - fatal, nout, &c_true); + fatal, nout, &c_true, ( + ftnlen)1, (ftnlen)1); } } else if (s_cmp(sname + 9, "sm", (ftnlen) 2, (ftnlen)2) == 0) { @@ -2612,7 +1957,8 @@ L120: c_b1, &b[b_offset], nmax, &ct[1], &g[1], &bb[1], & ldb, eps, &err, fatal, - nout, &c_false); + nout, &c_false, (ftnlen)1, + (ftnlen)1); } else { zmmch_("N", transa, &m, &n, &n, & c_b2, &c__[c_offset], @@ -2620,7 +1966,8 @@ L120: &c_b1, &b[b_offset], nmax, &ct[1], &g[1], &bb[1], & ldb, eps, &err, fatal, - nout, &c_false); + nout, &c_false, (ftnlen)1, + (ftnlen)1); } } errmax = f2cmax(errmax,err); @@ -2657,77 +2004,48 @@ L130: if (errmax < *thresh) { if (*iorder == 0) { - io___241.ciunit = *nout; - s_wsfe(&io___241); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } if (*iorder == 1) { - io___242.ciunit = *nout; - s_wsfe(&io___242); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } } else { if (*iorder == 0) { - io___243.ciunit = *nout; - s_wsfe(&io___243); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); - e_wsfe(); + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } if (*iorder == 1) { - io___244.ciunit = *nout; - s_wsfe(&io___244); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); - e_wsfe(); + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } } goto L160; L150: - io___245.ciunit = *nout; - s_wsfe(&io___245); - do_fio(&c__1, sname, (ftnlen)13); - e_wsfe(); + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); if (*trace) { zprcn3_(ntra, &nc, sname, iorder, side, uplo, transa, diag, &m, &n, & - alpha, &lda, &ldb); + alpha, &lda, &ldb, (ftnlen)12, (ftnlen)1, (ftnlen)1, (ftnlen) + 1, (ftnlen)1); } L160: return 0; -/* L9995: */ +/* 9995 FORMAT(1X, I6, ': ', A12,'(', 4( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ', */ +/* $ ' .' ) */ /* End of ZCHK3. */ } /* zchk3_ */ -/* Subroutine */ int zprcn3_(integer *nout, integer *nc, char *sname, integer - *iorder, char *side, char *uplo, char *transa, char *diag, integer *m, - integer *n, doublecomplex *alpha, integer *lda, integer *ldb) +/* Subroutine */ int zprcn3_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, char* transa, char* diag, integer* m, integer* n, doublecomplex* alpha, integer* lda, integer* ldb, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len, ftnlen transa_len, ftnlen diag_len) { - /* Format strings */ - static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a15,\002,\002," - "a14,\002,\002,a14,\002,\002)"; - static char fmt_9994[] = "(10x,2(a15,\002,\002),2(i3,\002,\002),\002 " - "(\002,f4.1,\002,\002,f4.1,\002), A,\002,i3,\002, B,\002,i3,\002)." - "\002)"; /* Local variables */ - char ca[14], cd[14], cs[14], cu[14], crc[14]; - - /* Fortran I/O blocks */ - static cilist io___251 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___252 = { 0, 0, 0, fmt_9994, 0 }; - + static char ca[14], cd[14], cs[14], cu[14], crc[14]; if (*(unsigned char *)side == 'L') { s_copy(cs, " CblasLeft", (ftnlen)14, (ftnlen)14); @@ -2756,134 +2074,61 @@ L160: } else { s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); } - io___251.ciunit = *nout; - s_wsfe(&io___251); - do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, crc, (ftnlen)14); - do_fio(&c__1, cs, (ftnlen)14); - do_fio(&c__1, cu, (ftnlen)14); - e_wsfe(); - io___252.ciunit = *nout; - s_wsfe(&io___252); - do_fio(&c__1, ca, (ftnlen)14); - do_fio(&c__1, cd, (ftnlen)14); - do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal)); - do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); - e_wsfe(); - return 0; + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cs,cu); + printf(" %s %s %d %d (%4.1lf,%4.1lf) A %d B %d\n",ca,cd,*m,*n,alpha->r,alpha->i,*lda,*ldb); + +return 0; } /* zprcn3_ */ -/* Subroutine */ int zchk4_(char *sname, doublereal *eps, doublereal *thresh, - integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * - alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex * - a, doublecomplex *aa, doublecomplex *as, doublecomplex *b, - doublecomplex *bb, doublecomplex *bs, doublecomplex *c__, - doublecomplex *cc, doublecomplex *cs, doublecomplex *ct, doublereal * - g, integer *iorder) +/* Subroutine */ int zchk4_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ - static char icht[2] = "NC"; - static char ichu[2] = "UL"; - - /* Format strings */ - static char fmt_9992[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " - "TAKEN ON VALID\002,\002 CALL *******\002)"; - static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" - " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; - static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" - "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " - " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_9995[] = "(\002 THESE ARE THE RESULTS FOR COLUMN" - " \002,i3)"; - static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" - "BER:\002)"; + static char icht[2+1] = "NC"; + static char ichu[2+1] = "UL"; /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; doublecomplex z__1; - alist al__1; /* Local variables */ - doublecomplex beta; - integer ldas, ldcs; - logical same, conj; - doublecomplex bets; - doublereal rals; - logical tran, null; - char uplo[1]; - integer i__, j, k, n; - doublecomplex alpha; - doublereal rbeta; - logical isame[13]; - extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, - integer *, doublecomplex *, integer *, doublecomplex *, integer *, - logical *, doublecomplex *); - integer nargs; - extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, doublecomplex *, doublereal *, doublecomplex *, - integer *, doublereal *, doublereal *, logical *, integer *, - logical *); - doublereal rbets; - logical reset; - char trans[1]; - logical upper; - char uplos[1]; - integer ia, ib, jc, ma, na; - extern /* Subroutine */ int zprcn4_(integer *, integer *, char *, integer - *, char *, char *, integer *, integer *, doublecomplex *, integer - *, doublecomplex *, integer *); - integer nc; - extern /* Subroutine */ int zprcn6_(integer *, integer *, char *, integer - *, char *, char *, integer *, integer *, doublereal *, integer *, - doublereal *, integer *); - integer ik, in, jj, lj, ks, ns; - doublereal ralpha; - extern /* Subroutine */ int czherk_(integer *, char *, char *, integer *, - integer *, doublereal *, doublecomplex *, integer *, doublereal *, - doublecomplex *, integer *); - doublereal errmax; - extern logical lzeres_(char *, char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *); - char transs[1], transt[1]; - extern /* Subroutine */ int czsyrk_(integer *, char *, char *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, doublecomplex *, integer *); - integer laa, lda, lcc, ldc; - doublecomplex als; - integer ict, icu; - doublereal err; - extern logical lze_(doublecomplex *, doublecomplex *, integer *); - - /* Fortran I/O blocks */ - static cilist io___294 = { 0, 0, 0, fmt_9992, 0 }; - static cilist io___297 = { 0, 0, 0, fmt_9998, 0 }; - static cilist io___304 = { 0, 0, 0, fmt_10000, 0 }; - static cilist io___305 = { 0, 0, 0, fmt_10001, 0 }; - static cilist io___306 = { 0, 0, 0, fmt_10002, 0 }; - static cilist io___307 = { 0, 0, 0, fmt_10003, 0 }; - static cilist io___308 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___309 = { 0, 0, 0, fmt_9996, 0 }; - - + static doublecomplex beta; + static integer ldas, ldcs; + static logical same, isconj; + static doublecomplex bets; + static doublereal rals; + static logical tran, null; + static char uplo[1]; + static integer i__, j, k, n; + static doublecomplex alpha; + static doublereal rbeta; + static logical isame[13]; + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); + static integer nargs; + extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); + static doublereal rbets; + static logical reset; + static char trans[1]; + static logical upper; + static char uplos[1]; + static integer ia, ib, jc, ma, na; + extern /* Subroutine */ int zprcn4_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); + static integer nc; + extern /* Subroutine */ int zprcn6_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen); + static integer ik, in, jj, lj, ks, ns; + static doublereal ralpha; + extern /* Subroutine */ int czherk_(integer*, char*, char*, integer*, integer*, doublereal*, doublecomplex*, integer*, doublereal*, doublecomplex*, integer*, ftnlen, ftnlen); + static doublereal errmax; + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + static char transs[1], transt[1]; + extern /* Subroutine */ int czsyrk_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + static integer laa, lda, lcc, ldc; + static doublecomplex als; + static integer ict, icu; + static doublereal err; + extern logical lze_(doublecomplex*, doublecomplex*, integer*); /* Tests ZHERK and ZSYRK. */ @@ -2895,6 +2140,17 @@ L160: /* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ /* Sven Hammarling, Numerical Algorithms Group Ltd. */ +/* .. Parameters .. */ +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. Local Arrays .. */ +/* .. External Functions .. */ +/* .. External Subroutines .. */ +/* .. Intrinsic Functions .. */ +/* .. Scalars in Common .. */ +/* .. Common blocks .. */ +/* .. Data statements .. */ /* Parameter adjustments */ --idim; --alf; @@ -2918,12 +2174,15 @@ L160: a -= a_offset; /* Function Body */ - conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; +/* .. Executable Statements .. */ + isconj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; nargs = 10; nc = 0; reset = TRUE_; errmax = 0.; + rals = 1.; + rbets = 1.; i__1 = *nidim; for (in = 1; in <= i__1; ++in) { @@ -2946,7 +2205,7 @@ L160: for (ict = 1; ict <= 2; ++ict) { *(unsigned char *)trans = *(unsigned char *)&icht[ict - 1]; tran = *(unsigned char *)trans == 'C'; - if (tran && ! conj) { + if (tran && ! isconj) { *(unsigned char *)trans = 'T'; } if (tran) { @@ -2970,7 +2229,7 @@ L160: /* Generate the matrix A. */ zmake_("ge", " ", " ", &ma, &na, &a[a_offset], nmax, &aa[1], & - lda, &reset, &c_b1); + lda, &reset, &c_b1, (ftnlen)2, (ftnlen)1, (ftnlen)1); for (icu = 1; icu <= 2; ++icu) { *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1]; @@ -2980,7 +2239,7 @@ L160: for (ia = 1; ia <= i__3; ++ia) { i__4 = ia; alpha.r = alf[i__4].r, alpha.i = alf[i__4].i; - if (conj) { + if (isconj) { ralpha = alpha.r; z__1.r = ralpha, z__1.i = 0.; alpha.r = z__1.r, alpha.i = z__1.i; @@ -2990,22 +2249,22 @@ L160: for (ib = 1; ib <= i__4; ++ib) { i__5 = ib; beta.r = bet[i__5].r, beta.i = bet[i__5].i; - if (conj) { + if (isconj) { rbeta = beta.r; z__1.r = rbeta, z__1.i = 0.; beta.r = z__1.r, beta.i = z__1.i; } null = n <= 0; - if (conj) { - null = null || (k <= 0 || ralpha == 0.) && - rbeta == 1.; + if (isconj) { + null = null ||( (k <= 0 || ralpha == 0.) && + rbeta == 1.); } /* Generate the matrix C. */ zmake_(sname + 7, uplo, " ", &n, &n, &c__[ c_offset], nmax, &cc[1], &ldc, &reset, & - c_b1); + c_b1, (ftnlen)2, (ftnlen)1, (ftnlen)1); ++nc; @@ -3016,7 +2275,7 @@ L160: trans; ns = n; ks = k; - if (conj) { + if (isconj) { rals = ralpha; } else { als.r = alpha.r, als.i = alpha.i; @@ -3030,7 +2289,7 @@ L160: /* L10: */ } ldas = lda; - if (conj) { + if (isconj) { rbets = rbeta; } else { bets.r = beta.r, bets.i = beta.i; @@ -3047,40 +2306,42 @@ L160: /* Call the subroutine. */ - if (conj) { + if (isconj) { if (*trace) { zprcn6_(ntra, &nc, sname, iorder, uplo, trans, &n, &k, &ralpha, &lda, & - rbeta, &ldc); + rbeta, &ldc, (ftnlen)12, (ftnlen) + 1, (ftnlen)1); } if (*rewi) { - al__1.aerr = 0; +/* al__1.aerr = 0; al__1.aunit = *ntra; - f_rew(&al__1); + f_rew(&al__1);*/ } czherk_(iorder, uplo, trans, &n, &k, &ralpha, - &aa[1], &lda, &rbeta, &cc[1], &ldc); + &aa[1], &lda, &rbeta, &cc[1], &ldc, ( + ftnlen)1, (ftnlen)1); } else { if (*trace) { zprcn4_(ntra, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, & - beta, &ldc); + beta, &ldc, (ftnlen)12, (ftnlen)1, + (ftnlen)1); } if (*rewi) { - al__1.aerr = 0; +/* al__1.aerr = 0; al__1.aunit = *ntra; - f_rew(&al__1); + f_rew(&al__1);*/ } czsyrk_(iorder, uplo, trans, &n, &k, &alpha, & - aa[1], &lda, &beta, &cc[1], &ldc); + aa[1], &lda, &beta, &cc[1], &ldc, ( + ftnlen)1, (ftnlen)1); } /* Check if error-exit was taken incorrectly. */ if (! infoc_1.ok) { - io___294.ciunit = *nout; - s_wsfe(&io___294); - e_wsfe(); + printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); *fatal = TRUE_; goto L120; } @@ -3093,7 +2354,7 @@ L160: char *)trans; isame[2] = ns == n; isame[3] = ks == k; - if (conj) { + if (isconj) { isame[4] = rals == ralpha; } else { isame[4] = als.r == alpha.r && als.i == @@ -3101,7 +2362,7 @@ L160: } isame[5] = lze_(&as[1], &aa[1], &laa); isame[6] = ldas == lda; - if (conj) { + if (isconj) { isame[7] = rbets == rbeta; } else { isame[7] = bets.r == beta.r && bets.i == @@ -3111,7 +2372,8 @@ L160: isame[8] = lze_(&cs[1], &cc[1], &lcc); } else { isame[8] = lzeres_(sname + 7, uplo, &n, &n, & - cs[1], &cc[1], &ldc); + cs[1], &cc[1], &ldc, (ftnlen)2, ( + ftnlen)1); } isame[9] = ldcs == ldc; @@ -3123,11 +2385,7 @@ L160: for (i__ = 1; i__ <= i__5; ++i__) { same = same && isame[i__ - 1]; if (! isame[i__ - 1]) { - io___297.ciunit = *nout; - s_wsfe(&io___297); - do_fio(&c__1, (char *)&i__, (ftnlen) - sizeof(integer)); - e_wsfe(); + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); } /* L30: */ } @@ -3140,7 +2398,7 @@ L160: /* Check the result column by column. */ - if (conj) { + if (isconj) { *(unsigned char *)transt = 'C'; } else { *(unsigned char *)transt = 'T'; @@ -3162,7 +2420,8 @@ L160: nmax, &beta, &c__[jj + j * c_dim1], nmax, &ct[1], &g[1], &cc[jc], &ldc, eps, &err, - fatal, nout, &c_true); + fatal, nout, &c_true, (ftnlen) + 1, (ftnlen)1); } else { zmmch_("N", transt, &lj, &c__1, &k, & alpha, &a[jj + a_dim1], nmax, @@ -3170,7 +2429,7 @@ L160: c__[jj + j * c_dim1], nmax, & ct[1], &g[1], &cc[jc], &ldc, eps, &err, fatal, nout, & - c_true); + c_true, (ftnlen)1, (ftnlen)1); } if (upper) { jc += ldc; @@ -3211,89 +2470,57 @@ L100: if (errmax < *thresh) { if (*iorder == 0) { - io___304.ciunit = *nout; - s_wsfe(&io___304); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } if (*iorder == 1) { - io___305.ciunit = *nout; - s_wsfe(&io___305); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } } else { if (*iorder == 0) { - io___306.ciunit = *nout; - s_wsfe(&io___306); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); - e_wsfe(); + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } if (*iorder == 1) { - io___307.ciunit = *nout; - s_wsfe(&io___307); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); - e_wsfe(); + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } } goto L130; L110: if (n > 1) { - io___308.ciunit = *nout; - s_wsfe(&io___308); - do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer)); - e_wsfe(); + printf(" THESE ARE THE RESULTS FOR COLUMN %d:\n",j); } L120: - io___309.ciunit = *nout; - s_wsfe(&io___309); - do_fio(&c__1, sname, (ftnlen)13); - e_wsfe(); - if (conj) { + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); + if (isconj) { zprcn6_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &ralpha, &lda, - &rbeta, &ldc); + &rbeta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1); } else { zprcn4_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, & - beta, &ldc); + beta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1); } L130: return 0; -/* L9994: */ -/* L9993: */ +/* 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') ', */ +/* $ ' .' ) */ +/* 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1, */ +/* $ '), C,', I3, ') .' ) */ /* End of CCHK4. */ } /* zchk4_ */ -/* Subroutine */ int zprcn4_(integer *nout, integer *nc, char *sname, integer - *iorder, char *uplo, char *transa, integer *n, integer *k, - doublecomplex *alpha, integer *lda, doublecomplex *beta, integer *ldc) +/* Subroutine */ int zprcn4_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublecomplex* alpha, integer* lda, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len) { - /* Format strings */ - static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a15,\002," - "\002))"; - static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002" - ",f4.1,\002), A,\002,i3,\002, (\002,f4.1,\002,\002,f4.1,\002), C" - ",\002,i3,\002).\002)"; - /* Local variables */ - char ca[14], cu[14], crc[14]; - - /* Fortran I/O blocks */ - static cilist io___313 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___314 = { 0, 0, 0, fmt_9994, 0 }; - + static char ca[14], cu[14], crc[14]; if (*(unsigned char *)uplo == 'U') { s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); @@ -3312,45 +2539,19 @@ L130: } else { s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); } - io___313.ciunit = *nout; - s_wsfe(&io___313); - do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, crc, (ftnlen)14); - do_fio(&c__1, cu, (ftnlen)14); - do_fio(&c__1, ca, (ftnlen)14); - e_wsfe(); - io___314.ciunit = *nout; - s_wsfe(&io___314); - do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal)); - do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(doublereal)); - do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); - e_wsfe(); - return 0; + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca); + printf("( %d %d (%4.1lf,%4.1lf) A %d (%4.1lf,%4.1lf) C %d\n",*n,*k,alpha->r,alpha->i,*lda,beta->r,beta->i,*ldc); + +return 0; } /* zprcn4_ */ -/* Subroutine */ int zprcn6_(integer *nout, integer *nc, char *sname, integer - *iorder, char *uplo, char *transa, integer *n, integer *k, doublereal - *alpha, integer *lda, doublereal *beta, integer *ldc) +/* Subroutine */ int zprcn6_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublereal* alpha, integer* lda, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len) { - /* Format strings */ - static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a15,\002," - "\002))"; - static char fmt_9994[] = "(10x,2(i3,\002,\002),f4.1,\002, A,\002,i3" - ",\002,\002,f4.1,\002, C,\002,i3,\002).\002)"; /* Local variables */ - char ca[14], cu[14], crc[14]; - - /* Fortran I/O blocks */ - static cilist io___318 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___319 = { 0, 0, 0, fmt_9994, 0 }; - + static char ca[14], cu[14], crc[14]; if (*(unsigned char *)uplo == 'U') { s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); @@ -3369,132 +2570,58 @@ L130: } else { s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); } - io___318.ciunit = *nout; - s_wsfe(&io___318); - do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, crc, (ftnlen)14); - do_fio(&c__1, cu, (ftnlen)14); - do_fio(&c__1, ca, (ftnlen)14); - e_wsfe(); - io___319.ciunit = *nout; - s_wsfe(&io___319); - do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*alpha), (ftnlen)sizeof(doublereal)); - do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*beta), (ftnlen)sizeof(doublereal)); - do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); - e_wsfe(); - return 0; + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca); + printf("( %d %d %4.1lf A %d %4.1lf C %d\n",*n,*k,*alpha,*lda,*beta,*ldc); + +return 0; } /* zprcn6_ */ -/* Subroutine */ int zchk5_(char *sname, doublereal *eps, doublereal *thresh, - integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * - alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex * - ab, doublecomplex *aa, doublecomplex *as, doublecomplex *bb, - doublecomplex *bs, doublecomplex *c__, doublecomplex *cc, - doublecomplex *cs, doublecomplex *ct, doublereal *g, doublecomplex *w, - integer *iorder) +/* Subroutine */ int zchk5_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* ab, doublecomplex* aa, doublecomplex* as, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, doublecomplex* w, integer* iorder, ftnlen sname_len) { /* Initialized data */ - static char icht[2] = "NC"; - static char ichu[2] = "UL"; - - /* Format strings */ - static char fmt_9992[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " - "TAKEN ON VALID\002,\002 CALL *******\002)"; - static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" - " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; - static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" - "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " - " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_9995[] = "(\002 THESE ARE THE RESULTS FOR COLUMN" - " \002,i3)"; - static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" - "BER:\002)"; + static char icht[2+1] = "NC"; + static char ichu[2+1] = "UL"; /* System generated locals */ integer c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8; doublecomplex z__1, z__2; - alist al__1; /* Local variables */ - integer jjab; - doublecomplex beta; - integer ldas, ldbs, ldcs; - logical same, conj; - doublecomplex bets; - logical tran, null; - char uplo[1]; - integer i__, j, k, n; - doublecomplex alpha; - doublereal rbeta; - logical isame[13]; - extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, - integer *, doublecomplex *, integer *, doublecomplex *, integer *, - logical *, doublecomplex *); - integer nargs; - extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, doublecomplex *, doublereal *, doublecomplex *, - integer *, doublereal *, doublereal *, logical *, integer *, - logical *); - doublereal rbets; - logical reset; - char trans[1]; - logical upper; - char uplos[1]; - integer ia, ib, jc, ma, na, nc; - extern /* Subroutine */ int zprcn5_(integer *, integer *, char *, integer - *, char *, char *, integer *, integer *, doublecomplex *, integer - *, integer *, doublecomplex *, integer *), - zprcn7_(integer *, integer *, char *, integer *, char *, char *, - integer *, integer *, doublecomplex *, integer *, integer *, - doublereal *, integer *); - integer ik, in, jj, lj, ks, ns; - doublereal errmax; - extern logical lzeres_(char *, char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *); - char transs[1], transt[1]; - extern /* Subroutine */ int czher2k_(integer *, char *, char *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublereal *, doublecomplex *, - integer *); - integer laa, lbb, lda, lcc, ldb, ldc; - doublecomplex als; - integer ict, icu; - extern /* Subroutine */ int czsyr2k_(integer *, char *, char *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *); - doublereal err; - extern logical lze_(doublecomplex *, doublecomplex *, integer *); - - /* Fortran I/O blocks */ - static cilist io___362 = { 0, 0, 0, fmt_9992, 0 }; - static cilist io___365 = { 0, 0, 0, fmt_9998, 0 }; - static cilist io___373 = { 0, 0, 0, fmt_10000, 0 }; - static cilist io___374 = { 0, 0, 0, fmt_10001, 0 }; - static cilist io___375 = { 0, 0, 0, fmt_10002, 0 }; - static cilist io___376 = { 0, 0, 0, fmt_10003, 0 }; - static cilist io___377 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___378 = { 0, 0, 0, fmt_9996, 0 }; - - + static integer jjab; + static doublecomplex beta; + static integer ldas, ldbs, ldcs; + static logical same, isconj; + static doublecomplex bets; + static logical tran, null; + static char uplo[1]; + static integer i__, j, k, n; + static doublecomplex alpha; + static doublereal rbeta; + static logical isame[13]; + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); + static integer nargs; + extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); + static doublereal rbets; + static logical reset; + static char trans[1]; + static logical upper; + static char uplos[1]; + static integer ia, ib, jc, ma, na, nc; + extern /* Subroutine */ int zprcn5_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ int zprcn7_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen); + static integer ik, in, jj, lj, ks, ns; + static doublereal errmax; + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + static char transs[1], transt[1]; + extern /* Subroutine */ int czher2k_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublereal*, doublecomplex*, integer*, ftnlen, ftnlen); + static integer laa, lbb, lda, lcc, ldb, ldc; + static doublecomplex als; + static integer ict, icu; + extern /* Subroutine */ int czsyr2k_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + static doublereal err; + extern logical lze_(doublecomplex*, doublecomplex*, integer*); /* Tests ZHER2K and ZSYR2K. */ @@ -3506,6 +2633,17 @@ L130: /* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ /* Sven Hammarling, Numerical Algorithms Group Ltd. */ +/* .. Parameters .. */ +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. Local Arrays .. */ +/* .. External Functions .. */ +/* .. External Subroutines .. */ +/* .. Intrinsic Functions .. */ +/* .. Scalars in Common .. */ +/* .. Common blocks .. */ +/* .. Data statements .. */ /* Parameter adjustments */ --idim; --alf; @@ -3525,7 +2663,8 @@ L130: --ab; /* Function Body */ - conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; +/* .. Executable Statements .. */ + isconj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; nargs = 12; nc = 0; @@ -3553,7 +2692,7 @@ L130: for (ict = 1; ict <= 2; ++ict) { *(unsigned char *)trans = *(unsigned char *)&icht[ict - 1]; tran = *(unsigned char *)trans == 'C'; - if (tran && ! conj) { + if (tran && ! isconj) { *(unsigned char *)trans = 'T'; } if (tran) { @@ -3579,10 +2718,12 @@ L130: if (tran) { i__3 = *nmax << 1; zmake_("ge", " ", " ", &ma, &na, &ab[1], &i__3, &aa[1], & - lda, &reset, &c_b1); + lda, &reset, &c_b1, (ftnlen)2, (ftnlen)1, (ftnlen) + 1); } else { zmake_("ge", " ", " ", &ma, &na, &ab[1], nmax, &aa[1], & - lda, &reset, &c_b1); + lda, &reset, &c_b1, (ftnlen)2, (ftnlen)1, (ftnlen) + 1); } /* Generate the matrix B. */ @@ -3592,10 +2733,12 @@ L130: if (tran) { i__3 = *nmax << 1; zmake_("ge", " ", " ", &ma, &na, &ab[k + 1], &i__3, &bb[1] - , &ldb, &reset, &c_b1); + , &ldb, &reset, &c_b1, (ftnlen)2, (ftnlen)1, ( + ftnlen)1); } else { zmake_("ge", " ", " ", &ma, &na, &ab[k * *nmax + 1], nmax, - &bb[1], &ldb, &reset, &c_b1); + &bb[1], &ldb, &reset, &c_b1, (ftnlen)2, (ftnlen) + 1, (ftnlen)1); } for (icu = 1; icu <= 2; ++icu) { @@ -3611,22 +2754,22 @@ L130: for (ib = 1; ib <= i__4; ++ib) { i__5 = ib; beta.r = bet[i__5].r, beta.i = bet[i__5].i; - if (conj) { + if (isconj) { rbeta = beta.r; z__1.r = rbeta, z__1.i = 0.; beta.r = z__1.r, beta.i = z__1.i; } null = n <= 0; - if (conj) { - null = null || (k <= 0 || alpha.r == 0. && - alpha.i == 0.) && rbeta == 1.; + if (isconj) { + null = null ||( (k <= 0 || (alpha.r == 0. && + alpha.i == 0.)) && rbeta == 1.); } /* Generate the matrix C. */ zmake_(sname + 7, uplo, " ", &n, &n, &c__[ c_offset], nmax, &cc[1], &ldc, &reset, & - c_b1); + c_b1, (ftnlen)2, (ftnlen)1, (ftnlen)1); ++nc; @@ -3656,7 +2799,7 @@ L130: /* L20: */ } ldbs = ldb; - if (conj) { + if (isconj) { rbets = rbeta; } else { bets.r = beta.r, bets.i = beta.i; @@ -3673,42 +2816,42 @@ L130: /* Call the subroutine. */ - if (conj) { + if (isconj) { if (*trace) { zprcn7_(ntra, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, &ldb, - &rbeta, &ldc); + &rbeta, &ldc, (ftnlen)12, ( + ftnlen)1, (ftnlen)1); } if (*rewi) { - al__1.aerr = 0; +/* al__1.aerr = 0; al__1.aunit = *ntra; - f_rew(&al__1); + f_rew(&al__1);*/ } czher2k_(iorder, uplo, trans, &n, &k, &alpha, &aa[1], &lda, &bb[1], &ldb, &rbeta, & - cc[1], &ldc); + cc[1], &ldc, (ftnlen)1, (ftnlen)1); } else { if (*trace) { zprcn5_(ntra, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, &ldb, - &beta, &ldc); + &beta, &ldc, (ftnlen)12, (ftnlen) + 1, (ftnlen)1); } if (*rewi) { - al__1.aerr = 0; +/* al__1.aerr = 0; al__1.aunit = *ntra; - f_rew(&al__1); + f_rew(&al__1);*/ } czsyr2k_(iorder, uplo, trans, &n, &k, &alpha, &aa[1], &lda, &bb[1], &ldb, &beta, & - cc[1], &ldc); + cc[1], &ldc, (ftnlen)1, (ftnlen)1); } /* Check if error-exit was taken incorrectly. */ if (! infoc_1.ok) { - io___362.ciunit = *nout; - s_wsfe(&io___362); - e_wsfe(); + printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); *fatal = TRUE_; goto L150; } @@ -3726,7 +2869,7 @@ L130: isame[6] = ldas == lda; isame[7] = lze_(&bs[1], &bb[1], &lbb); isame[8] = ldbs == ldb; - if (conj) { + if (isconj) { isame[9] = rbets == rbeta; } else { isame[9] = bets.r == beta.r && bets.i == @@ -3736,7 +2879,7 @@ L130: isame[10] = lze_(&cs[1], &cc[1], &lcc); } else { isame[10] = lzeres_("he", uplo, &n, &n, &cs[1] - , &cc[1], &ldc); + , &cc[1], &ldc, (ftnlen)2, (ftnlen)1); } isame[11] = ldcs == ldc; @@ -3748,12 +2891,8 @@ L130: for (i__ = 1; i__ <= i__5; ++i__) { same = same && isame[i__ - 1]; if (! isame[i__ - 1]) { - io___365.ciunit = *nout; - s_wsfe(&io___365); - do_fio(&c__1, (char *)&i__, (ftnlen) - sizeof(integer)); - e_wsfe(); - } + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); + } /* L40: */ } if (! same) { @@ -3765,7 +2904,7 @@ L130: /* Check the result column by column. */ - if (conj) { + if (isconj) { *(unsigned char *)transt = 'C'; } else { *(unsigned char *)transt = 'T'; @@ -3785,7 +2924,7 @@ L130: i__6 = k; for (i__ = 1; i__ <= i__6; ++i__) { i__7 = i__; - i__8 = (j - 1 << 1) * *nmax + k + + i__8 = ((j - 1) << 1) * *nmax + k + i__; z__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8].i, @@ -3794,17 +2933,17 @@ L130: i__8].r; w[i__7].r = z__1.r, w[i__7].i = z__1.i; - if (conj) { + if (isconj) { i__7 = k + i__; d_cnjg(&z__2, &alpha); - i__8 = (j - 1 << 1) * *nmax + i__; + i__8 = ((j - 1) << 1) * *nmax + i__; z__1.r = z__2.r * ab[i__8].r - z__2.i * ab[i__8].i, z__1.i = z__2.r * ab[i__8].i + z__2.i * ab[ i__8].r; w[i__7].r = z__1.r, w[i__7].i = z__1.i; } else { i__7 = k + i__; - i__8 = (j - 1 << 1) * *nmax + i__; + i__8 = ((j - 1) << 1) * *nmax + i__; z__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] .i, z__1.i = alpha.r * ab[i__8].i + alpha.i * ab[i__8].r; @@ -3820,11 +2959,12 @@ L130: 1], &i__8, &beta, &c__[jj + j * c_dim1], nmax, &ct[1], &g[1] , &cc[jc], &ldc, eps, &err, - fatal, nout, &c_true); + fatal, nout, &c_true, (ftnlen) + 1, (ftnlen)1); } else { i__6 = k; for (i__ = 1; i__ <= i__6; ++i__) { - if (conj) { + if (isconj) { i__7 = i__; d_cnjg(&z__2, &ab[(k + i__ - 1) * *nmax + j]); z__1.r = alpha.r * z__2.r - alpha.i * z__2.i, @@ -3861,7 +3001,8 @@ L130: i__7, &beta, &c__[jj + j * c_dim1], nmax, &ct[1], &g[1], &cc[jc], &ldc, eps, &err, - fatal, nout, &c_true); + fatal, nout, &c_true, (ftnlen) + 1, (ftnlen)1); } if (upper) { jc += ldc; @@ -3905,90 +3046,57 @@ L130: if (errmax < *thresh) { if (*iorder == 0) { - io___373.ciunit = *nout; - s_wsfe(&io___373); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } if (*iorder == 1) { - io___374.ciunit = *nout; - s_wsfe(&io___374); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } } else { if (*iorder == 0) { - io___375.ciunit = *nout; - s_wsfe(&io___375); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); - e_wsfe(); + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } if (*iorder == 1) { - io___376.ciunit = *nout; - s_wsfe(&io___376); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); - e_wsfe(); + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } } goto L160; L140: if (n > 1) { - io___377.ciunit = *nout; - s_wsfe(&io___377); - do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer)); - e_wsfe(); + printf(" THESE ARE THE RESULTS FOR COLUMN %d:\n",j); } L150: - io___378.ciunit = *nout; - s_wsfe(&io___378); - do_fio(&c__1, sname, (ftnlen)13); - e_wsfe(); - if (conj) { + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); + if (isconj) { zprcn7_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, & - ldb, &rbeta, &ldc); + ldb, &rbeta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1); } else { zprcn5_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, & - ldb, &beta, &ldc); + ldb, &beta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1); } L160: return 0; -/* L9994: */ -/* L9993: */ +/* 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, */ +/* $ ', C,', I3, ') .' ) */ +/* 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, */ +/* $ ',', F4.1, '), C,', I3, ') .' ) */ /* End of ZCHK5. */ } /* zchk5_ */ -/* Subroutine */ int zprcn5_(integer *nout, integer *nc, char *sname, integer - *iorder, char *uplo, char *transa, integer *n, integer *k, - doublecomplex *alpha, integer *lda, integer *ldb, doublecomplex *beta, - integer *ldc) +/* Subroutine */ int zprcn5_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublecomplex* alpha, integer* lda, integer* ldb, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len) { - /* Format strings */ - static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a15,\002," - "\002))"; - static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002" - ",f4.1,\002), A,\002,i3,\002, B\002,i3,\002, (\002,f4.1,\002,\002" - ",f4.1,\002), C,\002,i3,\002).\002)"; - /* Local variables */ - char ca[14], cu[14], crc[14]; - - /* Fortran I/O blocks */ - static cilist io___382 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___383 = { 0, 0, 0, fmt_9994, 0 }; - + static char ca[14], cu[14], crc[14]; if (*(unsigned char *)uplo == 'U') { s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); @@ -4007,48 +3115,19 @@ L160: } else { s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); } - io___382.ciunit = *nout; - s_wsfe(&io___382); - do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, crc, (ftnlen)14); - do_fio(&c__1, cu, (ftnlen)14); - do_fio(&c__1, ca, (ftnlen)14); - e_wsfe(); - io___383.ciunit = *nout; - s_wsfe(&io___383); - do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal)); - do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(doublereal)); - do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); - e_wsfe(); - return 0; + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca); + printf("%d %d (%4.1lf,%4.1lf) , A, %d, B, %d, (%4.1lf,%4.1lf) , C, %d.\n",*n,*k,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc); + +return 0; } /* zprcn5_ */ -/* Subroutine */ int zprcn7_(integer *nout, integer *nc, char *sname, integer - *iorder, char *uplo, char *transa, integer *n, integer *k, - doublecomplex *alpha, integer *lda, integer *ldb, doublereal *beta, - integer *ldc) +/* Subroutine */ int zprcn7_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublecomplex* alpha, integer* lda, integer* ldb, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len) { - /* Format strings */ - static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a15,\002," - "\002))"; - static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002" - ",f4.1,\002), A,\002,i3,\002, B\002,i3,\002,\002,f4.1,\002, C," - "\002,i3,\002).\002)"; /* Local variables */ - char ca[14], cu[14], crc[14]; - - /* Fortran I/O blocks */ - static cilist io___387 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___388 = { 0, 0, 0, fmt_9994, 0 }; - + static char ca[14], cu[14], crc[14]; if (*(unsigned char *)uplo == 'U') { s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); @@ -4067,31 +3146,14 @@ L160: } else { s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); } - io___387.ciunit = *nout; - s_wsfe(&io___387); - do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, crc, (ftnlen)14); - do_fio(&c__1, cu, (ftnlen)14); - do_fio(&c__1, ca, (ftnlen)14); - e_wsfe(); - io___388.ciunit = *nout; - s_wsfe(&io___388); - do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal)); - do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*beta), (ftnlen)sizeof(doublereal)); - do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); - e_wsfe(); - return 0; + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca); + printf("%d %d (%4.1lf,%4.1lf), A, %d, B, %d, %4.1lf, C, %d.\n",*n,*k,alpha->r,alpha->i,*lda,*ldb,*beta,*ldc); + +return 0; } /* zprcn7_ */ -/* Subroutine */ int zmake_(char *type__, char *uplo, char *diag, integer *m, - integer *n, doublecomplex *a, integer *nmax, doublecomplex *aa, - integer *lda, logical *reset, doublecomplex *transl) +/* Subroutine */ int zmake_(char* type__, char* uplo, char* diag, integer* m, integer* n, doublecomplex* a, integer* nmax, doublecomplex* aa, integer* lda, logical* reset, doublecomplex* transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len) { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4; @@ -4099,13 +3161,13 @@ L160: doublecomplex z__1, z__2; /* Local variables */ - integer ibeg, iend; - extern /* Double Complex */ VOID zbeg_(doublecomplex *, logical *); - logical unit; - integer i__, j; - logical lower, upper; - integer jj; - logical gen, her, tri, sym; + static integer ibeg, iend; + extern /* Double Complex */ VOID zbeg_(doublecomplex*, logical*); + static logical unit; + static integer i__, j; + static logical lower, upper; + static integer jj; + static logical gen, her, tri, sym; /* Generates values for an M by N matrix A. */ @@ -4122,6 +3184,13 @@ L160: /* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ /* Sven Hammarling, Numerical Algorithms Group Ltd. */ +/* .. Parameters .. */ +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. External Functions .. */ +/* .. Intrinsic Functions .. */ +/* .. Executable Statements .. */ /* Parameter adjustments */ a_dim1 = *nmax; a_offset = 1 + a_dim1 * 1; @@ -4143,7 +3212,7 @@ L160: for (j = 1; j <= i__1; ++j) { i__2 = *m; for (i__ = 1; i__ <= i__2; ++i__) { - if (gen || upper && i__ <= j || lower && i__ >= j) { + if (gen || (upper && i__ <= j) || (lower && i__ >= j)) { i__3 = i__ + j * a_dim1; zbeg_(&z__2, reset); z__1.r = z__2.r + transl->r, z__1.i = z__2.i + transl->i; @@ -4266,22 +3335,8 @@ L160: } /* zmake_ */ -/* Subroutine */ int zmmch_(char *transa, char *transb, integer *m, integer * - n, integer *kk, doublecomplex *alpha, doublecomplex *a, integer *lda, - doublecomplex *b, integer *ldb, doublecomplex *beta, doublecomplex * - c__, integer *ldc, doublecomplex *ct, doublereal *g, doublecomplex * - cc, integer *ldcc, doublereal *eps, doublereal *err, logical *fatal, - integer *nout, logical *mv) +/* Subroutine */ int zmmch_(char* transa, char* transb, integer* m, integer* n, integer* kk, doublecomplex* alpha, doublecomplex* a, integer* lda, doublecomplex* b, integer* ldb, doublecomplex* beta, doublecomplex* c__, integer* ldc, doublecomplex* ct, doublereal* g, doublecomplex* cc, integer* ldcc, doublereal* eps, doublereal* err, logical* fatal, integer* nout, logical* mv, ftnlen transa_len, ftnlen transb_len) { - /* Format strings */ - static char fmt_9999[] = "(\002 ******* FATAL ERROR - COMPUTED RESULT IS" - " LESS THAN HAL\002,\002F ACCURATE *******\002,/\002 " - " EXPECTED RE\002,\002SULT COMPUTED R" - "ESULT\002)"; - static char fmt_9998[] = "(1x,i7,2(\002 (\002,g15.6,\002,\002,g15.6," - "\002)\002))"; - static char fmt_9997[] = "(\002 THESE ARE THE RESULTS FOR COLUMN" - " \002,i3)"; /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, @@ -4289,18 +3344,11 @@ L160: doublereal d__1, d__2, d__3, d__4, d__5, d__6; doublecomplex z__1, z__2, z__3, z__4; + double sqrt(double); /* Local variables */ - doublereal erri; - integer i__, j, k; - logical trana, tranb, ctrana, ctranb; - - /* Fortran I/O blocks */ - static cilist io___409 = { 0, 0, 0, fmt_9999, 0 }; - static cilist io___410 = { 0, 0, 0, fmt_9998, 0 }; - static cilist io___411 = { 0, 0, 0, fmt_9998, 0 }; - static cilist io___412 = { 0, 0, 0, fmt_9997, 0 }; - - + static doublereal erri; + static integer i__, j, k; + static logical trana, tranb, ctrana, ctranb; /* Checks the results of the computational tests. */ @@ -4312,6 +3360,14 @@ L160: /* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ /* Sven Hammarling, Numerical Algorithms Group Ltd. */ +/* .. Parameters .. */ +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. Intrinsic Functions .. */ +/* .. Statement Functions .. */ +/* .. Statement Function definitions .. */ +/* .. Executable Statements .. */ /* Parameter adjustments */ a_dim1 = *lda; a_offset = 1 + a_dim1 * 1; @@ -4638,35 +3694,19 @@ L160: L230: *fatal = TRUE_; - io___409.ciunit = *nout; - s_wsfe(&io___409); - e_wsfe(); + printf(" ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HALF ACCURATE *******\n"); + printf(" EXPECTED RESULT COMPUTED RESULT\n"); i__1 = *m; for (i__ = 1; i__ <= i__1; ++i__) { if (*mv) { - io___410.ciunit = *nout; - s_wsfe(&io___410); - do_fio(&c__1, (char *)&i__, (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&ct[i__], (ftnlen)sizeof(doublereal)); - do_fio(&c__2, (char *)&cc[i__ + j * cc_dim1], (ftnlen)sizeof( - doublereal)); - e_wsfe(); - } else { - io___411.ciunit = *nout; - s_wsfe(&io___411); - do_fio(&c__1, (char *)&i__, (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&cc[i__ + j * cc_dim1], (ftnlen)sizeof( - doublereal)); - do_fio(&c__2, (char *)&ct[i__], (ftnlen)sizeof(doublereal)); - e_wsfe(); + printf("%7d (%15.6g,%15.6g) (%15.6g,%15.6g)\n",i__,ct[i__].r,ct[i__].i,cc[i__+j*cc_dim1].r,cc[i__+j*cc_dim1].i); + } else { + printf("%7d (%15.6g,%15.6g) (%15.6g,%15.6g)\n",i__,cc[i__+j*cc_dim1].r,cc[i__+j*cc_dim1].i,ct[i__].r,ct[i__].i); } /* L240: */ } if (*n > 1) { - io___412.ciunit = *nout; - s_wsfe(&io___412); - do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer)); - e_wsfe(); + printf(" THESE ARE THE RESULTS FOR COLUMN %d\n",j); } L250: @@ -4677,14 +3717,14 @@ L250: } /* zmmch_ */ -logical lze_(doublecomplex *ri, doublecomplex *rj, integer *lr) +logical lze_(doublecomplex* ri, doublecomplex* rj, integer* lr) { /* System generated locals */ integer i__1, i__2, i__3; logical ret_val; /* Local variables */ - integer i__; + static integer i__; /* Tests if two arrays are identical. */ @@ -4697,6 +3737,10 @@ logical lze_(doublecomplex *ri, doublecomplex *rj, integer *lr) /* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ /* Sven Hammarling, Numerical Algorithms Group Ltd. */ +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. Executable Statements .. */ /* Parameter adjustments */ --rj; --ri; @@ -4722,16 +3766,15 @@ L30: } /* lze_ */ -logical lzeres_(char *type__, char *uplo, integer *m, integer *n, - doublecomplex *aa, doublecomplex *as, integer *lda) +logical lzeres_(char* type__, char* uplo, integer* m, integer* n, doublecomplex *aa, doublecomplex* as, integer* lda, ftnlen type_len, ftnlen uplo_len) { /* System generated locals */ integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2, i__3, i__4; logical ret_val; /* Local variables */ - integer ibeg, iend, i__, j; - logical upper; + static integer ibeg, iend, i__, j; + static logical upper; /* Tests if selected elements in two arrays are equal. */ @@ -4746,6 +3789,10 @@ logical lzeres_(char *type__, char *uplo, integer *m, integer *n, /* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ /* Sven Hammarling, Numerical Algorithms Group Ltd. */ +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. Executable Statements .. */ /* Parameter adjustments */ as_dim1 = *lda; as_offset = 1 + as_dim1 * 1; @@ -4803,7 +3850,7 @@ logical lzeres_(char *type__, char *uplo, integer *m, integer *n, } } -/* L60: */ +/* 60 CONTINUE */ ret_val = TRUE_; goto L80; L70: @@ -4815,7 +3862,7 @@ L80: } /* lzeres_ */ -/* Double Complex */ VOID zbeg_(doublecomplex * ret_val, logical *reset) +/* Double Complex */ VOID zbeg_(doublecomplex* ret_val, logical* reset) { /* System generated locals */ doublereal d__1, d__2; @@ -4836,6 +3883,11 @@ L80: /* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ /* Sven Hammarling, Numerical Algorithms Group Ltd. */ +/* .. Scalar Arguments .. */ +/* .. Local Scalars .. */ +/* .. Save statement .. */ +/* .. Intrinsic Functions .. */ +/* .. Executable Statements .. */ if (*reset) { /* Initialize local variables. */ mi = 891; @@ -4873,7 +3925,7 @@ L10: } /* zbeg_ */ -doublereal ddiff_(doublereal *x, doublereal *y) +doublereal ddiff_(doublereal* x, doublereal* y) { /* System generated locals */ doublereal ret_val; @@ -4887,6 +3939,8 @@ doublereal ddiff_(doublereal *x, doublereal *y) /* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ /* Sven Hammarling, Numerical Algorithms Group Ltd. */ +/* .. Scalar Arguments .. */ +/* .. Executable Statements .. */ ret_val = *x - *y; return ret_val; @@ -4894,4 +3948,4 @@ doublereal ddiff_(doublereal *x, doublereal *y) } /* ddiff_ */ -/* Main program alias */ int zblat3_ () { MAIN__ (); return 0; } +/* Main program alias */ /*int zblat3_ () { MAIN__ (); }*/ From 5d929d2706f92b5fa70122b865dafe72aac6ea84 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 26 Feb 2024 21:00:57 +0100 Subject: [PATCH 11/23] avoid overriding the global USE_GEMM3M --- ctest/CMakeLists.txt | 5 ----- 1 file changed, 5 deletions(-) diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt index d7baadee4..c56a78346 100644 --- a/ctest/CMakeLists.txt +++ b/ctest/CMakeLists.txt @@ -10,11 +10,6 @@ if (CMAKE_Fortran_COMPILER_ID STREQUAL GNU) set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fno-tree-vectorize") endif() -set (USE_GEMM3M 0) -if (${ARCH} MATCHES x86|x86_64|ia64|mips) - set(USE_GEMM3M 1) -endif () - if(WIN32) FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1 "$ErrorActionPreference = \"Stop\"\n" From 28f151808ea3fa3b39e02948d7b52616ce52cdfb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 26 Feb 2024 21:01:46 +0100 Subject: [PATCH 12/23] Avoid overriding the global USE_GEMM3M --- test/CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index b4bf36cee..4ebd5348c 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -22,9 +22,7 @@ if (BUILD_COMPLEX16) list (APPEND OpenBLAS_Tests zblat1 zblat2 zblat3) endif() -set (USE_GEMM3M 0) -if (${ARCH} MATCHES x86|x86_64|ia64|mips) - set(USE_GEMM3M 1) +if (USE_GEMM3M) if (BUILD_COMPLEX) list (APPEND OpenBLAS_Tests cblat3_3m) endif () From 38283f678ed7683132f7b16f82fc6b13602f969a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 26 Feb 2024 22:22:48 +0100 Subject: [PATCH 13/23] Fix portability problems --- utest/test_extensions/test_cgemv_t.c | 4 ++-- utest/test_extensions/test_csbmv.c | 6 +++--- utest/test_extensions/test_idamin.c | 8 +++++--- utest/test_extensions/test_isamin.c | 8 +++++--- utest/test_extensions/test_zgemv_t.c | 4 ++-- 5 files changed, 17 insertions(+), 13 deletions(-) diff --git a/utest/test_extensions/test_cgemv_t.c b/utest/test_extensions/test_cgemv_t.c index aa3281e66..cb4e5ad9e 100644 --- a/utest/test_extensions/test_cgemv_t.c +++ b/utest/test_extensions/test_cgemv_t.c @@ -126,7 +126,7 @@ static float check_cgemv(char api, char order, char trans, blasint m, blasint n, srand_generate(data_cgemv_t.y_test, m * inc_y * 2); // Copy vector y for reference funcs - for (int i = 0; i < m * inc_y * 2; i++) { + for (i = 0; i < m * inc_y * 2; i++) { data_cgemv_t.y_verify[i] = data_cgemv_t.y_test[i]; } @@ -1129,4 +1129,4 @@ CTEST(cgemv, c_api_xerbla_invalid_order_col_major) int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); ASSERT_EQUAL(TRUE, passed); } -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/test_csbmv.c b/utest/test_extensions/test_csbmv.c index 8e8ce4530..41c24a2b7 100644 --- a/utest/test_extensions/test_csbmv.c +++ b/utest/test_extensions/test_csbmv.c @@ -188,7 +188,7 @@ static float check_csbmv(char uplo, blasint n, blasint k, float *alpha, blasint char trans = 'N'; // Symmetric band packed matrix for sbmv - float a[lda * n * 2]; + float *a = (float*) malloc(lda * n * 2 * sizeof(float)); // Fill symmetric packed matrix sp_matrix, vector b_test, vector c_test srand_generate(data_csbmv.sp_matrix, n * (n + 1)); @@ -216,7 +216,7 @@ static float check_csbmv(char uplo, blasint n, blasint k, float *alpha, blasint // Find the differences between output vector caculated by csbmv and cgemv for (i = 0; i < n * inc_c * 2; i++) data_csbmv.c_test[i] -= data_csbmv.c_verify[i]; - + free(a); // Find the norm of differences return BLASFUNC(scnrm2)(&n, data_csbmv.c_test, &inc_c); } @@ -603,4 +603,4 @@ CTEST(csbmv, xerbla_lda_invalid) int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info); ASSERT_EQUAL(TRUE, passed); } -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/test_idamin.c b/utest/test_extensions/test_idamin.c index 6a7ed9d1e..bebe76dba 100644 --- a/utest/test_extensions/test_idamin.c +++ b/utest/test_extensions/test_idamin.c @@ -402,13 +402,14 @@ CTEST(idamin, min_idx_in_vec_tail){ CTEST(idamin, min_idx_in_vec_tail_inc_1){ blasint i; blasint N = ELEMENTS, inc = 1; - double x[ELEMENTS * inc]; + double *x = (double*)malloc(ELEMENTS * inc * sizeof(double)); for (i = 0; i < N * inc; i ++) { x[i] = i + 1000; } x[(N - 1) * inc] = 0.0f; blasint index = BLASFUNC(idamin)(&N, x, &inc); + free(x); ASSERT_EQUAL(N, index); } @@ -775,13 +776,14 @@ CTEST(idamin, c_api_min_idx_in_vec_tail){ CTEST(idamin, c_api_min_idx_in_vec_tail_inc_1){ blasint i; blasint N = ELEMENTS, inc = 1; - double x[ELEMENTS * inc]; + double *x = (double*) malloc(ELEMENTS * inc * sizeof(double)); for (i = 0; i < N * inc; i ++) { x[i] = i + 1000; } x[(N - 1) * inc] = 0.0; blasint index = cblas_idamin(N, x, inc); + free(x); ASSERT_EQUAL(N - 1, index); } -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/test_isamin.c b/utest/test_extensions/test_isamin.c index 4ff235b83..673e656df 100644 --- a/utest/test_extensions/test_isamin.c +++ b/utest/test_extensions/test_isamin.c @@ -402,13 +402,14 @@ CTEST(isamin, min_idx_in_vec_tail){ CTEST(isamin, min_idx_in_vec_tail_inc_1){ blasint i; blasint N = ELEMENTS, inc = 1; - float x[ELEMENTS * inc]; + float *x = (float*) (ELEMENTS * inc * sizeof(float)); for (i = 0; i < N * inc; i ++) { x[i] = i + 1000; } x[(N - 1) * inc] = 0.0f; blasint index = BLASFUNC(isamin)(&N, x, &inc); + free(x); ASSERT_EQUAL(N, index); } @@ -775,13 +776,14 @@ CTEST(isamin, c_api_min_idx_in_vec_tail){ CTEST(isamin, c_api_min_idx_in_vec_tail_inc_1){ blasint i; blasint N = ELEMENTS, inc = 1; - float x[ELEMENTS * inc]; + float *x = (float*)malloc(ELEMENTS * inc * sizeof(float)); for (i = 0; i < N * inc; i ++) { x[i] = i + 1000; } x[(N - 1) * inc] = 0.0f; blasint index = cblas_isamin(N, x, inc); + free(x); ASSERT_EQUAL(N - 1, index); } -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/test_zgemv_t.c b/utest/test_extensions/test_zgemv_t.c index 2e0ee65f0..b2d0b2713 100644 --- a/utest/test_extensions/test_zgemv_t.c +++ b/utest/test_extensions/test_zgemv_t.c @@ -126,7 +126,7 @@ static double check_zgemv(char api, char order, char trans, blasint m, blasint n drand_generate(data_zgemv_t.y_test, m * inc_y * 2); // Copy vector y for reference funcs - for (int i = 0; i < m * inc_y * 2; i++) + for (i = 0; i < m * inc_y * 2; i++) { data_zgemv_t.y_verify[i] = data_zgemv_t.y_test[i]; } @@ -1133,4 +1133,4 @@ CTEST(zgemv, c_api_xerbla_invalid_order_col_major) int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); ASSERT_EQUAL(TRUE, passed); } -#endif \ No newline at end of file +#endif From f7ffab870b6992d14173b0439b803da04ba1ab12 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 26 Feb 2024 23:03:10 +0100 Subject: [PATCH 14/23] fix missing malloc --- utest/test_extensions/test_isamin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utest/test_extensions/test_isamin.c b/utest/test_extensions/test_isamin.c index 673e656df..d93813e6f 100644 --- a/utest/test_extensions/test_isamin.c +++ b/utest/test_extensions/test_isamin.c @@ -402,7 +402,7 @@ CTEST(isamin, min_idx_in_vec_tail){ CTEST(isamin, min_idx_in_vec_tail_inc_1){ blasint i; blasint N = ELEMENTS, inc = 1; - float *x = (float*) (ELEMENTS * inc * sizeof(float)); + float *x = (float*) malloc(ELEMENTS * inc * sizeof(float)); for (i = 0; i < N * inc; i ++) { x[i] = i + 1000; } From f81c1d4b598e16205f31f8146ad64e4069fe12f9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 27 Feb 2024 07:19:52 +0100 Subject: [PATCH 15/23] Fix portability problem --- utest/test_extensions/test_zsbmv.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/utest/test_extensions/test_zsbmv.c b/utest/test_extensions/test_zsbmv.c index afdb208c1..0e79dc0d8 100644 --- a/utest/test_extensions/test_zsbmv.c +++ b/utest/test_extensions/test_zsbmv.c @@ -188,7 +188,7 @@ static double check_zsbmv(char uplo, blasint n, blasint k, double *alpha, blasin char trans = 'N'; // Symmetric band packed matrix for sbmv - double a[lda * n * 2]; + double *a = (double*) malloc(lda * n * 2 * sizeof(double)); // Fill symmetric packed matrix sp_matrix, vector b_test, vector c_test drand_generate(data_zsbmv.sp_matrix, n * (n + 1)); @@ -213,6 +213,7 @@ static double check_zsbmv(char uplo, blasint n, blasint k, double *alpha, blasin BLASFUNC(zsbmv)(&uplo, &n, &k, alpha, a, &lda, data_zsbmv.b_test, &inc_b, beta, data_zsbmv.c_test, &inc_c); + free(a); // Find the differences between output vector caculated by zsbmv and zgemv for (i = 0; i < n * inc_c * 2; i++) data_zsbmv.c_test[i] -= data_zsbmv.c_verify[i]; @@ -603,4 +604,4 @@ CTEST(zsbmv, xerbla_lda_invalid) int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info); ASSERT_EQUAL(TRUE, passed); } -#endif \ No newline at end of file +#endif From 8e872a91a946c4fe646b9bb85732f92e3ae6424e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 27 Feb 2024 11:28:50 +0100 Subject: [PATCH 16/23] Fix erroneous mapping of SUM kernels to ASUM --- kernel/zarch/KERNEL.Z13 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/zarch/KERNEL.Z13 b/kernel/zarch/KERNEL.Z13 index 3bcc32197..fe82d81e6 100644 --- a/kernel/zarch/KERNEL.Z13 +++ b/kernel/zarch/KERNEL.Z13 @@ -35,10 +35,10 @@ DASUMKERNEL = dasum.c CASUMKERNEL = ../arm/zasum.c ZASUMKERNEL = zasum.c -SSUMKERNEL = ../arm/asum.c -DSUMKERNEL = dasum.c -CSUMKERNEL = ../arm/zasum.c -ZSUMKERNEL = zasum.c +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = dsum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = zsum.c SAXPYKERNEL = ../arm/axpy.c DAXPYKERNEL = daxpy.c From 4266b393046f2b12c9f2b5724889245380c0293d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 27 Feb 2024 17:52:45 +0100 Subject: [PATCH 17/23] Make building the benchmarks optional and handle dependency on other options --- CMakeLists.txt | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index af3050cf9..30adf3a66 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,6 +24,8 @@ option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, d option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON) +option(BUILD_BENCHMARKS "Build the collection of BLAS/LAPACK benchmarks" OFF) + option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF) option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF) @@ -458,9 +460,23 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") endif() endif() -if (BUILD_TESTING) - find_package(OpenMP REQUIRED) +if (BUILD_BENCHMARKS) + #find_package(OpenMP REQUIRED) file(GLOB SOURCES "benchmark/*.c") + if (NOT USE_OPENMP) + file(GLOB REMFILE "benchmark/smallscaling.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + endif() + if (BUILD_WITHOUT_LAPACK) + file(GLOB REMFILE "benchmark/cholesky.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + file(GLOB REMFILE "benchmark/geev.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + endif() + if (NOT USE_GEMM3M) + file(GLOB REMFILE "benchmark/gemm3m.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + endif() foreach(source ${SOURCES}) get_filename_component(name ${source} NAME_WE) if ((NOT ${name} STREQUAL "zdot-intel") AND (NOT ${name} STREQUAL "cula_wrapper")) @@ -477,7 +493,8 @@ if (BUILD_TESTING) (NOT ${target_name} STREQUAL "benchmark_min_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_min_COMPLEX_DOUBLE")) add_executable(${target_name} ${source}) target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) - target_link_libraries(${target_name} ${OpenBLAS_LIBNAME} OpenMP::OpenMP_C) + target_link_libraries(${target_name} ${OpenBLAS_LIBNAME} ) + # target_link_libraries(${target_name} ${OpenBLAS_LIBNAME} OpenMP::OpenMP_C) if (NOT "${define}" STREQUAL "DEFAULT") target_compile_definitions(${target_name} PRIVATE ${define}) endif() From cfc28c586ebcc840623d79560710a8485eea9ec2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 27 Feb 2024 19:55:12 +0100 Subject: [PATCH 18/23] Exclude LAPACK testsuite and LAPACK-dependent benchmarks in no-LAPACK builds --- CMakeLists.txt | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 30adf3a66..9fbe878e6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -330,7 +330,7 @@ if (NOT NOFORTRAN) # Build test and ctest add_subdirectory(test) endif() - if (BUILD_TESTING) + if (BUILD_TESTING AND NOT BUILD_WITHOUT_LAPACK) add_subdirectory(lapack-netlib/TESTING) endif() endif() @@ -472,6 +472,18 @@ if (BUILD_BENCHMARKS) list(REMOVE_ITEM SOURCES ${REMFILE}) file(GLOB REMFILE "benchmark/geev.c") list(REMOVE_ITEM SOURCES ${REMFILE}) + file(GLOB REMFILE "benchmark/gesv.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + file(GLOB REMFILE "benchmark/getri.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + file(GLOB REMFILE "benchmark/potrf.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + file(GLOB REMFILE "benchmark/spmv.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + file(GLOB REMFILE "benchmark/symv.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + file(GLOB REMFILE "benchmark/linpack.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) endif() if (NOT USE_GEMM3M) file(GLOB REMFILE "benchmark/gemm3m.c") From d1409407a090a46512e1ecddc6633943ccef99be Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 27 Feb 2024 21:05:59 +0100 Subject: [PATCH 19/23] Omit redundant prefixes or suffixes in library naming --- Makefile.system | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/Makefile.system b/Makefile.system index cb5453cac..aadf3459a 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1520,10 +1520,18 @@ ifndef LIBNAMEPREFIX LIBNAMEPREFIX = endif +SYMPREFIX=$(SYMBOLPREFIX) +ifeq ($(SYMBOLPREFIX),$(LIBNAMEPREFIX)) +SYMPREFIX= +endif +SYMSUFFIX=$(SYMBOLSUFFIX) +ifeq ($(SYMBOLSUFFIX),$(LIBNAMESUFFIX)) +SYMSUFFIX= +endif ifndef LIBNAMESUFFIX -LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX) +LIBNAMEBASE = $(SYMPREFIX)$(LIBSONAMEBASE)$(SYMSUFFIX) else -LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)$(LIBNAMESUFFIX) +LIBNAMEBASE = $(SYMPREFIX)$(LIBSONAMEBASE)$(SYMSUFFIX)$(LIBNAMESUFFIX) endif ifeq ($(OSNAME), CYGWIN_NT) From c508a10cf2413fc5381376c4586303e0fa7ffbd0 Mon Sep 17 00:00:00 2001 From: gxw Date: Mon, 26 Feb 2024 20:09:42 -0500 Subject: [PATCH 20/23] LoongArch64: Add cgemv LSX opt --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 3 + kernel/loongarch64/cgemv_n_4_lsx.S | 323 +++++++++++++++++++++++ kernel/loongarch64/cgemv_t_4_lsx.S | 290 ++++++++++++++++++++ 3 files changed, 616 insertions(+) create mode 100644 kernel/loongarch64/cgemv_n_4_lsx.S create mode 100644 kernel/loongarch64/cgemv_t_4_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index c7ef44035..befe68451 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -100,6 +100,9 @@ DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +CGEMVNKERNEL = cgemv_n_4_lsx.S +CGEMVTKERNEL = cgemv_t_4_lsx.S + CGEMMKERNEL = cgemm_kernel_8x4_lsx.S CGEMMINCOPY = cgemm_ncopy_8_lsx.S CGEMMITCOPY = cgemm_tcopy_8_lsx.S diff --git a/kernel/loongarch64/cgemv_n_4_lsx.S b/kernel/loongarch64/cgemv_n_4_lsx.S new file mode 100644 index 000000000..cf8273797 --- /dev/null +++ b/kernel/loongarch64/cgemv_n_4_lsx.S @@ -0,0 +1,323 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define Y_ORG $r15 +#define OFFSET $r16 +#define K_LDA $r17 +#define M8 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 + +#define VALPHA $vr1 +#define X0 $vr2 +#define X1 $vr3 +#define X2 $vr4 +#define X3 $vr5 +#define X4 $vr6 +#define X5 $vr7 +#define X6 $vr8 +#define X7 $vr9 +#define Y0 $vr10 +#define Y1 $vr11 +#define A0 $vr12 +#define A1 $vr13 +#define A2 $vr14 +#define A3 $vr15 +#define A4 $vr16 +#define A5 $vr17 +#define A6 $vr18 +#define A7 $vr19 +#define A8 $vr20 +#define A9 $vr21 +#define A10 $vr22 +#define A11 $vr23 +#define A12 $vr24 +#define A13 $vr25 +#define A14 $vr26 +#define A15 $vr27 +#define TMP0 $vr28 +#define TMP1 $vr29 +#define TMP2 $vr30 + +#if !defined(CONJ) +#if !defined(XCONJ) +#define GXCONJ 0 +#define GCONJ 0 +#else +#define GXCONJ 1 +#define GCONJ 0 +#endif +#else +#if !defined(XCONJ) +#define GXCONJ 0 +#define GCONJ 1 +#else +#define GXCONJ 1 +#define GCONJ 1 +#endif +#endif + +.macro CLOAD_X_4 + GLDREPL v, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18 + GCOMPLEXMUL GXCONJ, \ + vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ + X1, VALPHA, X1, TMP0, TMP1, TMP2, \ + X2, VALPHA, X2, TMP0, TMP1, TMP2, \ + X3, VALPHA, X3, TMP0, TMP1, TMP2 +.endm + +.macro CLOAD_X_4_GAP + vldrepl.d X0, X, 0x00 + PTR_ADD T0, X, INC_X + vldrepl.d X1, T0, 0x00 + PTR_ADD T0, T0, INC_X + vldrepl.d X2, T0, 0x00 + PTR_ADD T0, T0, INC_X + vldrepl.d X3, T0, 0x00 + + GCOMPLEXMUL GXCONJ, \ + vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ + X1, VALPHA, X1, TMP0, TMP1, TMP2, \ + X2, VALPHA, X2, TMP0, TMP1, TMP2, \ + X3, VALPHA, X3, TMP0, TMP1, TMP2 +.endm + +.macro CLOAD_X_1 + GLDREPL v, d, X0, X, 0x00 + GCOMPLEXMUL GXCONJ, \ + vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2 +.endm + +.macro CLOAD_Y_4 + GLD v, , Y0, Y, 0, Y1, Y, 0x10 +.endm + +.macro CLOAD_Y_4_GAP + fld.d $f10, Y, 0 + fldx.d $f13, Y, INC_Y + PTR_ALSL T0, INC_Y, Y, 1 + fld.d $f11, T0, 0 + fldx.d $f17, T0, INC_Y + vpackev.d Y0, A1, Y0 + vpackev.d Y1, A5, Y1 +.endm + +.macro CLOAD_Y_1 + fld.d $f10, Y, 0 +.endm + +.macro CSTORE_Y_4 + GST v, , Y0, Y, 0, Y1, Y, 0x10 +.endm + +.macro CSTORE_Y_4_GAP + vstelm.d Y0, Y, 0, 0 + PTR_ADD T0, Y, INC_Y + vstelm.d Y0, T0, 0, 1 + PTR_ADD T0, T0, INC_Y + vstelm.d Y1, T0, 0, 0 + PTR_ADD T0, T0, INC_Y + vstelm.d Y1, T0, 0, 1 +.endm + +.macro CSTORE_Y_1 + fst.d $f10, Y, 0 +.endm + +.macro CGEMV_N_4x4 + GLD_INC v, , 0x10, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0 + + GCOMPLEXMADD GXCONJ, GCONJ, \ + vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \ + Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2, \ + Y0, X2, A4, Y0, TMP0, TMP1, TMP2, Y1, X2, A5, Y1, TMP0, TMP1, TMP2, \ + Y0, X3, A6, Y0, TMP0, TMP1, TMP2, Y1, X3, A7, Y1, TMP0, TMP1, TMP2 +.endm + +.macro CGEMV_N_1x4 + GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0 + GCOMPLEXMADD GXCONJ, GCONJ, \ + vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \ + Y0, X1, A2, Y0, TMP0, TMP1, TMP2, \ + Y0, X2, A4, Y0, TMP0, TMP1, TMP2, \ + Y0, X3, A6, Y0, TMP0, TMP1, TMP2 +.endm + +.macro CGEMV_N_1x1 + fld.d $f12, PA0, 0 + PTR_ADDI PA0, PA0, 0x08 + GCOMPLEXMADD GXCONJ, GCONJ, \ + vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2 +.endm + +.macro CGEMV_N_LSX XW:req, X_4:req, X_1:req, Y_4:req, Y_1:req + PTR_SRLI J, N, 2 + beqz J, .L_\XW\()_N_3 + PTR_SLLI K_LDA, LDA, 2 + PTR_SUB K_LDA, K_LDA, M8 +.L_\XW\()_N_L4: + CLOAD_\X_4 + xor K, K, K + move Y, Y_ORG + PTR_SRLI I, M, 2 + beqz I, .L_\XW\()_M_3 +.align 5 +.L_\XW\()_M_L4: + CLOAD_\Y_4 + CGEMV_N_4x4 + CSTORE_\Y_4 + PTR_ADDI I, I, -1 + PTR_ALSL Y, INC_Y, Y, 2 + PTR_ADDI K, K, 4 + bnez I, .L_\XW\()_M_L4 +.L_\XW\()_M_3: + andi I, M, 3 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + CLOAD_\Y_1 + CGEMV_N_1x4 + CSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#endif + PTR_ALSL X, INC_X, X, 2 + bnez J, .L_\XW\()_N_L4 +.L_\XW\()_N_3: + andi J, N, 3 + beqz J, .L_END +.L_\XW\()_N_L1: + CLOAD_\X_1 + xor K, K, K + move Y, Y_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + CLOAD_\Y_1 + CGEMV_N_1x1 + CSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_N_1_M_L1 +.L_\XW\()_N_1_M_END: + PTR_ADDI J, J, -1 + PTR_SUB K_LDA, LDA, M8 + PTR_ADD PA0, PA0, K_LDA + PTR_ADD X, X, INC_X + bnez J, .L_\XW\()_N_L1 + + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 7, 31 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + PTR_SUB J, INC_Y, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */ + PTR_ALSL I, I, J, 1 + GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 + // Init VALPHA + vpackev.w $vr0, $vr1, $vr0 + vpackev.d VALPHA, $vr0, $vr0 + move Y_ORG, Y + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA +#elif __loongarch_grlen == 32 + GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA +#else + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 // Obtain the offset address + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0_0 - .L_GAP_TABLE + .hword .L_GAP_0_1 - .L_GAP_TABLE + .hword .L_GAP_1_0 - .L_GAP_TABLE + .hword .L_GAP_1_1 - .L_GAP_TABLE +.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ + CGEMV_N_LSX GAP_0_0, X_4, X_1, Y_4, Y_1 +.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */ + CGEMV_N_LSX GAP_0_1, X_4, X_1, Y_4_GAP, Y_1 +.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ + CGEMV_N_LSX GAP_1_0, X_4_GAP, X_1, Y_4, Y_1 +.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ + CGEMV_N_LSX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1 +.L_END: + pop_if_used 17 + 7, 31 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/cgemv_t_4_lsx.S b/kernel/loongarch64/cgemv_t_4_lsx.S new file mode 100644 index 000000000..ada349364 --- /dev/null +++ b/kernel/loongarch64/cgemv_t_4_lsx.S @@ -0,0 +1,290 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define PY0 $r14 +#define X_ORG $r15 +#define PY1 $r16 +#define K_LDA $r17 +#define PY2 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 +#define M8 $r30 + +#define VALPHA $vr0 +#define X0 $vr1 +#define X1 $vr2 +#define A0 $vr3 +#define A1 $vr4 +#define A2 $vr5 +#define A3 $vr6 +#define A4 $vr7 +#define A5 $vr8 +#define A6 $vr9 +#define A7 $vr10 +#define A8 $vr11 +#define A9 $vr12 +#define A10 $vr13 +#define A11 $vr14 +#define A12 $vr15 +#define A13 $vr16 +#define A14 $vr17 +#define A15 $vr18 +#define TP0 $vr19 +#define TP1 $vr20 +#define TP2 $vr21 +#define TP3 $vr22 +#define TP4 $vr23 +#define TP5 $vr24 +#define TP6 $vr25 +#define TP7 $vr26 +#define TMP0 $vr27 +#define TMP1 $vr28 +#define TMP2 $vr29 +#define Y0 $vr3 +#define Y1 $vr4 +#define Y2 $vr5 +#define Y3 $vr6 +#define Y4 $vr7 +#define Y5 $vr8 +#define Y6 $vr9 +#define Y7 $vr10 + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) +#define GXCONJ1 0 +#define GCONJ1 0 +#else +#define GXCONJ1 1 +#define GCONJ1 0 +#endif + +#if !defined(XCONJ) +#define GXCONJ2 0 +#define GCONJ2 0 +#else +#define GXCONJ2 0 +#define GCONJ2 1 +#endif + +.macro ZERO_Y4 + GXOR v, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3 +.endm + +.macro ZERO_Y1 + GXOR v, v, TP0, TP0, TP0 +.endm + +.macro CLOAD_X4 + GLD v, , X0, X, 0x00, X1, X, 0x10 +.endm + +.macro CLOAD_X4_GAP + fld.d $f1, X, 0x00 + fldx.d $f3, X, INC_X + PTR_ALSL T0, INC_X, X, 1 + fld.d $f2, T0, 0x00 + fldx.d $f4, T0, INC_X + vpackev.d X0, A0, X0 + vpackev.d X1, A1, X1 +.endm + +.macro CGEMV_T_4x4 + GLD_INC v, , 0x10, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0 + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + vf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \ + TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2, \ + TP2, A4, X0, TP2, TMP0, TMP1, TMP2, TP2, A5, X1, TP2, TMP0, TMP1, TMP2, \ + TP3, A6, X0, TP3, TMP0, TMP1, TMP2, TP3, A7, X1, TP3, TMP0, TMP1, TMP2 +.endm + +.macro CGEMV_T_LSX XW:req, X4:req + PTR_SRLI J, N, 2 + beqz J, .L_\XW\()_N_3 + PTR_SLLI K_LDA, LDA, 2 + PTR_SUB K_LDA, K_LDA, M8 +.L_\XW\()_N_L4: + ZERO_Y4 + move X, X_ORG + PTR_SRLI I, M, 2 + beqz I, .L_\XW\()_M_3 +.align 5 +.L_\XW\()_M_L4: + CLOAD_\X4 + CGEMV_T_4x4 + PTR_ADDI I, I, -1 + PTR_ALSL X, INC_X, X, 2 + bnez I, .L_\XW\()_M_L4 +.L_\XW\()_M_3: + // Accumulated + GCOMPLEXACC vf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3 + andi I, M, 3 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + fld.d $f1, X, 0x00 + fld.d $f11, PA0, 0x00 + fld.d $f12, PA1, 0x00 + fld.d $f13, PA2, 0x00 + fld.d $f14, PA3, 0x00 +#if __loongarch_grlen == 64 + GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08 +#elif __loongarch_grlen == 32 + GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08 +#else + GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08 +#endif + + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + vf, s, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2, \ + A2, A10, X0, A2, TMP0, TMP1, TMP2, A3, A11, X0, A3, TMP0, TMP1, TMP2 + + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + fld.d $f11, Y, 0x00 + fldx.d $f12, Y, INC_Y + PTR_ALSL PY0, INC_Y, Y, 1 + fld.d $f13, PY0, 0x00 + fldx.d $f14, PY0, INC_Y + + GCOMPLEXMADD GXCONJ2, GCONJ2, \ + vf, s, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2,\ + A10, VALPHA, A2, A10, TMP0, TMP1, TMP2, A11, VALPHA, A3, A11, TMP0, TMP1, TMP2 + + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#endif + fst.d $f11, Y, 0x00 + fstx.d $f12, Y, INC_Y + fst.d $f13, PY0, 0x00 + fstx.d $f14, PY0, INC_Y + PTR_ALSL Y, INC_Y, Y, 2 + bnez J, .L_\XW\()_N_L4 +.L_\XW\()_N_3: + andi J, N, 3 + beqz J, .L_END + PTR_SUB K_LDA, LDA, M8 +.L_\XW\()_N_1: + ZERO_Y1 + move X, X_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + fld.d $f3, PA0, 0x00 + fld.d $f1, X, 0x00 + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + vf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + PTR_ADDI PA0, PA0, 0x08 + bnez I, .L_\XW\()_N_1_M_L1 +.L_\XW\()_N_1_M_END: + PTR_ADDI J, J, -1 + fld.d $f3, Y, 0x00 + GCOMPLEXMADD GXCONJ2, GCONJ2, \ + vf, s, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2 + fst.d $f3, Y, 0x00 + PTR_ADD PA0, PA0, K_LDA + PTR_ADD Y, Y, INC_Y + bnez J, .L_\XW\()_N_1 + + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 8, 30 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 + // Init VALPHA + vpackev.w $vr0, $vr1, $vr0 + vpackev.d VALPHA, $vr0, $vr0 + move X_ORG, X + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA +#elif __loongarch_grlen == 32 + GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA +#else + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0 - .L_GAP_TABLE + .hword .L_GAP_1 - .L_GAP_TABLE +.L_GAP_0: /* if (incx == 1) */ + CGEMV_T_LSX GAP_0, X4 +.L_GAP_1: /* if (incx != 1) */ + CGEMV_T_LSX GAP_1, X4_GAP +.L_END: + pop_if_used 17 + 8, 30 + jirl $r0, $r1, 0x0 + EPILOGUE From 3f22fc22333ae7b739e0620d64710e3905ef1dc1 Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 27 Feb 2024 04:16:49 -0500 Subject: [PATCH 21/23] LoongArch64: Add zgemv LSX opt --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 3 + kernel/loongarch64/loongarch64_asm.S | 3 + kernel/loongarch64/zgemv_n_2_lsx.S | 296 +++++++++++++++++++++++ kernel/loongarch64/zgemv_t_2_lsx.S | 268 ++++++++++++++++++++ 4 files changed, 570 insertions(+) create mode 100644 kernel/loongarch64/zgemv_n_2_lsx.S create mode 100644 kernel/loongarch64/zgemv_t_2_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index befe68451..15d4358be 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -118,6 +118,9 @@ CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +ZGEMVNKERNEL = zgemv_n_2_lsx.S +ZGEMVTKERNEL = zgemv_t_2_lsx.S + ZGEMMKERNEL = zgemm_kernel_4x4_lsx.S ZGEMMONCOPY = zgemm_ncopy_4_lsx.S ZGEMMOTCOPY = zgemm_tcopy_4_lsx.S diff --git a/kernel/loongarch64/loongarch64_asm.S b/kernel/loongarch64/loongarch64_asm.S index fee46d63e..d097b3045 100644 --- a/kernel/loongarch64/loongarch64_asm.S +++ b/kernel/loongarch64/loongarch64_asm.S @@ -406,9 +406,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .ifeqs "\suf_op", "s" vpackod.d \out, \in, \in \pre_op\()add.\suf_op \out, \out, \in +.else + vor.v \out, \in, \in .endif .endif + .ifnb \more GCOMPLEXACC \pre_op, \suf_op, \more .endif diff --git a/kernel/loongarch64/zgemv_n_2_lsx.S b/kernel/loongarch64/zgemv_n_2_lsx.S new file mode 100644 index 000000000..efb376118 --- /dev/null +++ b/kernel/loongarch64/zgemv_n_2_lsx.S @@ -0,0 +1,296 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define Y_ORG $r15 +#define OFFSET $r16 +#define K_LDA $r17 +#define M16 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 + +#define VALPHA $vr1 +#define X0 $vr2 +#define X1 $vr3 +#define X2 $vr4 +#define X3 $vr5 +#define X4 $vr6 +#define X5 $vr7 +#define X6 $vr8 +#define X7 $vr9 +#define Y0 $vr10 +#define Y1 $vr11 +#define A0 $vr12 +#define A1 $vr13 +#define A2 $vr14 +#define A3 $vr15 +#define A4 $vr16 +#define A5 $vr17 +#define A6 $vr18 +#define A7 $vr19 +#define A8 $vr20 +#define A9 $vr21 +#define A10 $vr22 +#define A11 $vr23 +#define A12 $vr24 +#define A13 $vr25 +#define A14 $vr26 +#define A15 $vr27 +#define TMP0 $vr28 +#define TMP1 $vr29 +#define TMP2 $vr30 + +#if !defined(CONJ) +#if !defined(XCONJ) +#define GXCONJ 0 +#define GCONJ 0 +#else +#define GXCONJ 1 +#define GCONJ 0 +#endif +#else +#if !defined(XCONJ) +#define GXCONJ 0 +#define GCONJ 1 +#else +#define GXCONJ 1 +#define GCONJ 1 +#endif +#endif + +.macro ZLOAD_X_2 + GLD v, , X0, X, 0x00, X1, X, 0x10 + GCOMPLEXMUL GXCONJ, \ + vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ + X1, VALPHA, X1, TMP0, TMP1, TMP2 +.endm + +.macro ZLOAD_X_2_GAP + vld X0, X, 0 + PTR_ADD T0, X, INC_X + vld X1, T0, 0 + + GCOMPLEXMUL GXCONJ, \ + vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ + X1, VALPHA, X1, TMP0, TMP1, TMP2 +.endm + +.macro ZLOAD_X_1 + GLD v, , X0, X, 0x00 + GCOMPLEXMUL GXCONJ, \ + vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2 +.endm + +.macro ZLOAD_Y_2 + GLD v, , Y0, Y, 0, Y1, Y, 0x10 +.endm + +.macro ZLOAD_Y_2_GAP + vld $vr10, Y, 0 + vldx $vr11, Y, INC_Y +.endm + +.macro ZLOAD_Y_1 + vld $vr10, Y, 0 +.endm + +.macro ZGEMV_N_2x2 + GLD_INC v, , 0x10, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0 + GCOMPLEXMADD GXCONJ, GCONJ, \ + vf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \ + Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2 +.endm + +.macro ZGEMV_N_1x2 + GLD_INC v, , 0x10, $vr12, PA0, 0, $vr14, PA1, 0 + GCOMPLEXMADD GXCONJ, GCONJ, \ + vf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \ + Y0, X1, A2, Y0, TMP0, TMP1, TMP2 +.endm + +.macro ZGEMV_N_1x1 + GLD_INC v, , 0x10, $vr12, PA0, 0 + GCOMPLEXMADD GXCONJ, GCONJ, \ + vf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2 +.endm + +.macro ZSTORE_Y_2 + GST v, , Y0, Y, 0, Y1, Y, 0x10 +.endm + +.macro ZSTORE_Y_2_GAP + vst Y0, Y, 0 + vstx Y1, Y, INC_Y +.endm + +.macro ZSTORE_Y_1 + vst $vr10, Y, 0 +.endm + +.macro ZGEMV_N_LSX XW:req, X_2:req, X_1:req, Y_2:req, Y_1:req + PTR_SRLI J, N, 1 + beqz J, .L_\XW\()_N_1 + PTR_SLLI K_LDA, LDA, 1 + PTR_SUB K_LDA, K_LDA, M16 +.L_\XW\()_N_L2: + ZLOAD_\X_2 + xor K, K, K + move Y, Y_ORG + PTR_SRLI I, M, 1 + beqz I, .L_\XW\()_M_1 +.align 5 +.L_\XW\()_M_L2: + ZLOAD_\Y_2 + ZGEMV_N_2x2 + ZSTORE_\Y_2 + PTR_ADDI I, I, -1 + PTR_ALSL Y, INC_Y, Y, 1 + PTR_ADDI K, K, 4 + bnez I, .L_\XW\()_M_L2 +.L_\XW\()_M_1: + andi I, M, 1 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + ZLOAD_\Y_1 + ZGEMV_N_1x2 + ZSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#endif + PTR_ALSL X, INC_X, X, 1 + bnez J, .L_\XW\()_N_L2 +.L_\XW\()_N_1: + andi J, N, 1 + beqz J, .L_END +.L_\XW\()_N_L1: + ZLOAD_\X_1 + xor K, K, K + move Y, Y_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + ZLOAD_\Y_1 + ZGEMV_N_1x1 + ZSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_N_1_M_L1 +.L_\XW\()_N_1_M_END: + PTR_ADDI J, J, -1 + PTR_SUB K_LDA, LDA, M16 + PTR_ADD PA0, PA0, K_LDA + PTR_ADD X, X, INC_X + bnez J, .L_\XW\()_N_L1 + + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 7, 31 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + PTR_SUB J, INC_Y, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */ + PTR_ALSL I, I, J, 1 + GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4 + // Init VALPHA + vpackev.d VALPHA, $vr1, $vr0 + move Y_ORG, Y + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA +#elif __loongarch_grlen == 32 + GADD , w, PA1, PA0, LDA +#else + GADD , d, PA1, PA0, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 // Obtain the offset address + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0_0 - .L_GAP_TABLE + .hword .L_GAP_0_1 - .L_GAP_TABLE + .hword .L_GAP_1_0 - .L_GAP_TABLE + .hword .L_GAP_1_1 - .L_GAP_TABLE +.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ + ZGEMV_N_LSX GAP_0_0, X_2, X_1, Y_2, Y_1 +.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */ + ZGEMV_N_LSX GAP_0_1, X_2, X_1, Y_2_GAP, Y_1 +.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ + ZGEMV_N_LSX GAP_1_0, X_2_GAP, X_1, Y_2, Y_1 +.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ + ZGEMV_N_LSX GAP_1_1, X_2_GAP, X_1, Y_2_GAP, Y_1 +.L_END: + pop_if_used 17 + 7, 31 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/zgemv_t_2_lsx.S b/kernel/loongarch64/zgemv_t_2_lsx.S new file mode 100644 index 000000000..2a0fc172e --- /dev/null +++ b/kernel/loongarch64/zgemv_t_2_lsx.S @@ -0,0 +1,268 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define PY0 $r14 +#define X_ORG $r15 +#define PY1 $r16 +#define K_LDA $r17 +#define PY2 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 +#define M16 $r30 + +#define VALPHA $vr0 +#define X0 $vr1 +#define X1 $vr2 +#define A0 $vr3 +#define A1 $vr4 +#define A2 $vr5 +#define A3 $vr6 +#define A4 $vr7 +#define A5 $vr8 +#define A6 $vr9 +#define A7 $vr10 +#define A8 $vr11 +#define A9 $vr12 +#define A10 $vr13 +#define A11 $vr14 +#define A12 $vr15 +#define A13 $vr16 +#define A14 $vr17 +#define A15 $vr18 +#define TP0 $vr19 +#define TP1 $vr20 +#define TP2 $vr21 +#define TP3 $vr22 +#define TP4 $vr23 +#define TP5 $vr24 +#define TP6 $vr25 +#define TP7 $vr26 +#define TMP0 $vr27 +#define TMP1 $vr28 +#define TMP2 $vr29 +#define Y0 $vr3 +#define Y1 $vr4 +#define Y2 $vr5 +#define Y3 $vr6 +#define Y4 $vr7 +#define Y5 $vr8 +#define Y6 $vr9 +#define Y7 $vr10 + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) +#define GXCONJ1 0 +#define GCONJ1 0 +#else +#define GXCONJ1 1 +#define GCONJ1 0 +#endif + +#if !defined(XCONJ) +#define GXCONJ2 0 +#define GCONJ2 0 +#else +#define GXCONJ2 0 +#define GCONJ2 1 +#endif + +.macro ZERO_Y2 + GXOR v, v, TP0, TP0, TP0, TP1, TP1, TP1 +.endm + +.macro ZERO_Y1 + GXOR v, v, TP0, TP0, TP0 +.endm + +.macro ZLOAD_X2 + GLD v, , X0, X, 0x00, X1, X, 0x10 +.endm + +.macro ZLOAD_X2_GAP + vld X0, X, 0 + vldx X1, X, INC_X +.endm + +.macro ZGEMV_T_2x2 + GLD_INC v, , 0x10, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0 + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + vf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \ + TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2 +.endm + +.macro ZGEMV_T_LSX XW:req, X2:req + PTR_SRLI J, N, 1 + beqz J, .L_\XW\()_N_1 + PTR_SLLI K_LDA, LDA, 1 + PTR_SUB K_LDA, K_LDA, M16 +.L_\XW\()_N_L2: + ZERO_Y2 + move X, X_ORG + PTR_SRLI I, M, 1 + beqz I, .L_\XW\()_M_1 +.align 5 +.L_\XW\()_M_L2: + ZLOAD_\X2 + ZGEMV_T_2x2 + PTR_ADDI I, I, -1 + PTR_ALSL X, INC_X, X, 1 + bnez I, .L_\XW\()_M_L2 +.L_\XW\()_M_1: + // Accumulated + GCOMPLEXACC vf, d, Y0, TP0, Y1, TP1 + andi I, M, 1 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + GLD v, , X0, X, 0x00, A8, PA0, 0x00, A9, PA1, 0x00 +#if __loongarch_grlen == 64 + GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10 +#elif __loongarch_grlen == 32 + GADDI , w, PA0, PA0, 0x10, PA1, PA1, 0x10 +#else + GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10 +#endif + + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + vf, d, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2 + + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + vld A8, Y, 0x00 + vldx A9, Y, INC_Y + + GCOMPLEXMADD GXCONJ2, GCONJ2, \ + vf, d, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2 + + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#endif + vst $vr11, Y, 0x00 + vstx $vr12, Y, INC_Y + PTR_ALSL Y, INC_Y, Y, 1 + bnez J, .L_\XW\()_N_L2 +.L_\XW\()_N_1: + andi J, N, 1 + beqz J, .L_END + PTR_SUB K_LDA, LDA, M16 +.L_\XW\()_N_L1: + ZERO_Y1 + move X, X_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + GLD v, , A0, PA0, 0x00, X0, X, 0x00 + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + vf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + PTR_ADDI PA0, PA0, 0x10 + bnez I, .L_\XW\()_N_1_M_L1 +.L_\XW\()_N_1_M_END: + PTR_ADDI J, J, -1 + vld A0, Y, 0x00 + GCOMPLEXMADD GXCONJ2, GCONJ2, \ + vf, d, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2 + vst $vr3, Y, 0x00 + PTR_ADD PA0, PA0, K_LDA + PTR_ADD Y, Y, INC_Y + bnez J, .L_\XW\()_N_L1 + + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 8, 30 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4 + // Init VALPHA + vpackev.d VALPHA, $vr1, $vr0 + move X_ORG, X + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA +#elif __loongarch_grlen == 32 + GADD , w, PA1, PA0, LDA +#else + GADD , d, PA1, PA0, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0 - .L_GAP_TABLE + .hword .L_GAP_1 - .L_GAP_TABLE +.L_GAP_0: /* if (incx == 1) */ + ZGEMV_T_LSX GAP_0, X2 +.L_GAP_1: /* if (incx != 1) */ + ZGEMV_T_LSX GAP_1, X2_GAP +.L_END: + pop_if_used 17 + 8, 30 + jirl $r0, $r1, 0x0 + EPILOGUE From 8e05c053be8b889697b17f6847d94a8e4ce3c12b Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 27 Feb 2024 21:17:01 -0500 Subject: [PATCH 22/23] LoongArch64:Fixed the failed test cases test_{c/z}gemv_n in test_extensions --- kernel/loongarch64/cgemv_n_8_lasx.S | 34 ++++++++++++++--------------- kernel/loongarch64/zgemv_n_4_lasx.S | 18 +++++++-------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/kernel/loongarch64/cgemv_n_8_lasx.S b/kernel/loongarch64/cgemv_n_8_lasx.S index b078e3227..ba38a9573 100644 --- a/kernel/loongarch64/cgemv_n_8_lasx.S +++ b/kernel/loongarch64/cgemv_n_8_lasx.S @@ -122,14 +122,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \ X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38 GCOMPLEXMUL GXCONJ, \ - xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \ - X1, X1, VALPHA, TMP0, TMP1, TMP2, \ - X2, X2, VALPHA, TMP0, TMP1, TMP2, \ - X3, X3, VALPHA, TMP0, TMP1, TMP2, \ - X4, X4, VALPHA, TMP0, TMP1, TMP2, \ - X5, X5, VALPHA, TMP0, TMP1, TMP2, \ - X6, X6, VALPHA, TMP0, TMP1, TMP2, \ - X7, X7, VALPHA, TMP0, TMP1, TMP2 + xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ + X1, VALPHA, X1, TMP0, TMP1, TMP2, \ + X2, VALPHA, X2, TMP0, TMP1, TMP2, \ + X3, VALPHA, X3, TMP0, TMP1, TMP2, \ + X4, VALPHA, X4, TMP0, TMP1, TMP2, \ + X5, VALPHA, X5, TMP0, TMP1, TMP2, \ + X6, VALPHA, X6, TMP0, TMP1, TMP2, \ + X7, VALPHA, X7, TMP0, TMP1, TMP2 .endm .macro CLOAD_X_8_GAP @@ -150,14 +150,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvldrepl.d X7, T0, 0x00 GCOMPLEXMUL GXCONJ, \ - xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \ - X1, X1, VALPHA, TMP0, TMP1, TMP2, \ - X2, X2, VALPHA, TMP0, TMP1, TMP2, \ - X3, X3, VALPHA, TMP0, TMP1, TMP2, \ - X4, X4, VALPHA, TMP0, TMP1, TMP2, \ - X5, X5, VALPHA, TMP0, TMP1, TMP2, \ - X6, X6, VALPHA, TMP0, TMP1, TMP2, \ - X7, X7, VALPHA, TMP0, TMP1, TMP2 + xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ + X1, VALPHA, X1, TMP0, TMP1, TMP2, \ + X2, VALPHA, X2, TMP0, TMP1, TMP2, \ + X3, VALPHA, X3, TMP0, TMP1, TMP2, \ + X4, VALPHA, X4, TMP0, TMP1, TMP2, \ + X5, VALPHA, X5, TMP0, TMP1, TMP2, \ + X6, VALPHA, X6, TMP0, TMP1, TMP2, \ + X7, VALPHA, X7, TMP0, TMP1, TMP2 .endm .macro CLOAD_Y_8 @@ -228,7 +228,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro CLOAD_X_1 GLDREPL xv, d, X0, X, 0x00 GCOMPLEXMUL GXCONJ, \ - xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2 + xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2 .endm .macro CLOAD_Y_1 diff --git a/kernel/loongarch64/zgemv_n_4_lasx.S b/kernel/loongarch64/zgemv_n_4_lasx.S index 98b1a6f7d..26edf1ed7 100644 --- a/kernel/loongarch64/zgemv_n_4_lasx.S +++ b/kernel/loongarch64/zgemv_n_4_lasx.S @@ -122,10 +122,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. GLD xv, , X0, X, 0x00, X1, X, 0x10, X2, X, 0x20, X3, X, 0x30 GPERMI xv, q, X0, X0, 0, X1, X1, 0, X2, X2, 0, X3, X3, 0 GCOMPLEXMUL GXCONJ, \ - xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2, \ - X1, X1, VALPHA, TMP0, TMP1, TMP2, \ - X2, X2, VALPHA, TMP0, TMP1, TMP2, \ - X3, X3, VALPHA, TMP0, TMP1, TMP2 + xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ + X1, VALPHA, X1, TMP0, TMP1, TMP2, \ + X2, VALPHA, X2, TMP0, TMP1, TMP2, \ + X3, VALPHA, X3, TMP0, TMP1, TMP2 .endm .macro ZLOAD_X_4_GAP @@ -145,10 +145,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvpermi.q X3, X3, 0 GCOMPLEXMUL GXCONJ, \ - xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2, \ - X1, X1, VALPHA, TMP0, TMP1, TMP2, \ - X2, X2, VALPHA, TMP0, TMP1, TMP2, \ - X3, X3, VALPHA, TMP0, TMP1, TMP2 + xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ + X1, VALPHA, X1, TMP0, TMP1, TMP2, \ + X2, VALPHA, X2, TMP0, TMP1, TMP2, \ + X3, VALPHA, X3, TMP0, TMP1, TMP2 .endm .macro ZLOAD_Y_4 @@ -216,7 +216,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. GLD xv, , X0, X, 0x00 GPERMI xv, q, X0, X0, 0 GCOMPLEXMUL GXCONJ, \ - xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2 + xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2 .endm .macro ZGEMV_N_1x1 From b2db064285bfaa36ac44b3a61cb4930cf0076d8d Mon Sep 17 00:00:00 2001 From: pengxu Date: Tue, 27 Feb 2024 10:47:49 +0800 Subject: [PATCH 23/23] Optimized sgemv and dgemv kernel LSX for LoongArch --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 12 + kernel/loongarch64/dgemv_n_lsx.S | 229 +++++++++++++++++++ kernel/loongarch64/dgemv_t_lsx.S | 279 +++++++++++++++++++++++ kernel/loongarch64/sgemv_n_lsx.S | 227 ++++++++++++++++++ kernel/loongarch64/sgemv_t_lsx.S | 275 ++++++++++++++++++++++ 5 files changed, 1022 insertions(+) create mode 100644 kernel/loongarch64/dgemv_n_lsx.S create mode 100644 kernel/loongarch64/dgemv_t_lsx.S create mode 100644 kernel/loongarch64/sgemv_n_lsx.S create mode 100644 kernel/loongarch64/sgemv_t_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index c7ef44035..5b54a2ada 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -85,6 +85,12 @@ ZSWAPKERNEL = cswap_lsx.S CSUMKERNEL = csum_lsx.S ZSUMKERNEL = csum_lsx.S +SGEMVNKERNEL = sgemv_n_lsx.S +SGEMVTKERNEL = sgemv_t_lsx.S + +DGEMVNKERNEL = dgemv_n_lsx.S +DGEMVTKERNEL = dgemv_t_lsx.S + DGEMMKERNEL = dgemm_kernel_8x4.S DGEMMINCOPY = dgemm_ncopy_8_lsx.S DGEMMITCOPY = dgemm_tcopy_8_lsx.S @@ -100,6 +106,9 @@ DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +CGEMVNKERNEL = cgemv_n_4_lsx.S +CGEMVTKERNEL = cgemv_t_4_lsx.S + CGEMMKERNEL = cgemm_kernel_8x4_lsx.S CGEMMINCOPY = cgemm_ncopy_8_lsx.S CGEMMITCOPY = cgemm_tcopy_8_lsx.S @@ -115,6 +124,9 @@ CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +ZGEMVNKERNEL = zgemv_n_2_lsx.S +ZGEMVTKERNEL = zgemv_t_2_lsx.S + ZGEMMKERNEL = zgemm_kernel_4x4_lsx.S ZGEMMONCOPY = zgemm_ncopy_4_lsx.S ZGEMMOTCOPY = zgemm_tcopy_4_lsx.S diff --git a/kernel/loongarch64/dgemv_n_lsx.S b/kernel/loongarch64/dgemv_n_lsx.S new file mode 100644 index 000000000..9a0141fb1 --- /dev/null +++ b/kernel/loongarch64/dgemv_n_lsx.S @@ -0,0 +1,229 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +/* Param */ +#define M $r4 +#define N $r5 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INCX $r10 +#define Y $r11 +#define INCY $r6 +#define BUFFER $r16 +#define ALPHA $f0 + +#define YORIG $r18 +#define T0 $r19 +#define T1 $r20 +#define XX $r12 +#define YY $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 +#define IX $r25 +#define IY $r26 +#define II $r27 +#define T2 $r28 +#define T3 $r29 +#define T4 $r30 + +/* LSX vectors */ +#define U0 $vr11 +#define U1 $vr12 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define VALPHA $vr10 + +#define a1 $f3 +#define a2 $f4 +#define a3 $f5 +#define a4 $f6 +#define a5 $f7 +#define a6 $f8 +#define a7 $f9 +#define a8 $f10 + + + PROLOGUE + + LDARG INCY, $sp, 0 + LDARG BUFFER, $sp, 8 + + addi.d $sp, $sp, -80 + + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 32 + SDARG $r27, $sp, 40 + SDARG $r28, $sp, 48 + SDARG $r29, $sp, 56 + SDARG $r30, $sp, 64 + ST ALPHA, $sp, 72 + + vldrepl.d VALPHA, $sp, 72 + + slli.d LDA, LDA, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + + bge $r0, M, .L999 + bge $r0, N, .L999 + + move J, $r0 + move IX, $r0 + + move AO1, A //a_ptr + move XX, X + move YY, Y + + beq J, M, .L999 + +.L01: + vldx U0, XX, IX + vshuf4i.d U0, U0, 0x00 + + vfmul.d U1, VALPHA, U0 //temp1 + + move IY, $r0 + move II, $r0 + move I, $r0 + + srai.d T0, M, 2 //n/4 + beq I, T0, .L03 + +.L02: + vldx U2, AO1, II + addi.d II, II, 16 + vldx U7, AO1, II + + move T1, IY + add.d T2, T1, INCY + add.d T3, T2, INCY + add.d T4, T3, INCY + + fldx.d a1, YY, T1 + fldx.d a2, YY, T2 + fldx.d a3, YY, T3 + fldx.d a4, YY, T4 + + vextrins.d U3, U4, 0x10 + vextrins.d U5, U6, 0x10 + + vfmadd.d U3, U1, U2, U3 + vfmadd.d U5, U1, U7, U5 + + vextrins.d U4, U3, 0x01 + vextrins.d U6, U5, 0x01 + + fstx.d a1, YY, T1 + fstx.d a2, YY, T2 + fstx.d a3, YY, T3 + fstx.d a4, YY, T4 + + add.d IY, T4, INCY + addi.d II, II, 16 + addi.d I, I, 1 + blt I, T0, .L02 + +.L03: + andi T0, M, 2 + beq $r0, T0, .L04 + + addi.d T1, $r0, 4 + mod.d T1, M, T1 + sub.d II, M, T1 + slli.d II, II, BASE_SHIFT + + move T1, IY + add.d T2, T1, INCY + + vldx U2, AO1, II + + fldx.d a1, YY, T1 + fldx.d a2, YY, T2 + + vextrins.d U3, U4, 0x10 + + vfmadd.d U3, U1, U2, U3 + + vextrins.d U4, U3, 0x01 + + fstx.d a1, YY, T1 + fstx.d a2, YY, T2 + + add.d IY, T2, INCY + +.L04: + andi T0, M, 1 + beq $r0, T0, .L05 + + addi.d II, M, -1 + slli.d II, II, BASE_SHIFT + + fldx.d a1, AO1, II + fldx.d a3, YY, IY + + fmadd.d a3, $f12, a1, a3 + + fstx.d a3, YY, IY + + add.d IY, IY, INCY + +.L05: + add.d AO1, AO1, LDA + add.d IX, IX, INCX + + addi.d J, J, 1 + blt J, N, .L01 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 32 + LDARG $r27, $sp, 40 + LDARG $r28, $sp, 48 + LDARG $r29, $sp, 56 + LDARG $r30, $sp, 64 + LD ALPHA, $sp, 72 + addi.d $sp, $sp, 80 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/dgemv_t_lsx.S b/kernel/loongarch64/dgemv_t_lsx.S new file mode 100644 index 000000000..76f0d9bdc --- /dev/null +++ b/kernel/loongarch64/dgemv_t_lsx.S @@ -0,0 +1,279 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +/* Param */ +#define M $r4 +#define N $r5 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INCX $r10 +#define Y $r11 +#define INCY $r6 +#define BUFFER $r16 +#define ALPHA $f0 + +#define YORIG $r18 +#define T0 $r19 +#define T1 $r20 +#define AO3 $r12 +#define AO4 $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 +#define IX $r25 +#define IY $r26 +#define II $r27 +#define T2 $r28 +#define T3 $r29 +#define T4 $r30 + +/* LSX vectors */ +#define U0 $vr11 +#define U1 $vr12 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define VALPHA $vr10 + +#define a1 $f3 +#define a2 $f4 +#define a3 $f5 +#define a4 $f6 +#define a5 $f7 +#define a6 $f8 +#define a7 $f9 +#define a8 $f10 + + + PROLOGUE + + LDARG INCY, $sp, 0 + LDARG BUFFER, $sp, 8 + + addi.d $sp, $sp, -80 + + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 32 + SDARG $r27, $sp, 40 + SDARG $r28, $sp, 48 + SDARG $r29, $sp, 56 + SDARG $r30, $sp, 64 + ST ALPHA, $sp, 72 + + vldrepl.d VALPHA, $sp, 72 + + slli.d LDA, LDA, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + + bge $r0, M, .L999 + bge $r0, N, .L999 + + move J, $r0 + move IY, $r0 + + move AO1, A //a_ptr1 + + srai.d T0, N, 2 //n/4 + beq J, T0, .L04 + +.L01: /* j