diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index 49721958a..669aa8116 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -42,6 +42,7 @@ jobs: - name: Install Dependencies run: | if [ "$RUNNER_OS" == "Linux" ]; then + sudo apt-get update sudo apt-get install -y gfortran cmake ccache libtinfo5 elif [ "$RUNNER_OS" == "macOS" ]; then # It looks like "gfortran" isn't working correctly unless "gcc" is re-installed. diff --git a/CMakeLists.txt b/CMakeLists.txt index a4440ee1b..9fbe878e6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,6 +24,8 @@ option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, d option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON) +option(BUILD_BENCHMARKS "Build the collection of BLAS/LAPACK benchmarks" OFF) + option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF) option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF) @@ -328,7 +330,7 @@ if (NOT NOFORTRAN) # Build test and ctest add_subdirectory(test) endif() - if (BUILD_TESTING) + if (BUILD_TESTING AND NOT BUILD_WITHOUT_LAPACK) add_subdirectory(lapack-netlib/TESTING) endif() endif() @@ -458,6 +460,61 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") endif() endif() +if (BUILD_BENCHMARKS) + #find_package(OpenMP REQUIRED) + file(GLOB SOURCES "benchmark/*.c") + if (NOT USE_OPENMP) + file(GLOB REMFILE "benchmark/smallscaling.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + endif() + if (BUILD_WITHOUT_LAPACK) + file(GLOB REMFILE "benchmark/cholesky.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + file(GLOB REMFILE "benchmark/geev.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + file(GLOB REMFILE "benchmark/gesv.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + file(GLOB REMFILE "benchmark/getri.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + file(GLOB REMFILE "benchmark/potrf.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + file(GLOB REMFILE "benchmark/spmv.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + file(GLOB REMFILE "benchmark/symv.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + file(GLOB REMFILE "benchmark/linpack.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + endif() + if (NOT USE_GEMM3M) + file(GLOB REMFILE "benchmark/gemm3m.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + endif() + foreach(source ${SOURCES}) + get_filename_component(name ${source} NAME_WE) + if ((NOT ${name} STREQUAL "zdot-intel") AND (NOT ${name} STREQUAL "cula_wrapper")) + set(defines DEFAULT COMPLEX DOUBLE "COMPLEX\;DOUBLE") + foreach(define ${defines}) + set(target_name "benchmark_${name}") + if (NOT "${define}" STREQUAL "DEFAULT") + string(JOIN "_" define_str ${define}) + set(target_name "${target_name}_${define_str}") + endif() + if ((NOT ${target_name} STREQUAL "benchmark_imax_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_imax_COMPLEX_DOUBLE") AND + (NOT ${target_name} STREQUAL "benchmark_imin_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_imin_COMPLEX_DOUBLE") AND + (NOT ${target_name} STREQUAL "benchmark_max_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_max_COMPLEX_DOUBLE") AND + (NOT ${target_name} STREQUAL "benchmark_min_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_min_COMPLEX_DOUBLE")) + add_executable(${target_name} ${source}) + target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) + target_link_libraries(${target_name} ${OpenBLAS_LIBNAME} ) + # target_link_libraries(${target_name} ${OpenBLAS_LIBNAME} OpenMP::OpenMP_C) + if (NOT "${define}" STREQUAL "DEFAULT") + target_compile_definitions(${target_name} PRIVATE ${define}) + endif() + endif() + endforeach() + endif() + endforeach() +endif() # Install project diff --git a/Makefile.system b/Makefile.system index cb5453cac..aadf3459a 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1520,10 +1520,18 @@ ifndef LIBNAMEPREFIX LIBNAMEPREFIX = endif +SYMPREFIX=$(SYMBOLPREFIX) +ifeq ($(SYMBOLPREFIX),$(LIBNAMEPREFIX)) +SYMPREFIX= +endif +SYMSUFFIX=$(SYMBOLSUFFIX) +ifeq ($(SYMBOLSUFFIX),$(LIBNAMESUFFIX)) +SYMSUFFIX= +endif ifndef LIBNAMESUFFIX -LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX) +LIBNAMEBASE = $(SYMPREFIX)$(LIBSONAMEBASE)$(SYMSUFFIX) else -LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)$(LIBNAMESUFFIX) +LIBNAMEBASE = $(SYMPREFIX)$(LIBSONAMEBASE)$(SYMSUFFIX)$(LIBNAMESUFFIX) endif ifeq ($(OSNAME), CYGWIN_NT) diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt index 6e0a7f309..c56a78346 100644 --- a/ctest/CMakeLists.txt +++ b/ctest/CMakeLists.txt @@ -88,6 +88,17 @@ if (NOT NOFORTRAN) auxiliary.c c_xerbla.c constant.c) + if (USE_GEMM3M) + if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z")) + add_executable(x${float_char}cblat3_3m + c_${float_char}blat3_3m.f + c_${float_char}blas3_3m.c + c_${float_char}3chke_3m.c + auxiliary.c + c_xerbla.c + constant.c) + endif() + endif() else() add_executable(x${float_char}cblat3 c_${float_char}blat3c.c @@ -96,6 +107,17 @@ else() auxiliary.c c_xerbla.c constant.c) + if (USE_GEMM3M) + if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z")) + add_executable(x${float_char}cblat3_3m + c_${float_char}blat3c_3m.c + c_${float_char}blas3_3m.c + c_${float_char}3chke_3m.c + auxiliary.c + c_xerbla.c + constant.c) + endif() + endif() endif() target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME}) if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) @@ -105,7 +127,24 @@ endif() if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") target_link_libraries(x${float_char}cblat3 m) endif() + if (USE_GEMM3M) + if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z")) + target_link_libraries(x${float_char}cblat3_3m ${OpenBLAS_LIBNAME}) + if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) + string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") + target_link_libraries(x${float_char}cblat3 omp pthread) + endif() + if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") + target_link_libraries(x${float_char}cblat3_3m m) + endif() + endif() + endif() add_test(NAME "x${float_char}cblat3" COMMAND ${test_helper} $ "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3") - + if (USE_GEMM3M) + if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z")) + add_test(NAME "x${float_char}cblat3_3m" + COMMAND ${test_helper} $ "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3_3m") + endif() + endif() endforeach() diff --git a/ctest/Makefile b/ctest/Makefile index ad960b35a..bbaf96f8e 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -5,6 +5,24 @@ TOPDIR = .. include $(TOPDIR)/Makefile.system +SUPPORT_GEMM3M = 0 + +ifeq ($(ARCH), x86) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), x86_64) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), ia64) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), MIPS) +SUPPORT_GEMM3M = 1 +endif + override CFLAGS += -DADD$(BU) -DCBLAS ifeq ($(F_COMPILER),GFORTRAN) override FFLAGS += -fno-tree-vectorize @@ -144,9 +162,15 @@ all3targets += xdcblat3 endif ifeq ($(BUILD_COMPLEX),1) all3targets += xccblat3 +ifeq ($(SUPPORT_GEMM3M),1) +all3targets += xccblat3_3m +endif endif ifeq ($(BUILD_COMPLEX16),1) all3targets += xzcblat3 +ifeq ($(SUPPORT_GEMM3M),1) +all3targets += xzcblat3_3m +endif endif all3: $(all3targets) @@ -181,9 +205,9 @@ endif endif endif -all3_3m: xzcblat3_3m xccblat3_3m +ifeq ($(SUPPORT_GEMM3M),1) ifeq ($(USE_OPENMP), 1) -ifeq ($(BUILD_SINGLE),1) +ifeq ($(BUILD_COMPLEX),1) OMP_NUM_THREADS=2 ./xccblat3_3m < cin3_3m endif ifeq ($(BUILD_COMPLEX16),1) @@ -197,6 +221,7 @@ ifeq ($(BUILD_COMPLEX16),1) OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m endif endif +endif @@ -271,8 +296,10 @@ xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xccblat2 c_cblat2.o $(ctestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +ifeq ($(SUPPORT_GEMM3M),1) xccblat3_3m: $(ctestl3o_3m) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) +endif else xccblat1: $(ctestl1o) c_cblat1c.o $(TOPDIR)/$(LIBNAME) $(CC) $(CFLAGS) -o xccblat1 c_cblat1c.o $(ctestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) @@ -280,6 +307,10 @@ xccblat2: $(ctestl2o) c_cblat2c.o $(TOPDIR)/$(LIBNAME) $(CC) $(CFLAGS) -o xccblat2 c_cblat2c.o $(ctestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) xccblat3: $(ctestl3o) c_cblat3c.o $(TOPDIR)/$(LIBNAME) $(CC) $(CFLAGS) -o xccblat3 c_cblat3c.o $(ctestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) +ifeq ($(SUPPORT_GEMM3M),1) +xccblat3_3m: $(ctestl3o_3m) c_cblat3c_3m.o $(TOPDIR)/$(LIBNAME) + $(CC) $(CFLAGS) -o xccblat3_3m c_cblat3c_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) +endif endif endif @@ -293,8 +324,10 @@ xzcblat2: $(ztestl2o) c_zblat2.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xzcblat2 c_zblat2.o $(ztestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +ifeq ($(SUPPORT_GEMM3M),1) xzcblat3_3m: $(ztestl3o_3m) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) +endif else xzcblat1: $(ztestl1o) c_zblat1c.o $(TOPDIR)/$(LIBNAME) $(CC) $(CFLAGS) -o xzcblat1 c_zblat1c.o $(ztestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) @@ -302,6 +335,10 @@ xzcblat2: $(ztestl2o) c_zblat2c.o $(TOPDIR)/$(LIBNAME) $(CC) $(CFLAGS) -o xzcblat2 c_zblat2c.o $(ztestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) xzcblat3: $(ztestl3o) c_zblat3c.o $(TOPDIR)/$(LIBNAME) $(CC) $(CFLAGS) -o xzcblat3 c_zblat3c.o $(ztestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) +ifeq ($(SUPPORT_GEMM3M),1) +xzcblat3_3m: $(ztestl3o_3m) c_zblat3c_3m.o $(TOPDIR)/$(LIBNAME) + $(CC) $(CFLAGS) -o xzcblat3_3m c_zblat3c_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) +endif endif endif diff --git a/ctest/c_cblat3c_3m.c b/ctest/c_cblat3c_3m.c new file mode 100644 index 000000000..b5d6bf9cb --- /dev/null +++ b/ctest/c_cblat3c_3m.c @@ -0,0 +1,3942 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#include "common.h" + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 + + +/* Common Block Declarations */ + +struct { + integer infot, noutc; + logical ok, lerr; +} infoc_; + +#define infoc_1 infoc_ + +struct { + char srnamt[12]; +} srnamc_; + +#define srnamc_1 srnamc_ + +/* Table of constant values */ + +static complex c_b1 = {0.f,0.f}; +static complex c_b2 = {1.f,0.f}; +static integer c__1 = 1; +static integer c__65 = 65; +static integer c__6 = 6; +static real c_b91 = 1.f; +static logical c_true = TRUE_; +static integer c__0 = 0; +static logical c_false = FALSE_; + +int /* Main program */ main(void) +{ + /* Initialized data */ + + static char snames[9][13] = {"cblas_cgemm3m ", "cblas_chemm ", "cblas_csymm ", + "cblas_ctrmm ", "cblas_ctrsm ", "cblas_cherk ", "cblas_csyrk ", + "cblas_cher2k", "cblas_csyr2k"}; + + /* System generated locals */ + integer i__1, i__2, i__3, i__4, i__5; + real r__1; + + /* Local variables */ + integer nalf, idim[9]; + logical same; + integer nbet, ntra; + logical rewi; + extern /* Subroutine */ int cchk1_(char *, real *, real *, integer *, + integer *, logical *, logical *, logical *, integer *, integer *, + integer *, complex *, integer *, complex *, integer *, complex *, + complex *, complex *, complex *, complex *, complex *, complex *, + complex *, complex *, complex *, real *, integer *), + cchk2_(char *, real *, real *, integer *, integer *, logical *, + logical *, logical *, integer *, integer *, integer *, complex *, + integer *, complex *, integer *, complex *, complex *, complex *, + complex *, complex *, complex *, complex *, complex *, complex *, + complex *, real *, integer *), cchk3_(char *, real *, + real *, integer *, integer *, logical *, logical *, logical *, + integer *, integer *, integer *, complex *, integer *, complex *, + complex *, complex *, complex *, complex *, complex *, complex *, + real *, complex *, integer *), cchk4_(char *, real *, + real *, integer *, integer *, logical *, logical *, logical *, + integer *, integer *, integer *, complex *, integer *, complex *, + integer *, complex *, complex *, complex *, complex *, complex *, + complex *, complex *, complex *, complex *, complex *, real *, + integer *), cchk5_(char *, real *, real *, integer *, + integer *, logical *, logical *, logical *, integer *, integer *, + integer *, complex *, integer *, complex *, integer *, complex *, + complex *, complex *, complex *, complex *, complex *, complex *, + complex *, complex *, real *, complex *, integer *); + complex c__[4225] /* was [65][65] */; + real g[65]; + integer i__, j, n; + logical fatal; + complex w[130]; + extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, complex *, real *, complex *, + integer *, real *, real *, logical *, integer *, logical *); + extern real sdiff_(real *, real *); + logical trace; + integer nidim; + char snaps[32]; + integer isnum; + logical ltest[9]; + complex aa[4225], ab[8450] /* was [65][130] */, bb[4225], cc[4225], as[ + 4225], bs[4225], cs[4225], ct[65]; + logical sfatal, corder; + char snamet[12], transa[1], transb[1]; + real thresh; + logical rorder; + extern /* Subroutine */ int cc3chke_(char *); + integer layout; + logical ltestt, tsterr; + complex alf[7]; + extern logical lce_(complex *, complex *, integer *); + complex bet[7]; + real eps, err; + char tmpchar; + +/* Test program for the COMPLEX Level 3 Blas. */ + +/* The program must be driven by a short data file. The first 13 records */ +/* of the file are read using list-directed input, the last 9 records */ +/* are read using the format ( A12, L2 ). An annotated example of a data */ +/* file can be obtained by deleting the first 3 characters from the */ +/* following 22 lines: */ +/* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE */ +/* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) */ +/* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. */ +/* F LOGICAL FLAG, T TO STOP ON FAILURES. */ +/* T LOGICAL FLAG, T TO TEST ERROR CALL MYEXITS. */ +/* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH */ +/* 16.0 THRESHOLD VALUE OF TEST RATIO */ +/* 6 NUMBER OF VALUES OF N */ +/* 0 1 2 3 5 9 VALUES OF N */ +/* 3 NUMBER OF VALUES OF ALPHA */ +/* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA */ +/* 3 NUMBER OF VALUES OF BETA */ +/* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA */ +/* cblas_cgemm T PUT F FOR NO TEST. SAME COLUMNS. */ +/* cblas_chemm T PUT F FOR NO TEST. SAME COLUMNS. */ +/* cblas_csymm T PUT F FOR NO TEST. SAME COLUMNS. */ +/* cblas_ctrmm T PUT F FOR NO TEST. SAME COLUMNS. */ +/* cblas_ctrsm T PUT F FOR NO TEST. SAME COLUMNS. */ +/* cblas_cherk T PUT F FOR NO TEST. SAME COLUMNS. */ +/* cblas_csyrk T PUT F FOR NO TEST. SAME COLUMNS. */ +/* cblas_cher2k T PUT F FOR NO TEST. SAME COLUMNS. */ +/* cblas_csyr2k T PUT F FOR NO TEST. SAME COLUMNS. */ + +/* See: */ + +/* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. */ +/* A Set of Level 3 Basic Linear Algebra Subprograms. */ + +/* Technical Memorandum No.88 (Revision 1), Mathematics and */ +/* Computer Science Division, Argonne National Laboratory, 9700 */ +/* South Cass Avenue, Argonne, Illinois 60439, US. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + + infoc_1.noutc = 6; + +/* Read name and unit number for snapshot output file and open file. */ + char line[80]; + + fgets(line,80,stdin); + sscanf(line,"'%s'",snaps); + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%ld",&ntra); +#else + sscanf(line,"%d",&ntra); +#endif + trace = ntra >= 0; + if (trace) { +/* o__1.oerr = 0; + o__1.ounit = ntra; + o__1.ofnmlen = 32; + o__1.ofnm = snaps; + o__1.orl = 0; + o__1.osta = 0; + o__1.oacc = 0; + o__1.ofm = 0; + o__1.oblnk = 0; + f_open(&o__1);*/ + } +/* Read the flag that directs rewinding of the snapshot file. */ + fgets(line,80,stdin); + sscanf(line,"%d",&rewi); + rewi = rewi && trace; +/* Read the flag that directs stopping on any failure. */ + fgets(line,80,stdin); + sscanf(line,"%c",&tmpchar); + sfatal=FALSE_; + if (tmpchar=='T')sfatal=TRUE_; + fgets(line,80,stdin); + sscanf(line,"%c",&tmpchar); + tsterr=FALSE_; + if (tmpchar=='T')tsterr=TRUE_; + fgets(line,80,stdin); + sscanf(line,"%d",&layout); + fgets(line,80,stdin); + sscanf(line,"%f",&thresh); + + +/* Read and check the parameter values for the tests. */ + +/* Values of N */ + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%ld",&nidim); +#else + sscanf(line,"%d",&nidim); +#endif + + if (nidim < 1 || nidim > 9) { + fprintf(stderr,"NUMBER OF VALUES OF N IS LESS THAN 1 OR GREATER THAN 9"); + goto L220; + } + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%ld %ld %ld %ld %ld %ld %ld %ld %ld",&idim[0],&idim[1],&idim[2], + &idim[3],&idim[4],&idim[5],&idim[6],&idim[7],&idim[8]); +#else + sscanf(line,"%d %d %d %d %d %d %d %d %d",&idim[0],&idim[1],&idim[2], + &idim[3],&idim[4],&idim[5],&idim[6],&idim[7],&idim[8]); +#endif + i__1 = nidim; + for (i__ = 1; i__ <= i__1; ++i__) { + if (idim[i__ - 1] < 0 || idim[i__ - 1] > 65) { + fprintf(stderr,"VALUE OF N IS LESS THAN 0 OR GREATER THAN 65\n"); + goto L220; + } +/* L10: */ + } +/* Values of ALPHA */ + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%ld",&nalf); +#else + sscanf(line,"%d",&nalf); +#endif + if (nalf < 1 || nalf > 7) { + fprintf(stderr,"VALUE OF ALPHA IS LESS THAN 0 OR GREATER THAN 7\n"); + goto L220; + } + fgets(line,80,stdin); + sscanf(line,"(%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f)",&alf[0].r,&alf[0].i,&alf[1].r,&alf[1].i,&alf[2].r,&alf[2].i,&alf[3].r,&alf[3].i, + &alf[4].r,&alf[4].i,&alf[5].r,&alf[5].i,&alf[6].r,&alf[6].i); + +// i__1 = nalf; +// for (i__ = 1; i__ <= i__1; ++i__) { +// do_lio(&c__6, &c__1, (char *)&alf[i__ - 1], (ftnlen)sizeof(complex)); +// } +/* Values of BETA */ + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%ld",&nbet); +#else + sscanf(line,"%d",&nbet); +#endif + if (nalf < 1 || nbet > 7) { + fprintf(stderr,"VALUE OF BETA IS LESS THAN 0 OR GREATER THAN 7\n"); + goto L220; + } + fgets(line,80,stdin); + sscanf(line,"(%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f)",&bet[0].r,&bet[0].i,&bet[1].r,&bet[1].i,&bet[2].r,&bet[2].i,&bet[3].r,&bet[3].i, + &bet[4].r,&bet[4].i,&bet[5].r,&bet[5].i,&bet[6].r,&bet[6].i); + + +/* Report values of parameters. */ + + printf("TESTS OF THE COMPLEX LEVEL 3 BLAS\nTHE FOLLOWING PARAMETER VALUES WILL BE USED:\n"); + printf(" FOR N"); + for (i__ =1; i__ <=nidim;++i__) printf(" %d",idim[i__-1]); + printf("\n"); + printf(" FOR ALPHA"); + for (i__ =1; i__ <=nalf;++i__) printf(" (%f,%f)",alf[i__-1].r,alf[i__-1].i); + printf("\n"); + printf(" FOR BETA"); + for (i__ =1; i__ <=nbet;++i__) printf(" (%f,%f)",bet[i__-1].r,bet[i__-1].i); + printf("\n"); + + if (! tsterr) { + printf(" ERROR-EXITS WILL NOT BE TESTED\n"); + } + printf("ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LESS THAN %f\n",thresh); + rorder = FALSE_; + corder = FALSE_; + if (layout == 2) { + rorder = TRUE_; + corder = TRUE_; + printf("COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED\n"); + } else if (layout == 1) { + rorder = TRUE_; + printf("ROW-MAJOR DATA LAYOUT IS TESTED\n"); + } else if (layout == 0) { + corder = TRUE_; + printf("COLUMN-MAJOR DATA LAYOUT IS TESTED\n"); + } + +/* Read names of subroutines and flags which indicate */ +/* whether they are to be tested. */ + + for (i__ = 1; i__ <= 9; ++i__) { + ltest[i__ - 1] = FALSE_; +/* L20: */ + } +L30: + if (! fgets(line,80,stdin)) { + goto L60; + } + i__1 = sscanf(line,"%12c %c",snamet,&tmpchar); + ltestt=FALSE_; + if (tmpchar=='T')ltestt=TRUE_; + if (i__1 < 2) { + goto L60; + } + for (i__ = 1; i__ <= 9; ++i__) { + if (s_cmp(snamet, snames[i__ - 1] , (ftnlen)12, (ftnlen)12) == + 0) { + goto L50; + } +/* L40: */ + } + printf("SUBPROGRAM NAME %s NOT RECOGNIZED\n****** TESTS ABANDONED ******\n",snamet); + exit(1); +L50: + ltest[i__ - 1] = ltestt; + goto L30; + +L60: +/* cl__1.cerr = 0; + cl__1.cunit = 5; + cl__1.csta = 0; + f_clos(&cl__1);*/ + +/* Compute EPS (the machine precision). */ + + eps = 1.f; +L70: + r__1 = eps + 1.f; + if (sdiff_(&r__1, &c_b91) == 0.f) { + goto L80; + } + eps *= .5f; + goto L70; +L80: + eps += eps; + printf("RELATIVE MACHINE PRECISION IS TAKEN TO BE %9.1g\n",eps); + +/* Check the reliability of CMMCH using exact data. */ + + n = 32; + i__1 = n; + for (j = 1; j <= i__1; ++j) { + i__2 = n; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * 65 - 66; +/* Computing MAX */ + i__5 = i__ - j + 1; + i__4 = f2cmax(i__5,0); + ab[i__3].r = (real) i__4, ab[i__3].i = 0.f; +/* L90: */ + } + i__2 = j + 4224; + ab[i__2].r = (real) j, ab[i__2].i = 0.f; + i__2 = (j + 65) * 65 - 65; + ab[i__2].r = (real) j, ab[i__2].i = 0.f; + i__2 = j - 1; + c__[i__2].r = 0.f, c__[i__2].i = 0.f; +/* L100: */ + } + i__1 = n; + for (j = 1; j <= i__1; ++j) { + i__2 = j - 1; + i__3 = j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 1) / 3; + cc[i__2].r = (real) i__3, cc[i__2].i = 0.f; +/* L110: */ + } +/* CC holds the exact result. On exit from CMMCH CT holds */ +/* the result computed by CMMCH. */ + *(unsigned char *)transa = 'N'; + *(unsigned char *)transb = 'N'; + cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + &c__6, &c_true); + same = lce_(cc, ct, &n); + if (! same || err != 0.f) { + printf("ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n"); + printf("CMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb); + printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err); + printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n"); + printf("****** TESTS ABANDONED ******\n"); + exit(1); + } + *(unsigned char *)transb = 'C'; + cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + &c__6, &c_true); + same = lce_(cc, ct, &n); + if (! same || err != 0.f) { + printf("ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n"); + printf("CMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb); + printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err); + printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n"); + printf("****** TESTS ABANDONED ******\n"); + exit(1); + } + i__1 = n; + for (j = 1; j <= i__1; ++j) { + i__2 = j + 4224; + i__3 = n - j + 1; + ab[i__2].r = (real) i__3, ab[i__2].i = 0.f; + i__2 = (j + 65) * 65 - 65; + i__3 = n - j + 1; + ab[i__2].r = (real) i__3, ab[i__2].i = 0.f; +/* L120: */ + } + i__1 = n; + for (j = 1; j <= i__1; ++j) { + i__2 = n - j; + i__3 = j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 1) / 3; + cc[i__2].r = (real) i__3, cc[i__2].i = 0.f; +/* L130: */ + } + *(unsigned char *)transa = 'C'; + *(unsigned char *)transb = 'N'; + cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + &c__6, &c_true); + same = lce_(cc, ct, &n); + if (! same || err != 0.f) { + printf("ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n"); + printf("CMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb); + printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err); + printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n"); + printf("****** TESTS ABANDONED ******\n"); + exit(1); + } + *(unsigned char *)transb = 'C'; + cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + &c__6, &c_true); + same = lce_(cc, ct, &n); + if (! same || err != 0.f) { + printf("ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n"); + printf("CMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb); + printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err); + printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n"); + printf("****** TESTS ABANDONED ******\n"); + exit(1); + } + +/* Test each subroutine in turn. */ + + for (isnum = 1; isnum <= 9; ++isnum) { + if (! ltest[isnum - 1]) { +/* Subprogram is not to be tested. */ + printf("%12s WAS NOT TESTED\n",snames[isnum-1]); + } else { + s_copy(srnamc_1.srnamt, snames[isnum - 1], (ftnlen)12, ( + ftnlen)12); +/* Test error exits. */ + if (tsterr) { + cc3chke_(snames[isnum - 1]); + } +/* Test computations. */ + infoc_1.infot = 0; + infoc_1.ok = TRUE_; + fatal = FALSE_; + switch (isnum) { + case 1: goto L140; + case 2: goto L150; + case 3: goto L150; + case 4: goto L160; + case 5: goto L160; + case 6: goto L170; + case 7: goto L170; + case 8: goto L180; + case 9: goto L180; + } +/* Test CGEMM, 01. */ +L140: + if (corder) { + cchk1_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__0); + } + if (rorder) { + cchk1_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__1); + } + goto L190; +/* Test CHEMM, 02, CSYMM, 03. */ +L150: + if (corder) { + cchk2_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__0); + } + if (rorder) { + cchk2_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__1); + } + goto L190; +/* Test CTRMM, 04, CTRSM, 05. */ +L160: + if (corder) { + cchk3_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, & + c__0); + } + if (rorder) { + cchk3_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, & + c__1); + } + goto L190; +/* Test CHERK, 06, CSYRK, 07. */ +L170: + if (corder) { + cchk4_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__0); + } + if (rorder) { + cchk4_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__1); + } + goto L190; +/* Test CHER2K, 08, CSYR2K, 09. */ +L180: + if (corder) { + cchk5_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, + ct, g, w, &c__0); + } + if (rorder) { + cchk5_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, + ct, g, w, &c__1); + } + goto L190; + +L190: + if (fatal && sfatal) { + goto L210; + } + } +/* L200: */ + } + printf("\nEND OF TESTS\n"); + goto L230; + +L210: + printf("\n****** FATAL ERROR - TESTS ABANDONED ******\n"); + goto L230; + +L220: + printf("AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM\n"); + printf("****** TESTS ABANDONED ******\n"); +L230: + if (trace) { +/* cl__1.cerr = 0; + cl__1.cunit = ntra; + cl__1.csta = 0; + f_clos(&cl__1);*/ + } +/* cl__1.cerr = 0; + cl__1.cunit = 6; + cl__1.csta = 0; + f_clos(&cl__1); + s_stop("", (ftnlen)0);*/ + exit(0); + +/* End of CBLAT3. */ + + return 0; +} /* MAIN__ */ + +/* Subroutine */ int cchk1_(char *sname, real *eps, real *thresh, integer * + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + integer *nidim, integer *idim, integer *nalf, complex *alf, integer * + nbet, complex *bet, integer *nmax, complex *a, complex *aa, complex * + as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc, + complex *cs, complex *ct, real *g, integer *iorder) +{ + /* Initialized data */ + + static char ich[3] = "NTC"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4, i__5, i__6, i__7, i__8; + + /* Local variables */ + complex beta; + integer ldas, ldbs, ldcs; + logical same, null; + integer i__, k, m, n; + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, logical *, + complex *); + complex alpha; + extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, complex *, real *, complex *, + integer *, real *, real *, logical *, integer *, logical *); + logical isame[13], trana, tranb; + integer nargs; + logical reset; + extern /* Subroutine */ int cprcn1_(integer *, integer *, char *, integer + *, char *, char *, integer *, integer *, integer *, complex *, + integer *, integer *, complex *, integer *); + integer ia, ib, ma, mb, na, nb, nc, ik, im, in; + extern /* Subroutine */ int ccgemm3m_(integer *, char *, char *, integer *, + integer *, integer *, complex *, complex *, integer *, complex *, + integer *, complex *, complex *, integer *); + integer ks, ms, ns; + extern logical lceres_(char *, char *, integer *, integer *, complex *, + complex *, integer *); + char tranas[1], tranbs[1], transa[1], transb[1]; + real errmax; + integer ica, icb, laa, lbb, lda, lcc, ldb, ldc; + extern logical lce_(complex *, complex *, integer *); + complex als, bls; + real err; + +/* Tests CGEMM. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --idim; + --alf; + --bet; + --g; + --ct; + --cs; + --cc; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --bs; + --bb; + b_dim1 = *nmax; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --as; + --aa; + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + + /* Function Body */ + + nargs = 13; + nc = 0; + reset = TRUE_; + errmax = 0.f; + + i__1 = *nidim; + for (im = 1; im <= i__1; ++im) { + m = idim[im]; + + i__2 = *nidim; + for (in = 1; in <= i__2; ++in) { + n = idim[in]; +/* Set LDC to 1 more than minimum value if room. */ + ldc = m; + if (ldc < *nmax) { + ++ldc; + } +/* Skip tests if not enough room. */ + if (ldc > *nmax) { + goto L100; + } + lcc = ldc * n; + null = n <= 0 || m <= 0; + + i__3 = *nidim; + for (ik = 1; ik <= i__3; ++ik) { + k = idim[ik]; + + for (ica = 1; ica <= 3; ++ica) { + *(unsigned char *)transa = *(unsigned char *)&ich[ica - 1] + ; + trana = *(unsigned char *)transa == 'T' || *(unsigned + char *)transa == 'C'; + + if (trana) { + ma = k; + na = m; + } else { + ma = m; + na = k; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = ma; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L80; + } + laa = lda * na; + +/* Generate the matrix A. */ + + cmake_("ge", " ", " ", &ma, &na, &a[a_offset], nmax, &aa[ + 1], &lda, &reset, &c_b1); + + for (icb = 1; icb <= 3; ++icb) { + *(unsigned char *)transb = *(unsigned char *)&ich[icb + - 1]; + tranb = *(unsigned char *)transb == 'T' || *(unsigned + char *)transb == 'C'; + + if (tranb) { + mb = n; + nb = k; + } else { + mb = k; + nb = n; + } +/* Set LDB to 1 more than minimum value if room. */ + ldb = mb; + if (ldb < *nmax) { + ++ldb; + } +/* Skip tests if not enough room. */ + if (ldb > *nmax) { + goto L70; + } + lbb = ldb * nb; + +/* Generate the matrix B. */ + + cmake_("ge", " ", " ", &mb, &nb, &b[b_offset], nmax, & + bb[1], &ldb, &reset, &c_b1); + + i__4 = *nalf; + for (ia = 1; ia <= i__4; ++ia) { + i__5 = ia; + alpha.r = alf[i__5].r, alpha.i = alf[i__5].i; + + i__5 = *nbet; + for (ib = 1; ib <= i__5; ++ib) { + i__6 = ib; + beta.r = bet[i__6].r, beta.i = bet[i__6].i; + +/* Generate the matrix C. */ + + cmake_("ge", " ", " ", &m, &n, &c__[c_offset], + nmax, &cc[1], &ldc, &reset, &c_b1); + + ++nc; + +/* Save every datum before calling the */ +/* subroutine. */ + + *(unsigned char *)tranas = *(unsigned char *) + transa; + *(unsigned char *)tranbs = *(unsigned char *) + transb; + ms = m; + ns = n; + ks = k; + als.r = alpha.r, als.i = alpha.i; + i__6 = laa; + for (i__ = 1; i__ <= i__6; ++i__) { + i__7 = i__; + i__8 = i__; + as[i__7].r = aa[i__8].r, as[i__7].i = aa[ + i__8].i; +/* L10: */ + } + ldas = lda; + i__6 = lbb; + for (i__ = 1; i__ <= i__6; ++i__) { + i__7 = i__; + i__8 = i__; + bs[i__7].r = bb[i__8].r, bs[i__7].i = bb[ + i__8].i; +/* L20: */ + } + ldbs = ldb; + bls.r = beta.r, bls.i = beta.i; + i__6 = lcc; + for (i__ = 1; i__ <= i__6; ++i__) { + i__7 = i__; + i__8 = i__; + cs[i__7].r = cc[i__8].r, cs[i__7].i = cc[ + i__8].i; +/* L30: */ + } + ldcs = ldc; + +/* Call the subroutine. */ + + if (*trace) { + cprcn1_(ntra, &nc, sname, iorder, transa, + transb, &m, &n, &k, &alpha, &lda, + &ldb, &beta, &ldc); + } + if (*rewi) { +/* al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1); */ + } + ccgemm3m_(iorder, transa, transb, &m, &n, &k, & + alpha, &aa[1], &lda, &bb[1], &ldb, & + beta, &cc[1], &ldc); + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { +// io___128.ciunit = *nout; +// s_wsfe(&io___128); +// e_wsfe(); + printf(" *** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); + *fatal = TRUE_; + goto L120; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)transa == *( + unsigned char *)tranas; + isame[1] = *(unsigned char *)transb == *( + unsigned char *)tranbs; + isame[2] = ms == m; + isame[3] = ns == n; + isame[4] = ks == k; + isame[5] = als.r == alpha.r && als.i == + alpha.i; + isame[6] = lce_(&as[1], &aa[1], &laa); + isame[7] = ldas == lda; + isame[8] = lce_(&bs[1], &bb[1], &lbb); + isame[9] = ldbs == ldb; + isame[10] = bls.r == beta.r && bls.i == + beta.i; + if (null) { + isame[11] = lce_(&cs[1], &cc[1], &lcc); + } else { + isame[11] = lceres_("ge", " ", &m, &n, & + cs[1], &cc[1], &ldc); + } + isame[12] = ldcs == ldc; + +/* If data was incorrectly changed, report */ +/* and return. */ + + same = TRUE_; + i__6 = nargs; + for (i__ = 1; i__ <= i__6; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__);; + } +/* L40: */ + } + if (! same) { + *fatal = TRUE_; + goto L120; + } + + if (! null) { + +/* Check the result. */ + + cmmch_(transa, transb, &m, &n, &k, &alpha, + &a[a_offset], nmax, &b[b_offset], + nmax, &beta, &c__[c_offset], + nmax, &ct[1], &g[1], &cc[1], &ldc, + eps, &err, fatal, nout, &c_true); + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L120; + } + } + +/* L50: */ + } + +/* L60: */ + } + +L70: + ; + } + +L80: + ; + } + +/* L90: */ + } + +L100: + ; + } + +/* L110: */ + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + if (*iorder == 1) { + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + } else { + if (*iorder == 0) { + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + if (*iorder == 1) { + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + } + goto L130; + +L120: + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); + cprcn1_(nout, &nc, sname, iorder, transa, transb, &m, &n, &k, &alpha, & + lda, &ldb, &beta, &ldc); + +L130: + return 0; + +/* 9995 FORMAT( 1X, I6, ': ', A12,'(''', A1, ''',''', A1, ''',', */ +/* $ 3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, */ +/* $ ',(', F4.1, ',', F4.1, '), C,', I3, ').' ) */ + +/* End of CCHK1. */ + +} /* cchk1_ */ + + +/* Subroutine */ int cprcn1_(integer *nout, integer *nc, char *sname, integer + *iorder, char *transa, char *transb, integer *m, integer *n, integer * + k, complex *alpha, integer *lda, integer *ldb, complex *beta, integer + *ldc) +{ + /* Local variables */ + char crc[14], cta[14], ctb[14]; + + if (*(unsigned char *)transa == 'N') { + s_copy(cta, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(cta, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cta, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transb == 'N') { + s_copy(ctb, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transb == 'T') { + s_copy(ctb, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ctb, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cta,ctb); + printf("%d %d %d (%4.1f,%4.1f) , A, %d, B, %d, (%4.1f,%4.1f) , C, %d.\n",*m,*n,*k,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc); + return 0; +} /* cprcn1_ */ + + +/* Subroutine */ int cchk2_(char *sname, real *eps, real *thresh, integer * + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + integer *nidim, integer *idim, integer *nalf, complex *alf, integer * + nbet, complex *bet, integer *nmax, complex *a, complex *aa, complex * + as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc, + complex *cs, complex *ct, real *g, integer *iorder) +{ + /* Initialized data */ + + static char ichs[2] = "LR"; + static char ichu[2] = "UL"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4, i__5, i__6, i__7; + + /* Local variables */ + complex beta; + integer ldas, ldbs, ldcs; + logical same; + char side[1]; + logical conj, left, null; + char uplo[1]; + integer i__, m, n; + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, logical *, + complex *); + complex alpha; + extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, complex *, real *, complex *, + integer *, real *, real *, logical *, integer *, logical *); + logical isame[13]; + char sides[1]; + integer nargs; + logical reset; + char uplos[1]; + extern /* Subroutine */ int cprcn2_(integer *, integer *, char *, integer + *, char *, char *, integer *, integer *, complex *, integer *, + integer *, complex *, integer *); + integer ia, ib, na, nc, im, in; + extern /* Subroutine */ int cchemm_(integer *, char *, char *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *); + integer ms, ns; + extern logical lceres_(char *, char *, integer *, integer *, complex *, + complex *, integer *); + extern /* Subroutine */ int ccsymm_(integer *, char *, char *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *); + real errmax; + integer laa, lbb, lda, lcc, ldb, ldc; + extern logical lce_(complex *, complex *, integer *); + integer ics; + complex als, bls; + integer icu; + real err; + +/* Tests CHEMM and CSYMM. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --idim; + --alf; + --bet; + --g; + --ct; + --cs; + --cc; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --bs; + --bb; + b_dim1 = *nmax; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --as; + --aa; + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + + /* Function Body */ + conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; + + nargs = 12; + nc = 0; + reset = TRUE_; + errmax = 0.f; + + i__1 = *nidim; + for (im = 1; im <= i__1; ++im) { + m = idim[im]; + + i__2 = *nidim; + for (in = 1; in <= i__2; ++in) { + n = idim[in]; +/* Set LDC to 1 more than minimum value if room. */ + ldc = m; + if (ldc < *nmax) { + ++ldc; + } +/* Skip tests if not enough room. */ + if (ldc > *nmax) { + goto L90; + } + lcc = ldc * n; + null = n <= 0 || m <= 0; +/* Set LDB to 1 more than minimum value if room. */ + ldb = m; + if (ldb < *nmax) { + ++ldb; + } +/* Skip tests if not enough room. */ + if (ldb > *nmax) { + goto L90; + } + lbb = ldb * n; + +/* Generate the matrix B. */ + + cmake_("ge", " ", " ", &m, &n, &b[b_offset], nmax, &bb[1], &ldb, & + reset, &c_b1); + + for (ics = 1; ics <= 2; ++ics) { + *(unsigned char *)side = *(unsigned char *)&ichs[ics - 1]; + left = *(unsigned char *)side == 'L'; + + if (left) { + na = m; + } else { + na = n; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = na; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L80; + } + laa = lda * na; + + for (icu = 1; icu <= 2; ++icu) { + *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1]; + +/* Generate the hermitian or symmetric matrix A. */ + + cmake_(sname + 7, uplo, " ", &na, &na, &a[a_offset], nmax, + &aa[1], &lda, &reset, &c_b1); + + i__3 = *nalf; + for (ia = 1; ia <= i__3; ++ia) { + i__4 = ia; + alpha.r = alf[i__4].r, alpha.i = alf[i__4].i; + + i__4 = *nbet; + for (ib = 1; ib <= i__4; ++ib) { + i__5 = ib; + beta.r = bet[i__5].r, beta.i = bet[i__5].i; + +/* Generate the matrix C. */ + + cmake_("ge", " ", " ", &m, &n, &c__[c_offset], + nmax, &cc[1], &ldc, &reset, &c_b1); + + ++nc; + +/* Save every datum before calling the */ +/* subroutine. */ + + *(unsigned char *)sides = *(unsigned char *)side; + *(unsigned char *)uplos = *(unsigned char *)uplo; + ms = m; + ns = n; + als.r = alpha.r, als.i = alpha.i; + i__5 = laa; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7] + .i; +/* L10: */ + } + ldas = lda; + i__5 = lbb; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + bs[i__6].r = bb[i__7].r, bs[i__6].i = bb[i__7] + .i; +/* L20: */ + } + ldbs = ldb; + bls.r = beta.r, bls.i = beta.i; + i__5 = lcc; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7] + .i; +/* L30: */ + } + ldcs = ldc; + +/* Call the subroutine. */ + + if (*trace) { + cprcn2_(ntra, &nc, sname, iorder, side, uplo, + &m, &n, &alpha, &lda, &ldb, &beta, & + ldc) + ; + } + if (*rewi) { +/* al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1);*/ + } + if (conj) { + cchemm_(iorder, side, uplo, &m, &n, &alpha, & + aa[1], &lda, &bb[1], &ldb, &beta, &cc[ + 1], &ldc); + } else { + ccsymm_(iorder, side, uplo, &m, &n, &alpha, & + aa[1], &lda, &bb[1], &ldb, &beta, &cc[ + 1], &ldc); + } + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); + *fatal = TRUE_; + goto L110; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)sides == *(unsigned + char *)side; + isame[1] = *(unsigned char *)uplos == *(unsigned + char *)uplo; + isame[2] = ms == m; + isame[3] = ns == n; + isame[4] = als.r == alpha.r && als.i == alpha.i; + isame[5] = lce_(&as[1], &aa[1], &laa); + isame[6] = ldas == lda; + isame[7] = lce_(&bs[1], &bb[1], &lbb); + isame[8] = ldbs == ldb; + isame[9] = bls.r == beta.r && bls.i == beta.i; + if (null) { + isame[10] = lce_(&cs[1], &cc[1], &lcc); + } else { + isame[10] = lceres_("ge", " ", &m, &n, &cs[1], + &cc[1], &ldc); + } + isame[11] = ldcs == ldc; + +/* If data was incorrectly changed, report and */ +/* return. */ + + same = TRUE_; + i__5 = nargs; + for (i__ = 1; i__ <= i__5; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); + } +/* L40: */ + } + if (! same) { + *fatal = TRUE_; + goto L110; + } + + if (! null) { + +/* Check the result. */ + + if (left) { + cmmch_("N", "N", &m, &n, &m, &alpha, &a[ + a_offset], nmax, &b[b_offset], + nmax, &beta, &c__[c_offset], nmax, + &ct[1], &g[1], &cc[1], &ldc, eps, + &err, fatal, nout, &c_true); + } else { + cmmch_("N", "N", &m, &n, &n, &alpha, &b[ + b_offset], nmax, &a[a_offset], + nmax, &beta, &c__[c_offset], nmax, + &ct[1], &g[1], &cc[1], &ldc, eps, + &err, fatal, nout, &c_true); + } + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L110; + } + } + +/* L50: */ + } + +/* L60: */ + } + +/* L70: */ + } + +L80: + ; + } + +L90: + ; + } + +/* L100: */ + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + if (*iorder == 1) { + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + } else { + if (*iorder == 0) { + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + if (*iorder == 1) { + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + } + goto L120; + +L110: + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); + cprcn2_(nout, &nc, sname, iorder, side, uplo, &m, &n, &alpha, &lda, &ldb, + &beta, &ldc); + +L120: + return 0; + +/* 9995 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, */ +/* $ ',', F4.1, '), C,', I3, ') .' ) */ + +/* End of CCHK2. */ + +} /* cchk2_ */ + + +/* Subroutine */ int cprcn2_(integer *nout, integer *nc, char *sname, integer + *iorder, char *side, char *uplo, integer *m, integer *n, complex * + alpha, integer *lda, integer *ldb, complex *beta, integer *ldc) +{ + /* Local variables */ + char cs[14], cu[14], crc[14]; + + if (*(unsigned char *)side == 'L') { + s_copy(cs, " CblasLeft", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cs, " CblasRight", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cs,cu); + printf("%d %d (%4.1f,%4.1f) , A, %d, B, %d, (%4.1f,%4.1f) , C, %d.\n",*m,*n,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc); + return 0; +} /* cprcn2_ */ + + +/* Subroutine */ int cchk3_(char *sname, real *eps, real *thresh, integer * + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + integer *nidim, integer *idim, integer *nalf, complex *alf, integer * + nmax, complex *a, complex *aa, complex *as, complex *b, complex *bb, + complex *bs, complex *ct, real *g, complex *c__, integer *iorder) +{ + /* Initialized data */ + + static char ichu[2] = "UL"; + static char icht[3] = "NTC"; + static char ichd[2] = "UN"; + static char ichs[2] = "LR"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4, i__5, i__6, i__7; + complex q__1; + + /* Local variables */ + char diag[1]; + integer ldas, ldbs; + logical same; + char side[1]; + logical left, null; + char uplo[1]; + integer i__, j, m, n; + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, logical *, + complex *); + complex alpha; + char diags[1]; + extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, complex *, real *, complex *, + integer *, real *, real *, logical *, integer *, logical *); + logical isame[13]; + char sides[1]; + integer nargs; + logical reset; + char uplos[1]; + extern /* Subroutine */ int cprcn3_(integer *, integer *, char *, integer + *, char *, char *, char *, char *, integer *, integer *, complex * + , integer *, integer *); + integer ia, na, nc, im, in, ms, ns; + extern logical lceres_(char *, char *, integer *, integer *, complex *, + complex *, integer *); + extern /* Subroutine */ int cctrmm_(integer *, char *, char *, char *, + char *, integer *, integer *, complex *, complex *, integer *, + complex *, integer *); + char tranas[1], transa[1]; + extern /* Subroutine */ int cctrsm_(integer *, char *, char *, char *, + char *, integer *, integer *, complex *, complex *, integer *, + complex *, integer *); + real errmax; + integer laa, icd, lbb, lda, ldb; + extern logical lce_(complex *, complex *, integer *); + integer ics; + complex als; + integer ict, icu; + real err; + +/* Tests CTRMM and CTRSM. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --idim; + --alf; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --g; + --ct; + --bs; + --bb; + b_dim1 = *nmax; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --as; + --aa; + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + + /* Function Body */ + + nargs = 11; + nc = 0; + reset = TRUE_; + errmax = 0.f; +/* Set up zero matrix for CMMCH. */ + i__1 = *nmax; + for (j = 1; j <= i__1; ++j) { + i__2 = *nmax; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * c_dim1; + c__[i__3].r = 0.f, c__[i__3].i = 0.f; +/* L10: */ + } +/* L20: */ + } + + i__1 = *nidim; + for (im = 1; im <= i__1; ++im) { + m = idim[im]; + + i__2 = *nidim; + for (in = 1; in <= i__2; ++in) { + n = idim[in]; +/* Set LDB to 1 more than minimum value if room. */ + ldb = m; + if (ldb < *nmax) { + ++ldb; + } +/* Skip tests if not enough room. */ + if (ldb > *nmax) { + goto L130; + } + lbb = ldb * n; + null = m <= 0 || n <= 0; + + for (ics = 1; ics <= 2; ++ics) { + *(unsigned char *)side = *(unsigned char *)&ichs[ics - 1]; + left = *(unsigned char *)side == 'L'; + if (left) { + na = m; + } else { + na = n; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = na; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L130; + } + laa = lda * na; + + for (icu = 1; icu <= 2; ++icu) { + *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1]; + + for (ict = 1; ict <= 3; ++ict) { + *(unsigned char *)transa = *(unsigned char *)&icht[ + ict - 1]; + + for (icd = 1; icd <= 2; ++icd) { + *(unsigned char *)diag = *(unsigned char *)&ichd[ + icd - 1]; + + i__3 = *nalf; + for (ia = 1; ia <= i__3; ++ia) { + i__4 = ia; + alpha.r = alf[i__4].r, alpha.i = alf[i__4].i; + +/* Generate the matrix A. */ + + cmake_("tr", uplo, diag, &na, &na, &a[ + a_offset], nmax, &aa[1], &lda, &reset, + &c_b1); + +/* Generate the matrix B. */ + + cmake_("ge", " ", " ", &m, &n, &b[b_offset], + nmax, &bb[1], &ldb, &reset, &c_b1); + + ++nc; + +/* Save every datum before calling the */ +/* subroutine. */ + + *(unsigned char *)sides = *(unsigned char *) + side; + *(unsigned char *)uplos = *(unsigned char *) + uplo; + *(unsigned char *)tranas = *(unsigned char *) + transa; + *(unsigned char *)diags = *(unsigned char *) + diag; + ms = m; + ns = n; + als.r = alpha.r, als.i = alpha.i; + i__4 = laa; + for (i__ = 1; i__ <= i__4; ++i__) { + i__5 = i__; + i__6 = i__; + as[i__5].r = aa[i__6].r, as[i__5].i = aa[ + i__6].i; +/* L30: */ + } + ldas = lda; + i__4 = lbb; + for (i__ = 1; i__ <= i__4; ++i__) { + i__5 = i__; + i__6 = i__; + bs[i__5].r = bb[i__6].r, bs[i__5].i = bb[ + i__6].i; +/* L40: */ + } + ldbs = ldb; + +/* Call the subroutine. */ + + if (s_cmp(sname + 9, "mm", (ftnlen)2, (ftnlen) + 2) == 0) { + if (*trace) { + cprcn3_(ntra, &nc, sname, iorder, + side, uplo, transa, diag, &m, + &n, &alpha, &lda, &ldb/*, ( + ftnlen)12, (ftnlen)1, (ftnlen) + 1, (ftnlen)1, (ftnlen)1*/); + } + if (*rewi) { +/* al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1);*/ + } + cctrmm_(iorder, side, uplo, transa, diag, + &m, &n, &alpha, &aa[1], &lda, &bb[ + 1], &ldb); + } else if (s_cmp(sname + 9, "sm", (ftnlen)2, ( + ftnlen)2) == 0) { + if (*trace) { + cprcn3_(ntra, &nc, sname, iorder, + side, uplo, transa, diag, &m, + &n, &alpha, &lda, &ldb/*, ( + ftnlen)12, (ftnlen)1, (ftnlen) + 1, (ftnlen)1, (ftnlen)1*/); + } + if (*rewi) { +/* al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1);*/ + } + cctrsm_(iorder, side, uplo, transa, diag, + &m, &n, &alpha, &aa[1], &lda, &bb[ + 1], &ldb); + } + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); + *fatal = TRUE_; + goto L150; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)sides == *( + unsigned char *)side; + isame[1] = *(unsigned char *)uplos == *( + unsigned char *)uplo; + isame[2] = *(unsigned char *)tranas == *( + unsigned char *)transa; + isame[3] = *(unsigned char *)diags == *( + unsigned char *)diag; + isame[4] = ms == m; + isame[5] = ns == n; + isame[6] = als.r == alpha.r && als.i == + alpha.i; + isame[7] = lce_(&as[1], &aa[1], &laa); + isame[8] = ldas == lda; + if (null) { + isame[9] = lce_(&bs[1], &bb[1], &lbb); + } else { + isame[9] = lceres_("ge", " ", &m, &n, &bs[ + 1], &bb[1], &ldb); + } + isame[10] = ldbs == ldb; + +/* If data was incorrectly changed, report and */ +/* return. */ + + same = TRUE_; + i__4 = nargs; + for (i__ = 1; i__ <= i__4; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); + } +/* L50: */ + } + if (! same) { + *fatal = TRUE_; + goto L150; + } + + if (! null) { + if (s_cmp(sname + 9, "mm", (ftnlen)2, ( + ftnlen)2) == 0) { + +/* Check the result. */ + + if (left) { + cmmch_(transa, "N", &m, &n, &m, & + alpha, &a[a_offset], nmax, + &b[b_offset], nmax, & + c_b1, &c__[c_offset], + nmax, &ct[1], &g[1], &bb[ + 1], &ldb, eps, &err, + fatal, nout, &c_true/*, ( + ftnlen)1, (ftnlen)1*/); + } else { + cmmch_("N", transa, &m, &n, &n, & + alpha, &b[b_offset], nmax, + &a[a_offset], nmax, & + c_b1, &c__[c_offset], + nmax, &ct[1], &g[1], &bb[ + 1], &ldb, eps, &err, + fatal, nout, &c_true); + } + } else if (s_cmp(sname + 9, "sm", (ftnlen) + 2, (ftnlen)2) == 0) { + +/* Compute approximation to original */ +/* matrix. */ + + i__4 = n; + for (j = 1; j <= i__4; ++j) { + i__5 = m; + for (i__ = 1; i__ <= i__5; ++i__) + { + i__6 = i__ + j * c_dim1; + i__7 = i__ + (j - 1) * ldb; + c__[i__6].r = bb[i__7].r, c__[i__6].i = bb[i__7].i; + i__6 = i__ + (j - 1) * ldb; + i__7 = i__ + j * b_dim1; + q__1.r = alpha.r * b[i__7].r - alpha.i * b[i__7].i, + q__1.i = alpha.r * b[i__7].i + alpha.i * b[ + i__7].r; + bb[i__6].r = q__1.r, bb[i__6].i = q__1.i; +/* L60: */ + } +/* L70: */ + } + + if (left) { + cmmch_(transa, "N", &m, &n, &m, & + c_b2, &a[a_offset], nmax, + &c__[c_offset], nmax, & + c_b1, &b[b_offset], nmax, + &ct[1], &g[1], &bb[1], & + ldb, eps, &err, fatal, + nout, &c_false); + } else { + cmmch_("N", transa, &m, &n, &n, & + c_b2, &c__[c_offset], + nmax, &a[a_offset], nmax, + &c_b1, &b[b_offset], nmax, + &ct[1], &g[1], &bb[1], & + ldb, eps, &err, fatal, + nout, &c_false); + } + } + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L150; + } + } + +/* L80: */ + } + +/* L90: */ + } + +/* L100: */ + } + +/* L110: */ + } + +/* L120: */ + } + +L130: + ; + } + +/* L140: */ + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + if (*iorder == 1) { + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + } else { + if (*iorder == 0) { + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + if (*iorder == 1) { + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + } + goto L160; + +L150: + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); + if (*trace) { + cprcn3_(ntra, &nc, sname, iorder, side, uplo, transa, diag, &m, &n, & + alpha, &lda, &ldb); + } + +L160: + return 0; + +/* 9995 FORMAT(1X, I6, ': ', A12,'(', 4( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ', */ +/* $ ' .' ) */ + +/* End of CCHK3. */ + +} /* cchk3_ */ + + +/* Subroutine */ int cprcn3_(integer *nout, integer *nc, char *sname, integer + *iorder, char *side, char *uplo, char *transa, char *diag, integer *m, + integer *n, complex *alpha, integer *lda, integer *ldb) +{ + /* Local variables */ + char ca[14], cd[14], cs[14], cu[14], crc[14]; + + if (*(unsigned char *)side == 'L') { + s_copy(cs, " CblasLeft", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cs, " CblasRight", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)diag == 'N') { + s_copy(cd, " CblasNonUnit", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cd, " CblasUnit", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cs,cu); + printf(" %s %s %d %d (%4.1f,%4.1f) A %d B %d\n",ca,cd,*m,*n,alpha->r,alpha->i,*lda,*ldb); + + return 0; +} /* cprcn3_ */ + + +/* Subroutine */ int cchk4_(char *sname, real *eps, real *thresh, integer * + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + integer *nidim, integer *idim, integer *nalf, complex *alf, integer * + nbet, complex *bet, integer *nmax, complex *a, complex *aa, complex * + as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc, + complex *cs, complex *ct, real *g, integer *iorder) +{ + /* Initialized data */ + + static char icht[2] = "NC"; + static char ichu[2] = "UL"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4, i__5, i__6, i__7; + complex q__1; + + /* Local variables */ + complex beta; + integer ldas, ldcs; + logical same, conj; + complex bets; + real rals; + logical tran, null; + char uplo[1]; + integer i__, j, k, n; + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, logical *, + complex *); + complex alpha; + extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, complex *, real *, complex *, + integer *, real *, real *, logical *, integer *, logical *); + real rbeta; + logical isame[13]; + integer nargs; + real rbets; + logical reset; + char trans[1]; + logical upper; + char uplos[1]; + extern /* Subroutine */ int cprcn4_(integer *, integer *, char *, integer + *, char *, char *, integer *, integer *, complex *, integer *, + complex *, integer *), cprcn6_(integer *, + integer *, char *, integer *, char *, char *, integer *, integer * + , real *, integer *, real *, integer *); + integer ia, ib, jc, ma, na, nc, ik, in, jj, lj, ks; + extern /* Subroutine */ int ccherk_(integer *, char *, char *, integer *, + integer *, real *, complex *, integer *, real *, complex *, + integer *); + integer ns; + real ralpha; + extern logical lceres_(char *, char *, integer *, integer *, complex *, + complex *, integer *); + real errmax; + extern /* Subroutine */ int ccsyrk_(integer *, char *, char *, integer *, + integer *, complex *, complex *, integer *, complex *, complex *, + integer *); + char transs[1], transt[1]; + integer laa, lda, lcc, ldc; + extern logical lce_(complex *, complex *, integer *); + complex als; + integer ict, icu; + real err; + +/* Tests CHERK and CSYRK. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --idim; + --alf; + --bet; + --g; + --ct; + --cs; + --cc; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --bs; + --bb; + b_dim1 = *nmax; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --as; + --aa; + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + + /* Function Body */ + conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; + + nargs = 10; + nc = 0; + reset = TRUE_; + errmax = 0.f; + rals = 1.f; + rbets = 1.f; + + i__1 = *nidim; + for (in = 1; in <= i__1; ++in) { + n = idim[in]; +/* Set LDC to 1 more than minimum value if room. */ + ldc = n; + if (ldc < *nmax) { + ++ldc; + } +/* Skip tests if not enough room. */ + if (ldc > *nmax) { + goto L100; + } + lcc = ldc * n; + + i__2 = *nidim; + for (ik = 1; ik <= i__2; ++ik) { + k = idim[ik]; + + for (ict = 1; ict <= 2; ++ict) { + *(unsigned char *)trans = *(unsigned char *)&icht[ict - 1]; + tran = *(unsigned char *)trans == 'C'; + if (tran && ! conj) { + *(unsigned char *)trans = 'T'; + } + if (tran) { + ma = k; + na = n; + } else { + ma = n; + na = k; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = ma; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L80; + } + laa = lda * na; + +/* Generate the matrix A. */ + + cmake_("ge", " ", " ", &ma, &na, &a[a_offset], nmax, &aa[1], & + lda, &reset, &c_b1); + + for (icu = 1; icu <= 2; ++icu) { + *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1]; + upper = *(unsigned char *)uplo == 'U'; + + i__3 = *nalf; + for (ia = 1; ia <= i__3; ++ia) { + i__4 = ia; + alpha.r = alf[i__4].r, alpha.i = alf[i__4].i; + if (conj) { + ralpha = alpha.r; + q__1.r = ralpha, q__1.i = 0.f; + alpha.r = q__1.r, alpha.i = q__1.i; + } + + i__4 = *nbet; + for (ib = 1; ib <= i__4; ++ib) { + i__5 = ib; + beta.r = bet[i__5].r, beta.i = bet[i__5].i; + if (conj) { + rbeta = beta.r; + q__1.r = rbeta, q__1.i = 0.f; + beta.r = q__1.r, beta.i = q__1.i; + } + null = n <= 0; + if (conj) { + null = null || ((k <= 0 || ralpha == 0.f) && + rbeta == 1.f); + } + +/* Generate the matrix C. */ + + cmake_(sname + 7, uplo, " ", &n, &n, &c__[ + c_offset], nmax, &cc[1], &ldc, &reset, & + c_b1); + + ++nc; + +/* Save every datum before calling the subroutine. */ + + *(unsigned char *)uplos = *(unsigned char *)uplo; + *(unsigned char *)transs = *(unsigned char *) + trans; + ns = n; + ks = k; + if (conj) { + rals = ralpha; + } else { + als.r = alpha.r, als.i = alpha.i; + } + i__5 = laa; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7] + .i; +/* L10: */ + } + ldas = lda; + if (conj) { + rbets = rbeta; + } else { + bets.r = beta.r, bets.i = beta.i; + } + i__5 = lcc; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7] + .i; +/* L20: */ + } + ldcs = ldc; + +/* Call the subroutine. */ + + if (conj) { + if (*trace) { + cprcn6_(ntra, &nc, sname, iorder, uplo, + trans, &n, &k, &ralpha, &lda, & + rbeta, &ldc); + } + if (*rewi) { +/* al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1);*/ + } + ccherk_(iorder, uplo, trans, &n, &k, &ralpha, + &aa[1], &lda, &rbeta, &cc[1], &ldc); + } else { + if (*trace) { + cprcn4_(ntra, &nc, sname, iorder, uplo, + trans, &n, &k, &alpha, &lda, & + beta, &ldc); + } + if (*rewi) { +/* al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1);*/ + } + ccsyrk_(iorder, uplo, trans, &n, &k, &alpha, & + aa[1], &lda, &beta, &cc[1], &ldc); + } + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); + *fatal = TRUE_; + goto L120; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)uplos == *(unsigned + char *)uplo; + isame[1] = *(unsigned char *)transs == *(unsigned + char *)trans; + isame[2] = ns == n; + isame[3] = ks == k; + if (conj) { + isame[4] = rals == ralpha; + } else { + isame[4] = als.r == alpha.r && als.i == + alpha.i; + } + isame[5] = lce_(&as[1], &aa[1], &laa); + isame[6] = ldas == lda; + if (conj) { + isame[7] = rbets == rbeta; + } else { + isame[7] = bets.r == beta.r && bets.i == + beta.i; + } + if (null) { + isame[8] = lce_(&cs[1], &cc[1], &lcc); + } else { + isame[8] = lceres_(sname + 7, uplo, &n, &n, & + cs[1], &cc[1], &ldc); + } + isame[9] = ldcs == ldc; + +/* If data was incorrectly changed, report and */ +/* return. */ + + same = TRUE_; + i__5 = nargs; + for (i__ = 1; i__ <= i__5; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); + } +/* L30: */ + } + if (! same) { + *fatal = TRUE_; + goto L120; + } + + if (! null) { + +/* Check the result column by column. */ + + if (conj) { + *(unsigned char *)transt = 'C'; + } else { + *(unsigned char *)transt = 'T'; + } + jc = 1; + i__5 = n; + for (j = 1; j <= i__5; ++j) { + if (upper) { + jj = 1; + lj = j; + } else { + jj = j; + lj = n - j + 1; + } + if (tran) { + cmmch_(transt, "N", &lj, &c__1, &k, & + alpha, &a[jj * a_dim1 + 1], + nmax, &a[j * a_dim1 + 1], + nmax, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, + fatal, nout, &c_true); + } else { + cmmch_("N", transt, &lj, &c__1, &k, & + alpha, &a[jj + a_dim1], nmax, + &a[j + a_dim1], nmax, &beta, & + c__[jj + j * c_dim1], nmax, & + ct[1], &g[1], &cc[jc], &ldc, + eps, &err, fatal, nout, & + c_true); + } + if (upper) { + jc += ldc; + } else { + jc = jc + ldc + 1; + } + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L110; + } +/* L40: */ + } + } + +/* L50: */ + } + +/* L60: */ + } + +/* L70: */ + } + +L80: + ; + } + +/* L90: */ + } + +L100: + ; + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + if (*iorder == 1) { + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + } else { + if (*iorder == 0) { + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + if (*iorder == 1) { + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + } + goto L130; + +L110: + if (n > 1) { + printf(" THESE ARE THE RESULTS FOR COLUMN %d:\n",j); + } + +L120: + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); + if (conj) { + cprcn6_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &ralpha, &lda, + &rbeta, &ldc); + } else { + cprcn4_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, & + beta, &ldc); + } + +L130: + return 0; + +/* 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') ', */ +/* $ ' .' ) */ +/* 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1, */ +/* $ '), C,', I3, ') .' ) */ + +/* End of CCHK4. */ + +} /* cchk4_ */ + + +/* Subroutine */ int cprcn4_(integer *nout, integer *nc, char *sname, integer + *iorder, char *uplo, char *transa, integer *n, integer *k, complex * + alpha, integer *lda, complex *beta, integer *ldc) +{ + /* Local variables */ + char ca[14], cu[14], crc[14]; + + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca); + printf("( %d %d (%4.1f,%4.1f) A %d (%4.1f,%4.1f) C %d\n",*n,*k,alpha->r,alpha->i,*lda,beta->r,beta->i,*ldc); + return 0; +} /* cprcn4_ */ + + + +/* Subroutine */ int cprcn6_(integer *nout, integer *nc, char *sname, integer + *iorder, char *uplo, char *transa, integer *n, integer *k, real * + alpha, integer *lda, real *beta, integer *ldc) +{ + /* Local variables */ + char ca[14], cu[14], crc[14]; + + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca); + printf("( %d %d %4.1f A %d %4.1f C %d\n",*n,*k,*alpha,*lda,*beta,*ldc); + return 0; +} /* cprcn6_ */ + + +/* Subroutine */ int cchk5_(char *sname, real *eps, real *thresh, integer * + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + integer *nidim, integer *idim, integer *nalf, complex *alf, integer * + nbet, complex *bet, integer *nmax, complex *ab, complex *aa, complex * + as, complex *bb, complex *bs, complex *c__, complex *cc, complex *cs, + complex *ct, real *g, complex *w, integer *iorder) +{ + /* Initialized data */ + + static char icht[2] = "NC"; + static char ichu[2] = "UL"; + + + /* System generated locals */ + integer c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8; + complex q__1, q__2; + + /* Local variables */ + integer jjab; + complex beta; + integer ldas, ldbs, ldcs; + logical same, conj; + complex bets; + logical tran, null; + char uplo[1]; + integer i__, j, k, n; + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, logical *, + complex *); + complex alpha; + extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, complex *, real *, complex *, + integer *, real *, real *, logical *, integer *, logical *); + real rbeta; + logical isame[13]; + integer nargs; + real rbets; + logical reset; + char trans[1]; + logical upper; + char uplos[1]; + extern /* Subroutine */ int cprcn5_(integer *, integer *, char *, integer + *, char *, char *, integer *, integer *, complex *, integer *, + integer *, complex *, integer *), cprcn7_( + integer *, integer *, char *, integer *, char *, char *, integer * + , integer *, complex *, integer *, integer *, real *, integer *); + integer ia, ib, jc, ma, na, nc, ik, in, jj, lj, ks, ns; + extern logical lceres_(char *, char *, integer *, integer *, complex *, + complex *, integer *); + real errmax; + char transs[1], transt[1]; + extern /* Subroutine */ int ccher2k_(integer *, char *, char *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + real *, complex *, integer *); + integer laa, lbb, lda, lcc, ldb, ldc; + extern logical lce_(complex *, complex *, integer *); + extern /* Subroutine */ int ccsyr2k_(integer *, char *, char *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *); + complex als; + integer ict, icu; + real err; + +/* Tests CHER2K and CSYR2K. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --idim; + --alf; + --bet; + --w; + --g; + --ct; + --cs; + --cc; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --bs; + --bb; + --as; + --aa; + --ab; + + /* Function Body */ + conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; + + nargs = 12; + nc = 0; + reset = TRUE_; + errmax = 0.f; + + i__1 = *nidim; + for (in = 1; in <= i__1; ++in) { + n = idim[in]; +/* Set LDC to 1 more than minimum value if room. */ + ldc = n; + if (ldc < *nmax) { + ++ldc; + } +/* Skip tests if not enough room. */ + if (ldc > *nmax) { + goto L130; + } + lcc = ldc * n; + + i__2 = *nidim; + for (ik = 1; ik <= i__2; ++ik) { + k = idim[ik]; + + for (ict = 1; ict <= 2; ++ict) { + *(unsigned char *)trans = *(unsigned char *)&icht[ict - 1]; + tran = *(unsigned char *)trans == 'C'; + if (tran && ! conj) { + *(unsigned char *)trans = 'T'; + } + if (tran) { + ma = k; + na = n; + } else { + ma = n; + na = k; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = ma; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L110; + } + laa = lda * na; + +/* Generate the matrix A. */ + + if (tran) { + i__3 = *nmax << 1; + cmake_("ge", " ", " ", &ma, &na, &ab[1], &i__3, &aa[1], & + lda, &reset, &c_b1); + } else { + cmake_("ge", " ", " ", &ma, &na, &ab[1], nmax, &aa[1], & + lda, &reset, &c_b1); + } + +/* Generate the matrix B. */ + + ldb = lda; + lbb = laa; + if (tran) { + i__3 = *nmax << 1; + cmake_("ge", " ", " ", &ma, &na, &ab[k + 1], &i__3, &bb[1] + , &ldb, &reset, &c_b1); + } else { + cmake_("ge", " ", " ", &ma, &na, &ab[k * *nmax + 1], nmax, + &bb[1], &ldb, &reset, &c_b1); + } + + for (icu = 1; icu <= 2; ++icu) { + *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1]; + upper = *(unsigned char *)uplo == 'U'; + + i__3 = *nalf; + for (ia = 1; ia <= i__3; ++ia) { + i__4 = ia; + alpha.r = alf[i__4].r, alpha.i = alf[i__4].i; + + i__4 = *nbet; + for (ib = 1; ib <= i__4; ++ib) { + i__5 = ib; + beta.r = bet[i__5].r, beta.i = bet[i__5].i; + if (conj) { + rbeta = beta.r; + q__1.r = rbeta, q__1.i = 0.f; + beta.r = q__1.r, beta.i = q__1.i; + } + null = n <= 0; + if (conj) { + null = null || ((k <= 0 || (alpha.r == 0.f && + alpha.i == 0.f)) && rbeta == 1.f); + } + +/* Generate the matrix C. */ + + cmake_(sname + 7, uplo, " ", &n, &n, &c__[ + c_offset], nmax, &cc[1], &ldc, &reset, & + c_b1); + + ++nc; + +/* Save every datum before calling the subroutine. */ + + *(unsigned char *)uplos = *(unsigned char *)uplo; + *(unsigned char *)transs = *(unsigned char *) + trans; + ns = n; + ks = k; + als.r = alpha.r, als.i = alpha.i; + i__5 = laa; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7] + .i; +/* L10: */ + } + ldas = lda; + i__5 = lbb; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + bs[i__6].r = bb[i__7].r, bs[i__6].i = bb[i__7] + .i; +/* L20: */ + } + ldbs = ldb; + if (conj) { + rbets = rbeta; + } else { + bets.r = beta.r, bets.i = beta.i; + } + i__5 = lcc; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7] + .i; +/* L30: */ + } + ldcs = ldc; + +/* Call the subroutine. */ + + if (conj) { + if (*trace) { + cprcn7_(ntra, &nc, sname, iorder, uplo, + trans, &n, &k, &alpha, &lda, &ldb, + &rbeta, &ldc); + } + if (*rewi) { +/* al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1);*/ + } + ccher2k_(iorder, uplo, trans, &n, &k, &alpha, + &aa[1], &lda, &bb[1], &ldb, &rbeta, & + cc[1], &ldc); + } else { + if (*trace) { + cprcn5_(ntra, &nc, sname, iorder, uplo, + trans, &n, &k, &alpha, &lda, &ldb, + &beta, &ldc); + } + if (*rewi) { +/* al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1);*/ + } + ccsyr2k_(iorder, uplo, trans, &n, &k, &alpha, + &aa[1], &lda, &bb[1], &ldb, &beta, & + cc[1], &ldc); + } + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); + *fatal = TRUE_; + goto L150; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)uplos == *(unsigned + char *)uplo; + isame[1] = *(unsigned char *)transs == *(unsigned + char *)trans; + isame[2] = ns == n; + isame[3] = ks == k; + isame[4] = als.r == alpha.r && als.i == alpha.i; + isame[5] = lce_(&as[1], &aa[1], &laa); + isame[6] = ldas == lda; + isame[7] = lce_(&bs[1], &bb[1], &lbb); + isame[8] = ldbs == ldb; + if (conj) { + isame[9] = rbets == rbeta; + } else { + isame[9] = bets.r == beta.r && bets.i == + beta.i; + } + if (null) { + isame[10] = lce_(&cs[1], &cc[1], &lcc); + } else { + isame[10] = lceres_("he", uplo, &n, &n, &cs[1] + , &cc[1], &ldc); + } + isame[11] = ldcs == ldc; + +/* If data was incorrectly changed, report and */ +/* return. */ + + same = TRUE_; + i__5 = nargs; + for (i__ = 1; i__ <= i__5; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); + } +/* L40: */ + } + if (! same) { + *fatal = TRUE_; + goto L150; + } + + if (! null) { + +/* Check the result column by column. */ + + if (conj) { + *(unsigned char *)transt = 'C'; + } else { + *(unsigned char *)transt = 'T'; + } + jjab = 1; + jc = 1; + i__5 = n; + for (j = 1; j <= i__5; ++j) { + if (upper) { + jj = 1; + lj = j; + } else { + jj = j; + lj = n - j + 1; + } + if (tran) { + i__6 = k; + for (i__ = 1; i__ <= i__6; ++i__) { + i__7 = i__; + i__8 = ((j - 1) << 1) * *nmax + k + + i__; + q__1.r = alpha.r * ab[i__8].r - + alpha.i * ab[i__8].i, + q__1.i = alpha.r * ab[ + i__8].i + alpha.i * ab[ + i__8].r; + w[i__7].r = q__1.r, w[i__7].i = + q__1.i; + if (conj) { + i__7 = k + i__; + r_cnjg(&q__2, &alpha); + i__8 = ((j - 1) << 1) * *nmax + i__; + q__1.r = q__2.r * ab[i__8].r - q__2.i * ab[i__8].i, + q__1.i = q__2.r * ab[i__8].i + q__2.i * ab[ + i__8].r; + w[i__7].r = q__1.r, w[i__7].i = q__1.i; + } else { + i__7 = k + i__; + i__8 = ((j - 1) << 1) * *nmax + i__; + q__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] + .i, q__1.i = alpha.r * ab[i__8].i + alpha.i + * ab[i__8].r; + w[i__7].r = q__1.r, w[i__7].i = q__1.i; + } +/* L50: */ + } + i__6 = k << 1; + i__7 = *nmax << 1; + i__8 = *nmax << 1; + cmmch_(transt, "N", &lj, &c__1, &i__6, + &c_b2, &ab[jjab], &i__7, &w[ + 1], &i__8, &beta, &c__[jj + j + * c_dim1], nmax, &ct[1], &g[1] + , &cc[jc], &ldc, eps, &err, + fatal, nout, &c_true); + } else { + i__6 = k; + for (i__ = 1; i__ <= i__6; ++i__) { + if (conj) { + i__7 = i__; + r_cnjg(&q__2, &ab[(k + i__ - 1) * *nmax + j]); + q__1.r = alpha.r * q__2.r - alpha.i * q__2.i, + q__1.i = alpha.r * q__2.i + alpha.i * + q__2.r; + w[i__7].r = q__1.r, w[i__7].i = q__1.i; + i__7 = k + i__; + i__8 = (i__ - 1) * *nmax + j; + q__2.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] + .i, q__2.i = alpha.r * ab[i__8].i + alpha.i + * ab[i__8].r; + r_cnjg(&q__1, &q__2); + w[i__7].r = q__1.r, w[i__7].i = q__1.i; + } else { + i__7 = i__; + i__8 = (k + i__ - 1) * *nmax + j; + q__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] + .i, q__1.i = alpha.r * ab[i__8].i + alpha.i + * ab[i__8].r; + w[i__7].r = q__1.r, w[i__7].i = q__1.i; + i__7 = k + i__; + i__8 = (i__ - 1) * *nmax + j; + q__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] + .i, q__1.i = alpha.r * ab[i__8].i + alpha.i + * ab[i__8].r; + w[i__7].r = q__1.r, w[i__7].i = q__1.i; + } +/* L60: */ + } + i__6 = k << 1; + i__7 = *nmax << 1; + cmmch_("N", "N", &lj, &c__1, &i__6, & + c_b2, &ab[jj], nmax, &w[1], & + i__7, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, + fatal, nout, &c_true); + } + if (upper) { + jc += ldc; + } else { + jc = jc + ldc + 1; + if (tran) { + jjab += *nmax << 1; + } + } + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L140; + } +/* L70: */ + } + } + +/* L80: */ + } + +/* L90: */ + } + +/* L100: */ + } + +L110: + ; + } + +/* L120: */ + } + +L130: + ; + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + if (*iorder == 1) { + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + } else { + if (*iorder == 0) { + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + if (*iorder == 1) { + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + } + goto L160; + +L140: + if (n > 1) { + printf(" THESE ARE THE RESULTS FOR COLUMN %d:\n",j); + } + +L150: + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); + if (conj) { + cprcn7_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, & + ldb, &rbeta, &ldc); + } else { + cprcn5_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, & + ldb, &beta, &ldc); + } + +L160: + return 0; + +/* 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, */ +/* $ ', C,', I3, ') .' ) */ +/* 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, */ +/* $ ',', F4.1, '), C,', I3, ') .' ) */ + +/* End of CCHK5. */ + +} /* cchk5_ */ + + +/* Subroutine */ int cprcn5_(integer *nout, integer *nc, char *sname, integer + *iorder, char *uplo, char *transa, integer *n, integer *k, complex * + alpha, integer *lda, integer *ldb, complex *beta, integer *ldc) +{ + + /* Local variables */ + char ca[14], cu[14], crc[14]; + + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca); + printf("%d %d (%4.1f,%4.1f) , A, %d, B, %d, (%4.1f,%4.1f) , C, %d.\n",*n,*k,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc); + return 0; +} /* cprcn5_ */ + + + +/* Subroutine */ int cprcn7_(integer *nout, integer *nc, char *sname, integer + *iorder, char *uplo, char *transa, integer *n, integer *k, complex * + alpha, integer *lda, integer *ldb, real *beta, integer *ldc) +{ + + /* Local variables */ + char ca[14], cu[14], crc[14]; + + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca); + printf("%d %d (%4.1f,%4.1f), A, %d, B, %d, %4.1f, C, %d.\n",*n,*k,alpha->r,alpha->i,*lda,*ldb,*beta,*ldc); + return 0; +} /* cprcn7_ */ + + +/* Subroutine */ int cmake_(char *type__, char *uplo, char *diag, integer *m, + integer *n, complex *a, integer *nmax, complex *aa, integer *lda, + logical *reset, complex *transl) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3, i__4; + real r__1; + complex q__1, q__2; + + /* Local variables */ + extern /* Complex */ VOID cbeg_(complex *, logical *); + integer ibeg, iend; + logical unit; + integer i__, j; + logical lower, upper; + integer jj; + logical gen, her, tri, sym; + + +/* Generates values for an M by N matrix A. */ +/* Stores the values in the array AA in the data structure required */ +/* by the routine, with unwanted elements set to rogue value. */ + +/* TYPE is 'ge', 'he', 'sy' or 'tr'. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + --aa; + + /* Function Body */ + gen = s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0; + her = s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0; + sym = s_cmp(type__, "sy", (ftnlen)2, (ftnlen)2) == 0; + tri = s_cmp(type__, "tr", (ftnlen)2, (ftnlen)2) == 0; + upper = (her || sym || tri) && *(unsigned char *)uplo == 'U'; + lower = (her || sym || tri) && *(unsigned char *)uplo == 'L'; + unit = tri && *(unsigned char *)diag == 'U'; + +/* Generate data in array A. */ + + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + if (gen || (upper && i__ <= j) || (lower && i__ >= j)) { + i__3 = i__ + j * a_dim1; + cbeg_(&q__2, reset); + q__1.r = q__2.r + transl->r, q__1.i = q__2.i + transl->i; + a[i__3].r = q__1.r, a[i__3].i = q__1.i; + if (i__ != j) { +/* Set some elements to zero */ + if (*n > 3 && j == *n / 2) { + i__3 = i__ + j * a_dim1; + a[i__3].r = 0.f, a[i__3].i = 0.f; + } + if (her) { + i__3 = j + i__ * a_dim1; + r_cnjg(&q__1, &a[i__ + j * a_dim1]); + a[i__3].r = q__1.r, a[i__3].i = q__1.i; + } else if (sym) { + i__3 = j + i__ * a_dim1; + i__4 = i__ + j * a_dim1; + a[i__3].r = a[i__4].r, a[i__3].i = a[i__4].i; + } else if (tri) { + i__3 = j + i__ * a_dim1; + a[i__3].r = 0.f, a[i__3].i = 0.f; + } + } + } +/* L10: */ + } + if (her) { + i__2 = j + j * a_dim1; + i__3 = j + j * a_dim1; + r__1 = a[i__3].r; + q__1.r = r__1, q__1.i = 0.f; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; + } + if (tri) { + i__2 = j + j * a_dim1; + i__3 = j + j * a_dim1; + q__1.r = a[i__3].r + 1.f, q__1.i = a[i__3].i + 0.f; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; + } + if (unit) { + i__2 = j + j * a_dim1; + a[i__2].r = 1.f, a[i__2].i = 0.f; + } +/* L20: */ + } + +/* Store elements in array AS in data structure required by routine. */ + + if (s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + i__4 = i__ + j * a_dim1; + aa[i__3].r = a[i__4].r, aa[i__3].i = a[i__4].i; +/* L30: */ + } + i__2 = *lda; + for (i__ = *m + 1; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + aa[i__3].r = -1e10f, aa[i__3].i = 1e10f; +/* L40: */ + } +/* L50: */ + } + } else if (s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__, + "sy", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__, "tr", (ftnlen) + 2, (ftnlen)2) == 0) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (upper) { + ibeg = 1; + if (unit) { + iend = j - 1; + } else { + iend = j; + } + } else { + if (unit) { + ibeg = j + 1; + } else { + ibeg = j; + } + iend = *n; + } + i__2 = ibeg - 1; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + aa[i__3].r = -1e10f, aa[i__3].i = 1e10f; +/* L60: */ + } + i__2 = iend; + for (i__ = ibeg; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + i__4 = i__ + j * a_dim1; + aa[i__3].r = a[i__4].r, aa[i__3].i = a[i__4].i; +/* L70: */ + } + i__2 = *lda; + for (i__ = iend + 1; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + aa[i__3].r = -1e10f, aa[i__3].i = 1e10f; +/* L80: */ + } + if (her) { + jj = j + (j - 1) * *lda; + i__2 = jj; + i__3 = jj; + r__1 = aa[i__3].r; + q__1.r = r__1, q__1.i = -1e10f; + aa[i__2].r = q__1.r, aa[i__2].i = q__1.i; + } +/* L90: */ + } + } + return 0; + +/* End of CMAKE. */ + +} /* cmake_ */ + +/* Subroutine */ int cmmch_(char *transa, char *transb, integer *m, integer * + n, integer *kk, complex *alpha, complex *a, integer *lda, complex *b, + integer *ldb, complex *beta, complex *c__, integer *ldc, complex *ct, + real *g, complex *cc, integer *ldcc, real *eps, real *err, logical * + fatal, integer *nout, logical *mv) +{ + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, + cc_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; + real r__1, r__2, r__3, r__4, r__5, r__6; + complex q__1, q__2, q__3, q__4; + + /* Local variables */ + real erri; + integer i__, j, k; + logical trana, tranb, ctrana, ctranb; + +/* Checks the results of the computational tests. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --ct; + --g; + cc_dim1 = *ldcc; + cc_offset = 1 + cc_dim1 * 1; + cc -= cc_offset; + + /* Function Body */ + trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == + 'C'; + tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == + 'C'; + ctrana = *(unsigned char *)transa == 'C'; + ctranb = *(unsigned char *)transb == 'C'; + +/* Compute expected result, one column at a time, in CT using data */ +/* in A, B and C. */ +/* Compute gauges in G. */ + + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__; + ct[i__3].r = 0.f, ct[i__3].i = 0.f; + g[i__] = 0.f; +/* L10: */ + } + if (! trana && ! tranb) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = i__ + k * a_dim1; + i__7 = k + j * b_dim1; + q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7].i, + q__2.i = a[i__6].r * b[i__7].i + a[i__6].i * b[ + i__7].r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = i__ + k * a_dim1; + i__5 = k + j * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = r_imag( + &a[i__ + k * a_dim1]), abs(r__2))) * ((r__3 = b[ + i__5].r, abs(r__3)) + (r__4 = r_imag(&b[k + j * + b_dim1]), abs(r__4))); +/* L20: */ + } +/* L30: */ + } + } else if (trana && ! tranb) { + if (ctrana) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + r_cnjg(&q__3, &a[k + i__ * a_dim1]); + i__6 = k + j * b_dim1; + q__2.r = q__3.r * b[i__6].r - q__3.i * b[i__6].i, + q__2.i = q__3.r * b[i__6].i + q__3.i * b[i__6] + .r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = k + i__ * a_dim1; + i__5 = k + j * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + r_imag(&a[k + i__ * a_dim1]), abs(r__2))) * (( + r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag( + &b[k + j * b_dim1]), abs(r__4))); +/* L40: */ + } +/* L50: */ + } + } else { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = k + i__ * a_dim1; + i__7 = k + j * b_dim1; + q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7] + .i, q__2.i = a[i__6].r * b[i__7].i + a[i__6] + .i * b[i__7].r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = k + i__ * a_dim1; + i__5 = k + j * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + r_imag(&a[k + i__ * a_dim1]), abs(r__2))) * (( + r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag( + &b[k + j * b_dim1]), abs(r__4))); +/* L60: */ + } +/* L70: */ + } + } + } else if (! trana && tranb) { + if (ctranb) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = i__ + k * a_dim1; + r_cnjg(&q__3, &b[j + k * b_dim1]); + q__2.r = a[i__6].r * q__3.r - a[i__6].i * q__3.i, + q__2.i = a[i__6].r * q__3.i + a[i__6].i * + q__3.r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = i__ + k * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + r_imag(&a[i__ + k * a_dim1]), abs(r__2))) * (( + r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag( + &b[j + k * b_dim1]), abs(r__4))); +/* L80: */ + } +/* L90: */ + } + } else { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = i__ + k * a_dim1; + i__7 = j + k * b_dim1; + q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7] + .i, q__2.i = a[i__6].r * b[i__7].i + a[i__6] + .i * b[i__7].r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = i__ + k * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + r_imag(&a[i__ + k * a_dim1]), abs(r__2))) * (( + r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag( + &b[j + k * b_dim1]), abs(r__4))); +/* L100: */ + } +/* L110: */ + } + } + } else if (trana && tranb) { + if (ctrana) { + if (ctranb) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + r_cnjg(&q__3, &a[k + i__ * a_dim1]); + r_cnjg(&q__4, &b[j + k * b_dim1]); + q__2.r = q__3.r * q__4.r - q__3.i * q__4.i, + q__2.i = q__3.r * q__4.i + q__3.i * + q__4.r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = k + i__ * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + r_imag(&a[k + i__ * a_dim1]), abs(r__2))) + * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 + = r_imag(&b[j + k * b_dim1]), abs(r__4))); +/* L120: */ + } +/* L130: */ + } + } else { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + r_cnjg(&q__3, &a[k + i__ * a_dim1]); + i__6 = j + k * b_dim1; + q__2.r = q__3.r * b[i__6].r - q__3.i * b[i__6].i, + q__2.i = q__3.r * b[i__6].i + q__3.i * b[ + i__6].r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = k + i__ * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + r_imag(&a[k + i__ * a_dim1]), abs(r__2))) + * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 + = r_imag(&b[j + k * b_dim1]), abs(r__4))); +/* L140: */ + } +/* L150: */ + } + } + } else { + if (ctranb) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = k + i__ * a_dim1; + r_cnjg(&q__3, &b[j + k * b_dim1]); + q__2.r = a[i__6].r * q__3.r - a[i__6].i * q__3.i, + q__2.i = a[i__6].r * q__3.i + a[i__6].i * + q__3.r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = k + i__ * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + r_imag(&a[k + i__ * a_dim1]), abs(r__2))) + * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 + = r_imag(&b[j + k * b_dim1]), abs(r__4))); +/* L160: */ + } +/* L170: */ + } + } else { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = k + i__ * a_dim1; + i__7 = j + k * b_dim1; + q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[ + i__7].i, q__2.i = a[i__6].r * b[i__7].i + + a[i__6].i * b[i__7].r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = k + i__ * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + r_imag(&a[k + i__ * a_dim1]), abs(r__2))) + * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 + = r_imag(&b[j + k * b_dim1]), abs(r__4))); +/* L180: */ + } +/* L190: */ + } + } + } + } + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__; + i__4 = i__; + q__2.r = alpha->r * ct[i__4].r - alpha->i * ct[i__4].i, q__2.i = + alpha->r * ct[i__4].i + alpha->i * ct[i__4].r; + i__5 = i__ + j * c_dim1; + q__3.r = beta->r * c__[i__5].r - beta->i * c__[i__5].i, q__3.i = + beta->r * c__[i__5].i + beta->i * c__[i__5].r; + q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i; + ct[i__3].r = q__1.r, ct[i__3].i = q__1.i; + i__3 = i__ + j * c_dim1; + g[i__] = ((r__1 = alpha->r, abs(r__1)) + (r__2 = r_imag(alpha), + abs(r__2))) * g[i__] + ((r__3 = beta->r, abs(r__3)) + ( + r__4 = r_imag(beta), abs(r__4))) * ((r__5 = c__[i__3].r, + abs(r__5)) + (r__6 = r_imag(&c__[i__ + j * c_dim1]), abs( + r__6))); +/* L200: */ + } + +/* Compute the error ratio for this result. */ + + *err = 0.f; + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__; + i__4 = i__ + j * cc_dim1; + q__2.r = ct[i__3].r - cc[i__4].r, q__2.i = ct[i__3].i - cc[i__4] + .i; + q__1.r = q__2.r, q__1.i = q__2.i; + erri = ((r__1 = q__1.r, abs(r__1)) + (r__2 = r_imag(&q__1), abs( + r__2))) / *eps; + if (g[i__] != 0.f) { + erri /= g[i__]; + } + *err = f2cmax(*err,erri); + if (*err * sqrt(*eps) >= 1.f) { + goto L230; + } +/* L210: */ + } + +/* L220: */ + } + +/* If the loop completes, all results are at least half accurate. */ + goto L250; + +/* Report fatal error. */ + +L230: + *fatal = TRUE_; + printf(" ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HALF ACCURATE *******\n"); + printf(" EXPECTED RESULT COMPUTED RESULT\n"); + i__1 = *m; + for (i__ = 1; i__ <= i__1; ++i__) { + if (*mv) { + printf("%7d (%15.6g,%15.6g) (%15.6g,%15.6g)\n",i__,ct[i__].r,ct[i__].i,cc[i__+j*cc_dim1].r,cc[i__+j*cc_dim1].i); + } else { + printf("%7d (%15.6g,%15.6g) (%15.6g,%15.6g)\n",i__,cc[i__+j*cc_dim1].r,cc[i__+j*cc_dim1].i,ct[i__].r,ct[i__].i); + } +/* L240: */ + } + if (*n > 1) { + printf(" THESE ARE THE RESULTS FOR COLUMN %d\n",j); + } + +L250: + return 0; + + +/* End of CMMCH. */ + +} /* cmmch_ */ + +logical lce_(complex *ri, complex *rj, integer *lr) +{ + /* System generated locals */ + integer i__1, i__2, i__3; + logical ret_val; + + /* Local variables */ + integer i__; + + +/* Tests if two arrays are identical. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --rj; + --ri; + + /* Function Body */ + i__1 = *lr; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + i__3 = i__; + if (ri[i__2].r != rj[i__3].r || ri[i__2].i != rj[i__3].i) { + goto L20; + } +/* L10: */ + } + ret_val = TRUE_; + goto L30; +L20: + ret_val = FALSE_; +L30: + return ret_val; + +/* End of LCE. */ + +} /* lce_ */ + +logical lceres_(char *type__, char *uplo, integer *m, integer *n, complex *aa, + complex *as, integer *lda) +{ + /* System generated locals */ + integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2, i__3, i__4; + logical ret_val; + + /* Local variables */ + integer ibeg, iend, i__, j; + logical upper; + + +/* Tests if selected elements in two arrays are equal. */ + +/* TYPE is 'ge' or 'he' or 'sy'. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + as_dim1 = *lda; + as_offset = 1 + as_dim1 * 1; + as -= as_offset; + aa_dim1 = *lda; + aa_offset = 1 + aa_dim1 * 1; + aa -= aa_offset; + + /* Function Body */ + upper = *(unsigned char *)uplo == 'U'; + if (s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *lda; + for (i__ = *m + 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * aa_dim1; + i__4 = i__ + j * as_dim1; + if (aa[i__3].r != as[i__4].r || aa[i__3].i != as[i__4].i) { + goto L70; + } +/* L10: */ + } +/* L20: */ + } + } else if (s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__, + "sy", (ftnlen)2, (ftnlen)2) == 0) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (upper) { + ibeg = 1; + iend = j; + } else { + ibeg = j; + iend = *n; + } + i__2 = ibeg - 1; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * aa_dim1; + i__4 = i__ + j * as_dim1; + if (aa[i__3].r != as[i__4].r || aa[i__3].i != as[i__4].i) { + goto L70; + } +/* L30: */ + } + i__2 = *lda; + for (i__ = iend + 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * aa_dim1; + i__4 = i__ + j * as_dim1; + if (aa[i__3].r != as[i__4].r || aa[i__3].i != as[i__4].i) { + goto L70; + } +/* L40: */ + } +/* L50: */ + } + } + +/* 60 CONTINUE */ + ret_val = TRUE_; + goto L80; +L70: + ret_val = FALSE_; +L80: + return ret_val; + +/* End of LCERES. */ + +} /* lceres_ */ + +/* Complex */ VOID cbeg_(complex * ret_val, logical *reset) +{ + /* System generated locals */ + real r__1, r__2; + complex q__1; + + /* Local variables */ + static integer i__, j, ic, mi, mj; + + +/* Generates complex numbers as pairs of random numbers uniformly */ +/* distributed between -0.5 and 0.5. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + if (*reset) { +/* Initialize local variables. */ + mi = 891; + mj = 457; + i__ = 7; + j = 7; + ic = 0; + *reset = FALSE_; + } + +/* The sequence of values of I or J is bounded between 1 and 999. */ +/* If initial I or J = 1,2,3,6,7 or 9, the period will be 50. */ +/* If initial I or J = 4 or 8, the period will be 25. */ +/* If initial I or J = 5, the period will be 10. */ +/* IC is used to break up the period by skipping 1 value of I or J */ +/* in 6. */ + + ++ic; +L10: + i__ *= mi; + j *= mj; + i__ -= i__ / 1000 * 1000; + j -= j / 1000 * 1000; + if (ic >= 5) { + ic = 0; + goto L10; + } + r__1 = (i__ - 500) / 1001.f; + r__2 = (j - 500) / 1001.f; + q__1.r = r__1, q__1.i = r__2; + ret_val->r = q__1.r, ret_val->i = q__1.i; + return ; + +/* End of CBEG. */ + +} /* cbeg_ */ + +real sdiff_(real *x, real *y) +{ + /* System generated locals */ + real ret_val; + + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + ret_val = *x - *y; + return ret_val; + +/* End of SDIFF. */ + +} /* sdiff_ */ + +/* Main program alias */ /*int cblat3_ () { MAIN__ (); return 0; }*/ diff --git a/ctest/c_zblat3c_3m.c b/ctest/c_zblat3c_3m.c new file mode 100644 index 000000000..0c76f11e7 --- /dev/null +++ b/ctest/c_zblat3c_3m.c @@ -0,0 +1,3951 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#include "common.h" + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 + + +/* Common Block Declarations */ + +struct { + integer infot, noutc; + logical ok, lerr; +} infoc_; + +#define infoc_1 infoc_ + +struct { + char srnamt[12]; +} srnamc_; + +#define srnamc_1 srnamc_ + +/* Table of constant values */ + +static doublecomplex c_b1 = {0.,0.}; +static doublecomplex c_b2 = {1.,0.}; +static integer c__1 = 1; +static integer c__65 = 65; +static doublereal c_b92 = 1.; +static integer c__6 = 6; +static logical c_true = TRUE_; +static integer c__0 = 0; +static logical c_false = FALSE_; + +/* Main program MAIN__() */ int main(void) +{ + /* Initialized data */ + + static char snames[9][13] = { "cblas_zgemm3m ", "cblas_zhemm ", "cblas_zsymm ", "cblas_ztrmm ", + "cblas_ztrsm ", "cblas_zherk ", "cblas_zsyrk ", "cblas_zher2k", "cblas_zsyr2k"}; + + /* System generated locals */ + integer i__1, i__2, i__3, i__4, i__5; + doublereal d__1; + + /* Builtin functions */ + integer s_rsle(void), do_lio(void), e_rsle(void), f_open(void), s_wsfe(void), do_fio(void), + e_wsfe(void), s_wsle(void), e_wsle(void), s_rsfe(void), e_rsfe(void); + + /* Local variables */ + static integer nalf, idim[9]; + static logical same; + static integer nbet, ntra; + static logical rewi; + extern /* Subroutine */ int zchk1_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ int zchk2_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ int zchk3_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, doublecomplex*, integer*, ftnlen); + extern /* Subroutine */ int zchk4_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ int zchk5_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, doublecomplex*, integer*, ftnlen); + static doublecomplex c__[4225] /* was [65][65] */; + static doublereal g[65]; + static integer i__, j; + extern doublereal ddiff_(doublereal*, doublereal*); + static integer n; + static logical fatal; + static doublecomplex w[130]; + static logical trace; + static integer nidim; + extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); + static char snaps[32]; + static integer isnum; + static logical ltest[9]; + static doublecomplex aa[4225], ab[8450] /* was [65][130] */, bb[4225], + cc[4225], as[4225], bs[4225], cs[4225], ct[65]; + static logical sfatal, corder; + static char snamet[12], transa[1], transb[1]; + static doublereal thresh; + static logical rorder; + static integer layout; + static logical ltestt, tsterr; + extern /* Subroutine */ int cz3chke_(char*, ftnlen); + static doublecomplex alf[7], bet[7]; + static doublereal eps, err; + extern logical lze_(doublecomplex*, doublecomplex*, integer*); + char tmpchar; + +/* Test program for the COMPLEX*16 Level 3 Blas. */ + +/* The program must be driven by a short data file. The first 13 records */ +/* of the file are read using list-directed input, the last 9 records */ +/* are read using the format ( A12,L2 ). An annotated example of a data */ +/* file can be obtained by deleting the first 3 characters from the */ +/* following 22 lines: */ +/* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE */ +/* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) */ +/* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. */ +/* F LOGICAL FLAG, T TO STOP ON FAILURES. */ +/* T LOGICAL FLAG, T TO TEST ERROR EXITS. */ +/* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH */ +/* 16.0 THRESHOLD VALUE OF TEST RATIO */ +/* 6 NUMBER OF VALUES OF N */ +/* 0 1 2 3 5 9 VALUES OF N */ +/* 3 NUMBER OF VALUES OF ALPHA */ +/* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA */ +/* 3 NUMBER OF VALUES OF BETA */ +/* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA */ +/* ZGEMM T PUT F FOR NO TEST. SAME COLUMNS. */ +/* ZHEMM T PUT F FOR NO TEST. SAME COLUMNS. */ +/* ZSYMM T PUT F FOR NO TEST. SAME COLUMNS. */ +/* ZTRMM T PUT F FOR NO TEST. SAME COLUMNS. */ +/* ZTRSM T PUT F FOR NO TEST. SAME COLUMNS. */ +/* ZHERK T PUT F FOR NO TEST. SAME COLUMNS. */ +/* ZSYRK T PUT F FOR NO TEST. SAME COLUMNS. */ +/* ZHER2K T PUT F FOR NO TEST. SAME COLUMNS. */ +/* ZSYR2K T PUT F FOR NO TEST. SAME COLUMNS. */ + +/* See: */ + +/* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. */ +/* A Set of Level 3 Basic Linear Algebra Subprograms. */ + +/* Technical Memorandum No.88 (Revision 1), Mathematics and */ +/* Computer Science Division, Argonne National Laboratory, 9700 */ +/* South Cass Avenue, Argonne, Illinois 60439, US. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + +/* .. Parameters .. */ +/* .. Local Scalars .. */ +/* .. Local Arrays .. */ +/* .. External Functions .. */ +/* .. External Subroutines .. */ +/* .. Intrinsic Functions .. */ +/* .. Scalars in Common .. */ +/* .. Common blocks .. */ +/* .. Data statements .. */ +/* .. Executable Statements .. */ + + infoc_1.noutc = 6; + +/* Read name and unit number for snapshot output file and open file. */ + + char line[80]; + + fgets(line,80,stdin); + sscanf(line,"'%s'",snaps); + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%ld",&ntra); +#else + sscanf(line,"%d",&ntra); +#endif + trace = ntra >= 0; + if (trace) { +/* o__1.oerr = 0; + o__1.ounit = ntra; + o__1.ofnmlen = 32; + o__1.ofnm = snaps; + o__1.orl = 0; + o__1.osta = "NEW"; + o__1.oacc = 0; + o__1.ofm = 0; + o__1.oblnk = 0; + f_open(&o__1);*/ + } +/* Read the flag that directs rewinding of the snapshot file. */ + fgets(line,80,stdin); + sscanf(line,"%d",&rewi); + rewi = rewi && trace; +/* Read the flag that directs stopping on any failure. */ + fgets(line,80,stdin); + sscanf(line,"%c",&tmpchar); + sfatal=FALSE_; + if (tmpchar=='T')sfatal=TRUE_; +/* Read the flag that indicates whether error exits are to be tested. */ + fgets(line,80,stdin); + sscanf(line,"%c",&tmpchar); + tsterr=FALSE_; + if (tmpchar=='T')tsterr=TRUE_; +/* Read the flag that indicates whether row-major data layout to be tested. */ + fgets(line,80,stdin); + sscanf(line,"%d",&layout); +/* Read the threshold value of the test ratio */ + fgets(line,80,stdin); + sscanf(line,"%lf",&thresh); + +/* Read and check the parameter values for the tests. */ + +/* Values of N */ + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%d",&nidim); +#else + sscanf(line,"%d",&nidim); +#endif + if (nidim < 1 || nidim > 9) { + fprintf(stderr,"NUMBER OF VALUES OF N IS LESS THAN 1 OR GREATER THAN 9"); + goto L220; + } + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%ld %ld %ld %ld %ld %ld %ld %ld %ld",&idim[0],&idim[1],&idim[2], + &idim[3],&idim[4],&idim[5],&idim[6],&idim[7],&idim[8]); +#else + sscanf(line,"%d %d %d %d %d %d %d %d %d",&idim[0],&idim[1],&idim[2], + &idim[3],&idim[4],&idim[5],&idim[6],&idim[7],&idim[8]); +#endif + i__1 = nidim; + for (i__ = 1; i__ <= i__1; ++i__) { + if (idim[i__ - 1] < 0 || idim[i__ - 1] > 65) { + fprintf(stderr,"VALUE OF N IS LESS THAN 0 OR GREATER THAN 65\n"); + goto L220; + } +/* L10: */ + } +/* Values of ALPHA */ + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%ld",&nalf); +#else + sscanf(line,"%d",&nalf); +#endif + if (nalf < 1 || nalf > 7) { + fprintf(stderr,"VALUE OF ALPHA IS LESS THAN 0 OR GREATER THAN 7\n"); + goto L220; + } + fgets(line,80,stdin); + sscanf(line,"(%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf)",&alf[0].r,&alf[0].i,&alf[1].r,&alf[1].i,&alf[2].r,&alf[2].i,&alf[3].r,&alf[3].i, + &alf[4].r,&alf[4].i,&alf[5].r,&alf[5].i,&alf[6].r,&alf[6].i); + +/* Values of BETA */ + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%ld",&nbet); +#else + sscanf(line,"%d",&nbet); +#endif + if (nalf < 1 || nbet > 7) { + fprintf(stderr,"VALUE OF BETA IS LESS THAN 0 OR GREATER THAN 7\n"); + goto L220; + } + fgets(line,80,stdin); + sscanf(line,"(%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf)",&bet[0].r,&bet[0].i,&bet[1].r,&bet[1].i,&bet[2].r,&bet[2].i,&bet[3].r,&bet[3].i, + &bet[4].r,&bet[4].i,&bet[5].r,&bet[5].i,&bet[6].r,&bet[6].i); + +/* Report values of parameters. */ + + printf("TESTS OF THE DOUBLE PRECISION COMPLEX LEVEL 3 BLAS\nTHE FOLLOWING PARAMETER VALUES WILL BE USED:\n"); + printf(" FOR N"); + for (i__ =1; i__ <=nidim;++i__) printf(" %d",idim[i__-1]); + printf("\n"); + printf(" FOR ALPHA"); + for (i__ =1; i__ <=nalf;++i__) printf(" (%lf,%lf)",alf[i__-1].r,alf[i__-1].i); + printf("\n"); + printf(" FOR BETA"); + for (i__ =1; i__ <=nbet;++i__) printf(" (%lf,%lf)",bet[i__-1].r,bet[i__-1].i); + printf("\n"); + + if (! tsterr) { + printf(" ERROR-EXITS WILL NOT BE TESTED\n"); + } + + printf("ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LESS THAN %lf\n",thresh); + rorder = FALSE_; + corder = FALSE_; + if (layout == 2) { + rorder = TRUE_; + corder = TRUE_; + printf("COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED\n"); + } else if (layout == 1) { + rorder = TRUE_; + printf("ROW-MAJOR DATA LAYOUT IS TESTED\n"); + } else if (layout == 0) { + corder = TRUE_; + printf("COLUMN-MAJOR DATA LAYOUT IS TESTED\n"); + } + +/* Read names of subroutines and flags which indicate */ +/* whether they are to be tested. */ + + for (i__ = 1; i__ <= 9; ++i__) { + ltest[i__ - 1] = FALSE_; +/* L20: */ + } +L30: + if (! fgets(line,80,stdin)) { + goto L60; + } + i__1 = sscanf(line,"%12c %c",snamet,&tmpchar); + ltestt=FALSE_; + if (tmpchar=='T')ltestt=TRUE_; + if (i__1 < 2) { + goto L60; + } + for (i__ = 1; i__ <= 9; ++i__) { + if (s_cmp(snamet, snames[i__ - 1] , (ftnlen)12, (ftnlen)12) == + 0) { + goto L50; + } +/* L40: */ + } + printf("SUBPROGRAM NAME %s NOT RECOGNIZED\n****** TESTS ABANDONED ******\n",snamet); + exit(1); +L50: + ltest[i__ - 1] = ltestt; + goto L30; + +L60: +/* cl__1.cerr = 0; + cl__1.cunit = 5; + cl__1.csta = 0; + f_clos(&cl__1);*/ + +/* Compute EPS (the machine precision). */ + + eps = 1.; +L70: + d__1 = eps + 1.; + if (ddiff_(&d__1, &c_b92) == 0.) { + goto L80; + } + eps *= .5; + goto L70; +L80: + eps += eps; + printf("RELATIVE MACHINE PRECISION IS TAKEN TO BE %9.1g\n",eps); + +/* Check the reliability of ZMMCH using exact data. */ + + n = 32; + i__1 = n; + for (j = 1; j <= i__1; ++j) { + i__2 = n; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * 65 - 66; +/* Computing MAX */ + i__5 = i__ - j + 1; + i__4 = f2cmax(i__5,0); + ab[i__3].r = (doublereal) i__4, ab[i__3].i = 0.; +/* L90: */ + } + i__2 = j + 4224; + ab[i__2].r = (doublereal) j, ab[i__2].i = 0.; + i__2 = (j + 65) * 65 - 65; + ab[i__2].r = (doublereal) j, ab[i__2].i = 0.; + i__2 = j - 1; + c__[i__2].r = 0., c__[i__2].i = 0.; +/* L100: */ + } + i__1 = n; + for (j = 1; j <= i__1; ++j) { + i__2 = j - 1; + i__3 = j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 1) / 3; + cc[i__2].r = (doublereal) i__3, cc[i__2].i = 0.; +/* L110: */ + } +/* CC holds the exact result. On exit from ZMMCH CT holds */ +/* the result computed by ZMMCH. */ + *(unsigned char *)transa = 'N'; + *(unsigned char *)transb = 'N'; + zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + &c__6, &c_true, (ftnlen)1, (ftnlen)1); + same = lze_(cc, ct, &n); + if (! same || err != 0.) { + printf("ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n"); + printf("ZMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb); + printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err); + printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n"); + printf("****** TESTS ABANDONED ******\n"); + exit(1); + } + *(unsigned char *)transb = 'C'; + zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + &c__6, &c_true, (ftnlen)1, (ftnlen)1); + same = lze_(cc, ct, &n); + if (! same || err != 0.) { + printf("ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n"); + printf("ZMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb); + printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err); + printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n"); + printf("****** TESTS ABANDONED ******\n"); + exit(1); + } + i__1 = n; + for (j = 1; j <= i__1; ++j) { + i__2 = j + 4224; + i__3 = n - j + 1; + ab[i__2].r = (doublereal) i__3, ab[i__2].i = 0.; + i__2 = (j + 65) * 65 - 65; + i__3 = n - j + 1; + ab[i__2].r = (doublereal) i__3, ab[i__2].i = 0.; +/* L120: */ + } + i__1 = n; + for (j = 1; j <= i__1; ++j) { + i__2 = n - j; + i__3 = j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 1) / 3; + cc[i__2].r = (doublereal) i__3, cc[i__2].i = 0.; +/* L130: */ + } + *(unsigned char *)transa = 'C'; + *(unsigned char *)transb = 'N'; + zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + &c__6, &c_true, (ftnlen)1, (ftnlen)1); + same = lze_(cc, ct, &n); + if (! same || err != 0.) { + printf("ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n"); + printf("ZMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb); + printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err); + printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n"); + printf("****** TESTS ABANDONED ******\n"); + exit(1); + } + *(unsigned char *)transb = 'C'; + zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + &c__6, &c_true, (ftnlen)1, (ftnlen)1); + same = lze_(cc, ct, &n); + if (! same || err != 0.) { + printf("ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n"); + printf("ZMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb); + printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err); + printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n"); + printf("****** TESTS ABANDONED ******\n"); + exit(1); + } + +/* Test each subroutine in turn. */ + + for (isnum = 1; isnum <= 9; ++isnum) { + if (! ltest[isnum - 1]) { +/* Subprogram is not to be tested. */ + printf("%12s WAS NOT TESTED\n",snames[isnum-1]); + } else { + s_copy(srnamc_1.srnamt, snames[isnum - 1], (ftnlen)12, ( + ftnlen)12); +/* Test error exits. */ + if (tsterr) { + cz3chke_(snames[isnum - 1], (ftnlen)12); + } +/* Test computations. */ + infoc_1.infot = 0; + infoc_1.ok = TRUE_; + fatal = FALSE_; + switch ((int)isnum) { + case 1: goto L140; + case 2: goto L150; + case 3: goto L150; + case 4: goto L160; + case 5: goto L160; + case 6: goto L170; + case 7: goto L170; + case 8: goto L180; + case 9: goto L180; + } +/* Test ZGEMM, 01. */ +L140: + if (corder) { + zchk1_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__0, (ftnlen)12); + } + if (rorder) { + zchk1_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__1, (ftnlen)12); + } + goto L190; +/* Test ZHEMM, 02, ZSYMM, 03. */ +L150: + if (corder) { + zchk2_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__0, (ftnlen)12); + } + if (rorder) { + zchk2_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__1, (ftnlen)12); + } + goto L190; +/* Test ZTRMM, 04, ZTRSM, 05. */ +L160: + if (corder) { + zchk3_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, & + c__0, (ftnlen)12); + } + if (rorder) { + zchk3_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, & + c__1, (ftnlen)12); + } + goto L190; +/* Test ZHERK, 06, ZSYRK, 07. */ +L170: + if (corder) { + zchk4_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__0, (ftnlen)12); + } + if (rorder) { + zchk4_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__1, (ftnlen)12); + } + goto L190; +/* Test ZHER2K, 08, ZSYR2K, 09. */ +L180: + if (corder) { + zchk5_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, + ct, g, w, &c__0, (ftnlen)12); + } + if (rorder) { + zchk5_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, + ct, g, w, &c__1, (ftnlen)12); + } + goto L190; + +L190: + if (fatal && sfatal) { + goto L210; + } + } +/* L200: */ + } + printf("\nEND OF TESTS\n"); + goto L230; + +L210: + printf("\n****** FATAL ERROR - TESTS ABANDONED ******\n"); + goto L230; + +L220: + printf("AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM\n"); + printf("****** TESTS ABANDONED ******\n"); + +L230: + if (trace) { +/* cl__1.cerr = 0; + cl__1.cunit = ntra; + cl__1.csta = 0; + f_clos(&cl__1);*/ + } +/* cl__1.cerr = 0; + cl__1.cunit = 6; + cl__1.csta = 0; + f_clos(&cl__1);*/ + exit(0); + +/* End of ZBLAT3. */ + +} /* MAIN__ */ + +/* Subroutine */ int zchk1_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, integer* iorder, ftnlen sname_len) +{ + /* Initialized data */ + + static char ich[3+1] = "NTC"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4, i__5, i__6, i__7, i__8; + + /* Local variables */ + static doublecomplex beta; + static integer ldas, ldbs, ldcs; + static logical same, null; + static integer i__, k, m, n; + static doublecomplex alpha; + static logical isame[13], trana, tranb; + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); + static integer nargs; + extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); + static logical reset; + static integer ia, ib; + extern /* Subroutine */ int zprcn1_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, integer*, doublecomplex*, integer*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); + static integer ma, mb, na, nb, nc, ik, im, in, ks, ms, ns; + extern /* Subroutine */ void czgemm3m_(integer*, char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + static char tranas[1], tranbs[1], transa[1], transb[1]; + static doublereal errmax; + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + static integer ica, icb, laa, lbb, lda, lcc, ldb, ldc; + static doublecomplex als, bls; + static doublereal err; + extern logical lze_(doublecomplex*, doublecomplex*, integer*); + +/* Tests ZGEMM. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + +/* .. Parameters .. */ +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. Local Arrays .. */ +/* .. External Functions .. */ +/* .. External Subroutines .. */ +/* .. Intrinsic Functions .. */ +/* .. Scalars in Common .. */ +/* .. Common blocks .. */ +/* .. Data statements .. */ + /* Parameter adjustments */ + --idim; + --alf; + --bet; + --g; + --ct; + --cs; + --cc; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --bs; + --bb; + b_dim1 = *nmax; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --as; + --aa; + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + + /* Function Body */ +/* .. Executable Statements .. */ + + nargs = 13; + nc = 0; + reset = TRUE_; + errmax = 0.; + + i__1 = *nidim; + for (im = 1; im <= i__1; ++im) { + m = idim[im]; + + i__2 = *nidim; + for (in = 1; in <= i__2; ++in) { + n = idim[in]; +/* Set LDC to 1 more than minimum value if room. */ + ldc = m; + if (ldc < *nmax) { + ++ldc; + } +/* Skip tests if not enough room. */ + if (ldc > *nmax) { + goto L100; + } + lcc = ldc * n; + null = n <= 0 || m <= 0; + + i__3 = *nidim; + for (ik = 1; ik <= i__3; ++ik) { + k = idim[ik]; + + for (ica = 1; ica <= 3; ++ica) { + *(unsigned char *)transa = *(unsigned char *)&ich[ica - 1] + ; + trana = *(unsigned char *)transa == 'T' || *(unsigned + char *)transa == 'C'; + + if (trana) { + ma = k; + na = m; + } else { + ma = m; + na = k; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = ma; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L80; + } + laa = lda * na; + +/* Generate the matrix A. */ + + zmake_("ge", " ", " ", &ma, &na, &a[a_offset], nmax, &aa[ + 1], &lda, &reset, &c_b1, (ftnlen)2, (ftnlen)1, ( + ftnlen)1); + + for (icb = 1; icb <= 3; ++icb) { + *(unsigned char *)transb = *(unsigned char *)&ich[icb + - 1]; + tranb = *(unsigned char *)transb == 'T' || *(unsigned + char *)transb == 'C'; + + if (tranb) { + mb = n; + nb = k; + } else { + mb = k; + nb = n; + } +/* Set LDB to 1 more than minimum value if room. */ + ldb = mb; + if (ldb < *nmax) { + ++ldb; + } +/* Skip tests if not enough room. */ + if (ldb > *nmax) { + goto L70; + } + lbb = ldb * nb; + +/* Generate the matrix B. */ + + zmake_("ge", " ", " ", &mb, &nb, &b[b_offset], nmax, & + bb[1], &ldb, &reset, &c_b1, (ftnlen)2, ( + ftnlen)1, (ftnlen)1); + + i__4 = *nalf; + for (ia = 1; ia <= i__4; ++ia) { + i__5 = ia; + alpha.r = alf[i__5].r, alpha.i = alf[i__5].i; + + i__5 = *nbet; + for (ib = 1; ib <= i__5; ++ib) { + i__6 = ib; + beta.r = bet[i__6].r, beta.i = bet[i__6].i; + +/* Generate the matrix C. */ + + zmake_("ge", " ", " ", &m, &n, &c__[c_offset], + nmax, &cc[1], &ldc, &reset, &c_b1, ( + ftnlen)2, (ftnlen)1, (ftnlen)1); + + ++nc; + +/* Save every datum before calling the */ +/* subroutine. */ + + *(unsigned char *)tranas = *(unsigned char *) + transa; + *(unsigned char *)tranbs = *(unsigned char *) + transb; + ms = m; + ns = n; + ks = k; + als.r = alpha.r, als.i = alpha.i; + i__6 = laa; + for (i__ = 1; i__ <= i__6; ++i__) { + i__7 = i__; + i__8 = i__; + as[i__7].r = aa[i__8].r, as[i__7].i = aa[ + i__8].i; +/* L10: */ + } + ldas = lda; + i__6 = lbb; + for (i__ = 1; i__ <= i__6; ++i__) { + i__7 = i__; + i__8 = i__; + bs[i__7].r = bb[i__8].r, bs[i__7].i = bb[ + i__8].i; +/* L20: */ + } + ldbs = ldb; + bls.r = beta.r, bls.i = beta.i; + i__6 = lcc; + for (i__ = 1; i__ <= i__6; ++i__) { + i__7 = i__; + i__8 = i__; + cs[i__7].r = cc[i__8].r, cs[i__7].i = cc[ + i__8].i; +/* L30: */ + } + ldcs = ldc; + +/* Call the subroutine. */ + + if (*trace) { + zprcn1_(ntra, &nc, sname, iorder, transa, + transb, &m, &n, &k, &alpha, &lda, + &ldb, &beta, &ldc, (ftnlen)12, ( + ftnlen)1, (ftnlen)1); + } + if (*rewi) { +/* al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1);*/ + } + czgemm3m_(iorder, transa, transb, &m, &n, &k, & + alpha, &aa[1], &lda, &bb[1], &ldb, & + beta, &cc[1], &ldc, (ftnlen)1, ( + ftnlen)1); + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + printf(" *** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); + *fatal = TRUE_; + goto L120; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)transa == *( + unsigned char *)tranas; + isame[1] = *(unsigned char *)transb == *( + unsigned char *)tranbs; + isame[2] = ms == m; + isame[3] = ns == n; + isame[4] = ks == k; + isame[5] = als.r == alpha.r && als.i == + alpha.i; + isame[6] = lze_(&as[1], &aa[1], &laa); + isame[7] = ldas == lda; + isame[8] = lze_(&bs[1], &bb[1], &lbb); + isame[9] = ldbs == ldb; + isame[10] = bls.r == beta.r && bls.i == + beta.i; + if (null) { + isame[11] = lze_(&cs[1], &cc[1], &lcc); + } else { + isame[11] = lzeres_("ge", " ", &m, &n, & + cs[1], &cc[1], &ldc, (ftnlen)2, ( + ftnlen)1); + } + isame[12] = ldcs == ldc; + +/* If data was incorrectly changed, report */ +/* and return. */ + + same = TRUE_; + i__6 = nargs; + for (i__ = 1; i__ <= i__6; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); + } +/* L40: */ + } + if (! same) { + *fatal = TRUE_; + goto L120; + } + + if (! null) { + +/* Check the result. */ + + zmmch_(transa, transb, &m, &n, &k, &alpha, + &a[a_offset], nmax, &b[b_offset], + nmax, &beta, &c__[c_offset], + nmax, &ct[1], &g[1], &cc[1], &ldc, + eps, &err, fatal, nout, &c_true, + (ftnlen)1, (ftnlen)1); + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L120; + } + } + +/* L50: */ + } + +/* L60: */ + } + +L70: + ; + } + +L80: + ; + } + +/* L90: */ + } + +L100: + ; + } + +/* L110: */ + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + if (*iorder == 1) { + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + } else { + if (*iorder == 0) { + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + if (*iorder == 1) { + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + } + goto L130; + +L120: + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); + zprcn1_(nout, &nc, sname, iorder, transa, transb, &m, &n, &k, &alpha, & + lda, &ldb, &beta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1); + +L130: + return 0; + +/* 9995 FORMAT( 1X, I6, ': ', A12,'(''', A1, ''',''', A1, ''',', */ +/* $ 3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, */ +/* $ ',(', F4.1, ',', F4.1, '), C,', I3, ').' ) */ + +/* End of ZCHK1. */ + +} /* zchk1_ */ + + +/* Subroutine */ int zprcn1_(integer* nout, integer* nc, char* sname, integer* iorder, char* transa, char* transb, integer* m, integer* n, integer* k, doublecomplex* alpha, integer* lda, integer* ldb, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen transa_len, ftnlen transb_len) +{ + /* Local variables */ + static char crc[14], cta[14], ctb[14]; + + if (*(unsigned char *)transa == 'N') { + s_copy(cta, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(cta, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cta, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transb == 'N') { + s_copy(ctb, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transb == 'T') { + s_copy(ctb, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ctb, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cta,ctb); + printf("%d %d %d (%4.1lf,%4.1lf) , A, %d, B, %d, (%4.1lf,%4.1lf) , C, %d.\n",*m,*n,*k,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc); + +return 0; +} /* zprcn1_ */ + + +/* Subroutine */ int zchk2_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, integer* iorder, ftnlen sname_len) +{ + /* Initialized data */ + + static char ichs[2+1] = "LR"; + static char ichu[2+1] = "UL"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4, i__5, i__6, i__7; + + /* Local variables */ + static doublecomplex beta; + static integer ldas, ldbs, ldcs; + static logical same; + static char side[1]; + static logical isconj, left, null; + static char uplo[1]; + static integer i__, m, n; + static doublecomplex alpha; + static logical isame[13]; + static char sides[1]; + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); + static integer nargs; + extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); + static logical reset; + static char uplos[1]; + static integer ia, ib; + extern /* Subroutine */ int zprcn2_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); + static integer na, nc, im, in, ms, ns; + extern /* Subroutine */ void czhemm_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + static doublereal errmax; + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + extern /* Subroutine */ void czsymm_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + static integer laa, lbb, lda, lcc, ldb, ldc, ics; + static doublecomplex als, bls; + static integer icu; + static doublereal err; + extern logical lze_(doublecomplex*, doublecomplex*, integer*); + +/* Tests ZHEMM and ZSYMM. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + +/* .. Parameters .. */ +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. Local Arrays .. */ +/* .. External Functions .. */ +/* .. External Subroutines .. */ +/* .. Intrinsic Functions .. */ +/* .. Scalars in Common .. */ +/* .. Common blocks .. */ +/* .. Data statements .. */ + /* Parameter adjustments */ + --idim; + --alf; + --bet; + --g; + --ct; + --cs; + --cc; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --bs; + --bb; + b_dim1 = *nmax; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --as; + --aa; + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + + /* Function Body */ +/* .. Executable Statements .. */ + isconj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; + + nargs = 12; + nc = 0; + reset = TRUE_; + errmax = 0.; + + i__1 = *nidim; + for (im = 1; im <= i__1; ++im) { + m = idim[im]; + + i__2 = *nidim; + for (in = 1; in <= i__2; ++in) { + n = idim[in]; +/* Set LDC to 1 more than minimum value if room. */ + ldc = m; + if (ldc < *nmax) { + ++ldc; + } +/* Skip tests if not enough room. */ + if (ldc > *nmax) { + goto L90; + } + lcc = ldc * n; + null = n <= 0 || m <= 0; +/* Set LDB to 1 more than minimum value if room. */ + ldb = m; + if (ldb < *nmax) { + ++ldb; + } +/* Skip tests if not enough room. */ + if (ldb > *nmax) { + goto L90; + } + lbb = ldb * n; + +/* Generate the matrix B. */ + + zmake_("ge", " ", " ", &m, &n, &b[b_offset], nmax, &bb[1], &ldb, & + reset, &c_b1, (ftnlen)2, (ftnlen)1, (ftnlen)1); + + for (ics = 1; ics <= 2; ++ics) { + *(unsigned char *)side = *(unsigned char *)&ichs[ics - 1]; + left = *(unsigned char *)side == 'L'; + + if (left) { + na = m; + } else { + na = n; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = na; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L80; + } + laa = lda * na; + + for (icu = 1; icu <= 2; ++icu) { + *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1]; + +/* Generate the hermitian or symmetric matrix A. */ + + zmake_(sname + 7, uplo, " ", &na, &na, &a[a_offset], nmax, + &aa[1], &lda, &reset, &c_b1, (ftnlen)2, (ftnlen) + 1, (ftnlen)1); + + i__3 = *nalf; + for (ia = 1; ia <= i__3; ++ia) { + i__4 = ia; + alpha.r = alf[i__4].r, alpha.i = alf[i__4].i; + + i__4 = *nbet; + for (ib = 1; ib <= i__4; ++ib) { + i__5 = ib; + beta.r = bet[i__5].r, beta.i = bet[i__5].i; + +/* Generate the matrix C. */ + + zmake_("ge", " ", " ", &m, &n, &c__[c_offset], + nmax, &cc[1], &ldc, &reset, &c_b1, ( + ftnlen)2, (ftnlen)1, (ftnlen)1); + + ++nc; + +/* Save every datum before calling the */ +/* subroutine. */ + + *(unsigned char *)sides = *(unsigned char *)side; + *(unsigned char *)uplos = *(unsigned char *)uplo; + ms = m; + ns = n; + als.r = alpha.r, als.i = alpha.i; + i__5 = laa; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7] + .i; +/* L10: */ + } + ldas = lda; + i__5 = lbb; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + bs[i__6].r = bb[i__7].r, bs[i__6].i = bb[i__7] + .i; +/* L20: */ + } + ldbs = ldb; + bls.r = beta.r, bls.i = beta.i; + i__5 = lcc; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7] + .i; +/* L30: */ + } + ldcs = ldc; + +/* Call the subroutine. */ + + if (*trace) { + zprcn2_(ntra, &nc, sname, iorder, side, uplo, + &m, &n, &alpha, &lda, &ldb, &beta, & + ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1) + ; + } + if (*rewi) { +/* al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1);*/ + } + if (isconj) { + czhemm_(iorder, side, uplo, &m, &n, &alpha, & + aa[1], &lda, &bb[1], &ldb, &beta, &cc[ + 1], &ldc, (ftnlen)1, (ftnlen)1); + } else { + czsymm_(iorder, side, uplo, &m, &n, &alpha, & + aa[1], &lda, &bb[1], &ldb, &beta, &cc[ + 1], &ldc, (ftnlen)1, (ftnlen)1); + } + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); + *fatal = TRUE_; + goto L110; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)sides == *(unsigned + char *)side; + isame[1] = *(unsigned char *)uplos == *(unsigned + char *)uplo; + isame[2] = ms == m; + isame[3] = ns == n; + isame[4] = als.r == alpha.r && als.i == alpha.i; + isame[5] = lze_(&as[1], &aa[1], &laa); + isame[6] = ldas == lda; + isame[7] = lze_(&bs[1], &bb[1], &lbb); + isame[8] = ldbs == ldb; + isame[9] = bls.r == beta.r && bls.i == beta.i; + if (null) { + isame[10] = lze_(&cs[1], &cc[1], &lcc); + } else { + isame[10] = lzeres_("ge", " ", &m, &n, &cs[1], + &cc[1], &ldc, (ftnlen)2, (ftnlen)1); + } + isame[11] = ldcs == ldc; + +/* If data was incorrectly changed, report and */ +/* return. */ + + same = TRUE_; + i__5 = nargs; + for (i__ = 1; i__ <= i__5; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); + } +/* L40: */ + } + if (! same) { + *fatal = TRUE_; + goto L110; + } + + if (! null) { + +/* Check the result. */ + + if (left) { + zmmch_("N", "N", &m, &n, &m, &alpha, &a[ + a_offset], nmax, &b[b_offset], + nmax, &beta, &c__[c_offset], nmax, + &ct[1], &g[1], &cc[1], &ldc, eps, + &err, fatal, nout, &c_true, ( + ftnlen)1, (ftnlen)1); + } else { + zmmch_("N", "N", &m, &n, &n, &alpha, &b[ + b_offset], nmax, &a[a_offset], + nmax, &beta, &c__[c_offset], nmax, + &ct[1], &g[1], &cc[1], &ldc, eps, + &err, fatal, nout, &c_true, ( + ftnlen)1, (ftnlen)1); + } + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L110; + } + } + +/* L50: */ + } + +/* L60: */ + } + +/* L70: */ + } + +L80: + ; + } + +L90: + ; + } + +/* L100: */ + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + if (*iorder == 1) { + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + } else { + if (*iorder == 0) { + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + if (*iorder == 1) { + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + } + goto L120; + +L110: + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); + zprcn2_(nout, &nc, sname, iorder, side, uplo, &m, &n, &alpha, &lda, &ldb, + &beta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1); + +L120: + return 0; + +/* 9995 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, */ +/* $ ',', F4.1, '), C,', I3, ') .' ) */ + +/* End of ZCHK2. */ + +} /* zchk2_ */ + + +/* Subroutine */ int zprcn2_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, integer* m, integer* n, doublecomplex* alpha, integer* lda, integer* ldb, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len) +{ + /* Local variables */ + static char cs[14], cu[14], crc[14]; + + if (*(unsigned char *)side == 'L') { + s_copy(cs, " CblasLeft", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cs, " CblasRight", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cs,cu); + printf("%d %d (%4.1lf,%4.1lf) , A, %d, B, %d, (%4.1lf,%4.1lf) , C, %d.\n",*m,*n,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc); + +return 0; +} /* zprcn2_ */ + + +/* Subroutine */ int zchk3_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* ct, doublereal* g, doublecomplex* c__, integer* iorder, ftnlen sname_len) +{ + /* Initialized data */ + + static char ichu[2+1] = "UL"; + static char icht[3+1] = "NTC"; + static char ichd[2+1] = "UN"; + static char ichs[2+1] = "LR"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4, i__5, i__6, i__7; + doublecomplex z__1; + + /* Local variables */ + static char diag[1]; + static integer ldas, ldbs; + static logical same; + static char side[1]; + static logical left, null; + static char uplo[1]; + static integer i__, j, m, n; + static doublecomplex alpha; + static char diags[1]; + static logical isame[13]; + static char sides[1]; + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); + static integer nargs; + extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); + static logical reset; + static char uplos[1]; + static integer ia, na; + extern /* Subroutine */ int zprcn3_(integer*, integer*, char*, integer*, char*, char*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, ftnlen, ftnlen, ftnlen, ftnlen, ftnlen); + static integer nc, im, in, ms, ns; + static char tranas[1], transa[1]; + static doublereal errmax; + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + extern /* Subroutine */ void cztrmm_(integer*, char*, char*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ void cztrsm_(integer*, char*, char*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen, ftnlen); + static integer laa, icd, lbb, lda, ldb, ics; + static doublecomplex als; + static integer ict, icu; + static doublereal err; + extern logical lze_(doublecomplex*, doublecomplex*, integer*); + +/* Tests ZTRMM and ZTRSM. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + +/* .. Parameters .. */ +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. Local Arrays .. */ +/* .. External Functions .. */ +/* .. External Subroutines .. */ +/* .. Intrinsic Functions .. */ +/* .. Scalars in Common .. */ +/* .. Common blocks .. */ +/* .. Data statements .. */ + /* Parameter adjustments */ + --idim; + --alf; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --g; + --ct; + --bs; + --bb; + b_dim1 = *nmax; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --as; + --aa; + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + + /* Function Body */ +/* .. Executable Statements .. */ + + nargs = 11; + nc = 0; + reset = TRUE_; + errmax = 0.; +/* Set up zero matrix for ZMMCH. */ + i__1 = *nmax; + for (j = 1; j <= i__1; ++j) { + i__2 = *nmax; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * c_dim1; + c__[i__3].r = 0., c__[i__3].i = 0.; +/* L10: */ + } +/* L20: */ + } + + i__1 = *nidim; + for (im = 1; im <= i__1; ++im) { + m = idim[im]; + + i__2 = *nidim; + for (in = 1; in <= i__2; ++in) { + n = idim[in]; +/* Set LDB to 1 more than minimum value if room. */ + ldb = m; + if (ldb < *nmax) { + ++ldb; + } +/* Skip tests if not enough room. */ + if (ldb > *nmax) { + goto L130; + } + lbb = ldb * n; + null = m <= 0 || n <= 0; + + for (ics = 1; ics <= 2; ++ics) { + *(unsigned char *)side = *(unsigned char *)&ichs[ics - 1]; + left = *(unsigned char *)side == 'L'; + if (left) { + na = m; + } else { + na = n; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = na; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L130; + } + laa = lda * na; + + for (icu = 1; icu <= 2; ++icu) { + *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1]; + + for (ict = 1; ict <= 3; ++ict) { + *(unsigned char *)transa = *(unsigned char *)&icht[ + ict - 1]; + + for (icd = 1; icd <= 2; ++icd) { + *(unsigned char *)diag = *(unsigned char *)&ichd[ + icd - 1]; + + i__3 = *nalf; + for (ia = 1; ia <= i__3; ++ia) { + i__4 = ia; + alpha.r = alf[i__4].r, alpha.i = alf[i__4].i; + +/* Generate the matrix A. */ + + zmake_("tr", uplo, diag, &na, &na, &a[ + a_offset], nmax, &aa[1], &lda, &reset, + &c_b1, (ftnlen)2, (ftnlen)1, (ftnlen) + 1); + +/* Generate the matrix B. */ + + zmake_("ge", " ", " ", &m, &n, &b[b_offset], + nmax, &bb[1], &ldb, &reset, &c_b1, ( + ftnlen)2, (ftnlen)1, (ftnlen)1); + + ++nc; + +/* Save every datum before calling the */ +/* subroutine. */ + + *(unsigned char *)sides = *(unsigned char *) + side; + *(unsigned char *)uplos = *(unsigned char *) + uplo; + *(unsigned char *)tranas = *(unsigned char *) + transa; + *(unsigned char *)diags = *(unsigned char *) + diag; + ms = m; + ns = n; + als.r = alpha.r, als.i = alpha.i; + i__4 = laa; + for (i__ = 1; i__ <= i__4; ++i__) { + i__5 = i__; + i__6 = i__; + as[i__5].r = aa[i__6].r, as[i__5].i = aa[ + i__6].i; +/* L30: */ + } + ldas = lda; + i__4 = lbb; + for (i__ = 1; i__ <= i__4; ++i__) { + i__5 = i__; + i__6 = i__; + bs[i__5].r = bb[i__6].r, bs[i__5].i = bb[ + i__6].i; +/* L40: */ + } + ldbs = ldb; + +/* Call the subroutine. */ + + if (s_cmp(sname + 9, "mm", (ftnlen)2, (ftnlen) + 2) == 0) { + if (*trace) { + zprcn3_(ntra, &nc, sname, iorder, + side, uplo, transa, diag, &m, + &n, &alpha, &lda, &ldb, ( + ftnlen)12, (ftnlen)1, (ftnlen) + 1, (ftnlen)1, (ftnlen)1); + } + if (*rewi) { +/* al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1);*/ + } + cztrmm_(iorder, side, uplo, transa, diag, + &m, &n, &alpha, &aa[1], &lda, &bb[ + 1], &ldb, (ftnlen)1, (ftnlen)1, ( + ftnlen)1, (ftnlen)1); + } else if (s_cmp(sname + 9, "sm", (ftnlen)2, ( + ftnlen)2) == 0) { + if (*trace) { + zprcn3_(ntra, &nc, sname, iorder, + side, uplo, transa, diag, &m, + &n, &alpha, &lda, &ldb, ( + ftnlen)12, (ftnlen)1, (ftnlen) + 1, (ftnlen)1, (ftnlen)1); + } + if (*rewi) { +/* al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1);*/ + } + cztrsm_(iorder, side, uplo, transa, diag, + &m, &n, &alpha, &aa[1], &lda, &bb[ + 1], &ldb, (ftnlen)1, (ftnlen)1, ( + ftnlen)1, (ftnlen)1); + } + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); + *fatal = TRUE_; + goto L150; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)sides == *( + unsigned char *)side; + isame[1] = *(unsigned char *)uplos == *( + unsigned char *)uplo; + isame[2] = *(unsigned char *)tranas == *( + unsigned char *)transa; + isame[3] = *(unsigned char *)diags == *( + unsigned char *)diag; + isame[4] = ms == m; + isame[5] = ns == n; + isame[6] = als.r == alpha.r && als.i == + alpha.i; + isame[7] = lze_(&as[1], &aa[1], &laa); + isame[8] = ldas == lda; + if (null) { + isame[9] = lze_(&bs[1], &bb[1], &lbb); + } else { + isame[9] = lzeres_("ge", " ", &m, &n, &bs[ + 1], &bb[1], &ldb, (ftnlen)2, ( + ftnlen)1); + } + isame[10] = ldbs == ldb; + +/* If data was incorrectly changed, report and */ +/* return. */ + + same = TRUE_; + i__4 = nargs; + for (i__ = 1; i__ <= i__4; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); + } +/* L50: */ + } + if (! same) { + *fatal = TRUE_; + goto L150; + } + + if (! null) { + if (s_cmp(sname + 9, "mm", (ftnlen)2, ( + ftnlen)2) == 0) { + +/* Check the result. */ + + if (left) { + zmmch_(transa, "N", &m, &n, &m, & + alpha, &a[a_offset], nmax, + &b[b_offset], nmax, & + c_b1, &c__[c_offset], + nmax, &ct[1], &g[1], &bb[ + 1], &ldb, eps, &err, + fatal, nout, &c_true, ( + ftnlen)1, (ftnlen)1); + } else { + zmmch_("N", transa, &m, &n, &n, & + alpha, &b[b_offset], nmax, + &a[a_offset], nmax, & + c_b1, &c__[c_offset], + nmax, &ct[1], &g[1], &bb[ + 1], &ldb, eps, &err, + fatal, nout, &c_true, ( + ftnlen)1, (ftnlen)1); + } + } else if (s_cmp(sname + 9, "sm", (ftnlen) + 2, (ftnlen)2) == 0) { + +/* Compute approximation to original */ +/* matrix. */ + + i__4 = n; + for (j = 1; j <= i__4; ++j) { + i__5 = m; + for (i__ = 1; i__ <= i__5; ++i__) + { + i__6 = i__ + j * c_dim1; + i__7 = i__ + (j - 1) * ldb; + c__[i__6].r = bb[i__7].r, c__[i__6].i = bb[i__7].i; + i__6 = i__ + (j - 1) * ldb; + i__7 = i__ + j * b_dim1; + z__1.r = alpha.r * b[i__7].r - alpha.i * b[i__7].i, + z__1.i = alpha.r * b[i__7].i + alpha.i * b[ + i__7].r; + bb[i__6].r = z__1.r, bb[i__6].i = z__1.i; +/* L60: */ + } +/* L70: */ + } + + if (left) { + zmmch_(transa, "N", &m, &n, &m, & + c_b2, &a[a_offset], nmax, + &c__[c_offset], nmax, & + c_b1, &b[b_offset], nmax, + &ct[1], &g[1], &bb[1], & + ldb, eps, &err, fatal, + nout, &c_false, (ftnlen)1, + (ftnlen)1); + } else { + zmmch_("N", transa, &m, &n, &n, & + c_b2, &c__[c_offset], + nmax, &a[a_offset], nmax, + &c_b1, &b[b_offset], nmax, + &ct[1], &g[1], &bb[1], & + ldb, eps, &err, fatal, + nout, &c_false, (ftnlen)1, + (ftnlen)1); + } + } + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L150; + } + } + +/* L80: */ + } + +/* L90: */ + } + +/* L100: */ + } + +/* L110: */ + } + +/* L120: */ + } + +L130: + ; + } + +/* L140: */ + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + if (*iorder == 1) { + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + } else { + if (*iorder == 0) { + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + if (*iorder == 1) { + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + } + goto L160; + +L150: + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); + if (*trace) { + zprcn3_(ntra, &nc, sname, iorder, side, uplo, transa, diag, &m, &n, & + alpha, &lda, &ldb, (ftnlen)12, (ftnlen)1, (ftnlen)1, (ftnlen) + 1, (ftnlen)1); + } + +L160: + return 0; + +/* 9995 FORMAT(1X, I6, ': ', A12,'(', 4( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ', */ +/* $ ' .' ) */ + +/* End of ZCHK3. */ + +} /* zchk3_ */ + + +/* Subroutine */ int zprcn3_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, char* transa, char* diag, integer* m, integer* n, doublecomplex* alpha, integer* lda, integer* ldb, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len, ftnlen transa_len, ftnlen diag_len) +{ + + /* Local variables */ + static char ca[14], cd[14], cs[14], cu[14], crc[14]; + + if (*(unsigned char *)side == 'L') { + s_copy(cs, " CblasLeft", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cs, " CblasRight", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)diag == 'N') { + s_copy(cd, " CblasNonUnit", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cd, " CblasUnit", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cs,cu); + printf(" %s %s %d %d (%4.1lf,%4.1lf) A %d B %d\n",ca,cd,*m,*n,alpha->r,alpha->i,*lda,*ldb); + +return 0; +} /* zprcn3_ */ + + +/* Subroutine */ int zchk4_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, integer* iorder, ftnlen sname_len) +{ + /* Initialized data */ + + static char icht[2+1] = "NC"; + static char ichu[2+1] = "UL"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4, i__5, i__6, i__7; + doublecomplex z__1; + + /* Local variables */ + static doublecomplex beta; + static integer ldas, ldcs; + static logical same, isconj; + static doublecomplex bets; + static doublereal rals; + static logical tran, null; + static char uplo[1]; + static integer i__, j, k, n; + static doublecomplex alpha; + static doublereal rbeta; + static logical isame[13]; + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); + static integer nargs; + extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); + static doublereal rbets; + static logical reset; + static char trans[1]; + static logical upper; + static char uplos[1]; + static integer ia, ib, jc, ma, na; + extern /* Subroutine */ int zprcn4_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); + static integer nc; + extern /* Subroutine */ int zprcn6_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen); + static integer ik, in, jj, lj, ks, ns; + static doublereal ralpha; + extern /* Subroutine */ int czherk_(integer*, char*, char*, integer*, integer*, doublereal*, doublecomplex*, integer*, doublereal*, doublecomplex*, integer*, ftnlen, ftnlen); + static doublereal errmax; + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + static char transs[1], transt[1]; + extern /* Subroutine */ int czsyrk_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + static integer laa, lda, lcc, ldc; + static doublecomplex als; + static integer ict, icu; + static doublereal err; + extern logical lze_(doublecomplex*, doublecomplex*, integer*); + +/* Tests ZHERK and ZSYRK. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + +/* .. Parameters .. */ +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. Local Arrays .. */ +/* .. External Functions .. */ +/* .. External Subroutines .. */ +/* .. Intrinsic Functions .. */ +/* .. Scalars in Common .. */ +/* .. Common blocks .. */ +/* .. Data statements .. */ + /* Parameter adjustments */ + --idim; + --alf; + --bet; + --g; + --ct; + --cs; + --cc; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --bs; + --bb; + b_dim1 = *nmax; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --as; + --aa; + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + + /* Function Body */ +/* .. Executable Statements .. */ + isconj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; + + nargs = 10; + nc = 0; + reset = TRUE_; + errmax = 0.; + rals = 1.; + rbets = 1.; + + i__1 = *nidim; + for (in = 1; in <= i__1; ++in) { + n = idim[in]; +/* Set LDC to 1 more than minimum value if room. */ + ldc = n; + if (ldc < *nmax) { + ++ldc; + } +/* Skip tests if not enough room. */ + if (ldc > *nmax) { + goto L100; + } + lcc = ldc * n; + + i__2 = *nidim; + for (ik = 1; ik <= i__2; ++ik) { + k = idim[ik]; + + for (ict = 1; ict <= 2; ++ict) { + *(unsigned char *)trans = *(unsigned char *)&icht[ict - 1]; + tran = *(unsigned char *)trans == 'C'; + if (tran && ! isconj) { + *(unsigned char *)trans = 'T'; + } + if (tran) { + ma = k; + na = n; + } else { + ma = n; + na = k; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = ma; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L80; + } + laa = lda * na; + +/* Generate the matrix A. */ + + zmake_("ge", " ", " ", &ma, &na, &a[a_offset], nmax, &aa[1], & + lda, &reset, &c_b1, (ftnlen)2, (ftnlen)1, (ftnlen)1); + + for (icu = 1; icu <= 2; ++icu) { + *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1]; + upper = *(unsigned char *)uplo == 'U'; + + i__3 = *nalf; + for (ia = 1; ia <= i__3; ++ia) { + i__4 = ia; + alpha.r = alf[i__4].r, alpha.i = alf[i__4].i; + if (isconj) { + ralpha = alpha.r; + z__1.r = ralpha, z__1.i = 0.; + alpha.r = z__1.r, alpha.i = z__1.i; + } + + i__4 = *nbet; + for (ib = 1; ib <= i__4; ++ib) { + i__5 = ib; + beta.r = bet[i__5].r, beta.i = bet[i__5].i; + if (isconj) { + rbeta = beta.r; + z__1.r = rbeta, z__1.i = 0.; + beta.r = z__1.r, beta.i = z__1.i; + } + null = n <= 0; + if (isconj) { + null = null ||( (k <= 0 || ralpha == 0.) && + rbeta == 1.); + } + +/* Generate the matrix C. */ + + zmake_(sname + 7, uplo, " ", &n, &n, &c__[ + c_offset], nmax, &cc[1], &ldc, &reset, & + c_b1, (ftnlen)2, (ftnlen)1, (ftnlen)1); + + ++nc; + +/* Save every datum before calling the subroutine. */ + + *(unsigned char *)uplos = *(unsigned char *)uplo; + *(unsigned char *)transs = *(unsigned char *) + trans; + ns = n; + ks = k; + if (isconj) { + rals = ralpha; + } else { + als.r = alpha.r, als.i = alpha.i; + } + i__5 = laa; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7] + .i; +/* L10: */ + } + ldas = lda; + if (isconj) { + rbets = rbeta; + } else { + bets.r = beta.r, bets.i = beta.i; + } + i__5 = lcc; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7] + .i; +/* L20: */ + } + ldcs = ldc; + +/* Call the subroutine. */ + + if (isconj) { + if (*trace) { + zprcn6_(ntra, &nc, sname, iorder, uplo, + trans, &n, &k, &ralpha, &lda, & + rbeta, &ldc, (ftnlen)12, (ftnlen) + 1, (ftnlen)1); + } + if (*rewi) { +/* al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1);*/ + } + czherk_(iorder, uplo, trans, &n, &k, &ralpha, + &aa[1], &lda, &rbeta, &cc[1], &ldc, ( + ftnlen)1, (ftnlen)1); + } else { + if (*trace) { + zprcn4_(ntra, &nc, sname, iorder, uplo, + trans, &n, &k, &alpha, &lda, & + beta, &ldc, (ftnlen)12, (ftnlen)1, + (ftnlen)1); + } + if (*rewi) { +/* al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1);*/ + } + czsyrk_(iorder, uplo, trans, &n, &k, &alpha, & + aa[1], &lda, &beta, &cc[1], &ldc, ( + ftnlen)1, (ftnlen)1); + } + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); + *fatal = TRUE_; + goto L120; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)uplos == *(unsigned + char *)uplo; + isame[1] = *(unsigned char *)transs == *(unsigned + char *)trans; + isame[2] = ns == n; + isame[3] = ks == k; + if (isconj) { + isame[4] = rals == ralpha; + } else { + isame[4] = als.r == alpha.r && als.i == + alpha.i; + } + isame[5] = lze_(&as[1], &aa[1], &laa); + isame[6] = ldas == lda; + if (isconj) { + isame[7] = rbets == rbeta; + } else { + isame[7] = bets.r == beta.r && bets.i == + beta.i; + } + if (null) { + isame[8] = lze_(&cs[1], &cc[1], &lcc); + } else { + isame[8] = lzeres_(sname + 7, uplo, &n, &n, & + cs[1], &cc[1], &ldc, (ftnlen)2, ( + ftnlen)1); + } + isame[9] = ldcs == ldc; + +/* If data was incorrectly changed, report and */ +/* return. */ + + same = TRUE_; + i__5 = nargs; + for (i__ = 1; i__ <= i__5; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); + } +/* L30: */ + } + if (! same) { + *fatal = TRUE_; + goto L120; + } + + if (! null) { + +/* Check the result column by column. */ + + if (isconj) { + *(unsigned char *)transt = 'C'; + } else { + *(unsigned char *)transt = 'T'; + } + jc = 1; + i__5 = n; + for (j = 1; j <= i__5; ++j) { + if (upper) { + jj = 1; + lj = j; + } else { + jj = j; + lj = n - j + 1; + } + if (tran) { + zmmch_(transt, "N", &lj, &c__1, &k, & + alpha, &a[jj * a_dim1 + 1], + nmax, &a[j * a_dim1 + 1], + nmax, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, + fatal, nout, &c_true, (ftnlen) + 1, (ftnlen)1); + } else { + zmmch_("N", transt, &lj, &c__1, &k, & + alpha, &a[jj + a_dim1], nmax, + &a[j + a_dim1], nmax, &beta, & + c__[jj + j * c_dim1], nmax, & + ct[1], &g[1], &cc[jc], &ldc, + eps, &err, fatal, nout, & + c_true, (ftnlen)1, (ftnlen)1); + } + if (upper) { + jc += ldc; + } else { + jc = jc + ldc + 1; + } + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L110; + } +/* L40: */ + } + } + +/* L50: */ + } + +/* L60: */ + } + +/* L70: */ + } + +L80: + ; + } + +/* L90: */ + } + +L100: + ; + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + if (*iorder == 1) { + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + } else { + if (*iorder == 0) { + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + if (*iorder == 1) { + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + } + goto L130; + +L110: + if (n > 1) { + printf(" THESE ARE THE RESULTS FOR COLUMN %d:\n",j); + } + +L120: + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); + if (isconj) { + zprcn6_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &ralpha, &lda, + &rbeta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1); + } else { + zprcn4_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, & + beta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1); + } + +L130: + return 0; + +/* 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') ', */ +/* $ ' .' ) */ +/* 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1, */ +/* $ '), C,', I3, ') .' ) */ + +/* End of CCHK4. */ + +} /* zchk4_ */ + + +/* Subroutine */ int zprcn4_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublecomplex* alpha, integer* lda, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len) +{ + /* Local variables */ + static char ca[14], cu[14], crc[14]; + + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca); + printf("( %d %d (%4.1lf,%4.1lf) A %d (%4.1lf,%4.1lf) C %d\n",*n,*k,alpha->r,alpha->i,*lda,beta->r,beta->i,*ldc); + +return 0; +} /* zprcn4_ */ + + + +/* Subroutine */ int zprcn6_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublereal* alpha, integer* lda, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len) +{ + + /* Local variables */ + static char ca[14], cu[14], crc[14]; + + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca); + printf("( %d %d %4.1lf A %d %4.1lf C %d\n",*n,*k,*alpha,*lda,*beta,*ldc); + +return 0; +} /* zprcn6_ */ + + +/* Subroutine */ int zchk5_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* ab, doublecomplex* aa, doublecomplex* as, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, doublecomplex* w, integer* iorder, ftnlen sname_len) +{ + /* Initialized data */ + + static char icht[2+1] = "NC"; + static char ichu[2+1] = "UL"; + + /* System generated locals */ + integer c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8; + doublecomplex z__1, z__2; + + /* Local variables */ + static integer jjab; + static doublecomplex beta; + static integer ldas, ldbs, ldcs; + static logical same, isconj; + static doublecomplex bets; + static logical tran, null; + static char uplo[1]; + static integer i__, j, k, n; + static doublecomplex alpha; + static doublereal rbeta; + static logical isame[13]; + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); + static integer nargs; + extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); + static doublereal rbets; + static logical reset; + static char trans[1]; + static logical upper; + static char uplos[1]; + static integer ia, ib, jc, ma, na, nc; + extern /* Subroutine */ int zprcn5_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ int zprcn7_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen); + static integer ik, in, jj, lj, ks, ns; + static doublereal errmax; + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + static char transs[1], transt[1]; + extern /* Subroutine */ int czher2k_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublereal*, doublecomplex*, integer*, ftnlen, ftnlen); + static integer laa, lbb, lda, lcc, ldb, ldc; + static doublecomplex als; + static integer ict, icu; + extern /* Subroutine */ int czsyr2k_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + static doublereal err; + extern logical lze_(doublecomplex*, doublecomplex*, integer*); + +/* Tests ZHER2K and ZSYR2K. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + +/* .. Parameters .. */ +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. Local Arrays .. */ +/* .. External Functions .. */ +/* .. External Subroutines .. */ +/* .. Intrinsic Functions .. */ +/* .. Scalars in Common .. */ +/* .. Common blocks .. */ +/* .. Data statements .. */ + /* Parameter adjustments */ + --idim; + --alf; + --bet; + --w; + --g; + --ct; + --cs; + --cc; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --bs; + --bb; + --as; + --aa; + --ab; + + /* Function Body */ +/* .. Executable Statements .. */ + isconj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; + + nargs = 12; + nc = 0; + reset = TRUE_; + errmax = 0.; + + i__1 = *nidim; + for (in = 1; in <= i__1; ++in) { + n = idim[in]; +/* Set LDC to 1 more than minimum value if room. */ + ldc = n; + if (ldc < *nmax) { + ++ldc; + } +/* Skip tests if not enough room. */ + if (ldc > *nmax) { + goto L130; + } + lcc = ldc * n; + + i__2 = *nidim; + for (ik = 1; ik <= i__2; ++ik) { + k = idim[ik]; + + for (ict = 1; ict <= 2; ++ict) { + *(unsigned char *)trans = *(unsigned char *)&icht[ict - 1]; + tran = *(unsigned char *)trans == 'C'; + if (tran && ! isconj) { + *(unsigned char *)trans = 'T'; + } + if (tran) { + ma = k; + na = n; + } else { + ma = n; + na = k; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = ma; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L110; + } + laa = lda * na; + +/* Generate the matrix A. */ + + if (tran) { + i__3 = *nmax << 1; + zmake_("ge", " ", " ", &ma, &na, &ab[1], &i__3, &aa[1], & + lda, &reset, &c_b1, (ftnlen)2, (ftnlen)1, (ftnlen) + 1); + } else { + zmake_("ge", " ", " ", &ma, &na, &ab[1], nmax, &aa[1], & + lda, &reset, &c_b1, (ftnlen)2, (ftnlen)1, (ftnlen) + 1); + } + +/* Generate the matrix B. */ + + ldb = lda; + lbb = laa; + if (tran) { + i__3 = *nmax << 1; + zmake_("ge", " ", " ", &ma, &na, &ab[k + 1], &i__3, &bb[1] + , &ldb, &reset, &c_b1, (ftnlen)2, (ftnlen)1, ( + ftnlen)1); + } else { + zmake_("ge", " ", " ", &ma, &na, &ab[k * *nmax + 1], nmax, + &bb[1], &ldb, &reset, &c_b1, (ftnlen)2, (ftnlen) + 1, (ftnlen)1); + } + + for (icu = 1; icu <= 2; ++icu) { + *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1]; + upper = *(unsigned char *)uplo == 'U'; + + i__3 = *nalf; + for (ia = 1; ia <= i__3; ++ia) { + i__4 = ia; + alpha.r = alf[i__4].r, alpha.i = alf[i__4].i; + + i__4 = *nbet; + for (ib = 1; ib <= i__4; ++ib) { + i__5 = ib; + beta.r = bet[i__5].r, beta.i = bet[i__5].i; + if (isconj) { + rbeta = beta.r; + z__1.r = rbeta, z__1.i = 0.; + beta.r = z__1.r, beta.i = z__1.i; + } + null = n <= 0; + if (isconj) { + null = null ||( (k <= 0 || (alpha.r == 0. && + alpha.i == 0.)) && rbeta == 1.); + } + +/* Generate the matrix C. */ + + zmake_(sname + 7, uplo, " ", &n, &n, &c__[ + c_offset], nmax, &cc[1], &ldc, &reset, & + c_b1, (ftnlen)2, (ftnlen)1, (ftnlen)1); + + ++nc; + +/* Save every datum before calling the subroutine. */ + + *(unsigned char *)uplos = *(unsigned char *)uplo; + *(unsigned char *)transs = *(unsigned char *) + trans; + ns = n; + ks = k; + als.r = alpha.r, als.i = alpha.i; + i__5 = laa; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7] + .i; +/* L10: */ + } + ldas = lda; + i__5 = lbb; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + bs[i__6].r = bb[i__7].r, bs[i__6].i = bb[i__7] + .i; +/* L20: */ + } + ldbs = ldb; + if (isconj) { + rbets = rbeta; + } else { + bets.r = beta.r, bets.i = beta.i; + } + i__5 = lcc; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7] + .i; +/* L30: */ + } + ldcs = ldc; + +/* Call the subroutine. */ + + if (isconj) { + if (*trace) { + zprcn7_(ntra, &nc, sname, iorder, uplo, + trans, &n, &k, &alpha, &lda, &ldb, + &rbeta, &ldc, (ftnlen)12, ( + ftnlen)1, (ftnlen)1); + } + if (*rewi) { +/* al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1);*/ + } + czher2k_(iorder, uplo, trans, &n, &k, &alpha, + &aa[1], &lda, &bb[1], &ldb, &rbeta, & + cc[1], &ldc, (ftnlen)1, (ftnlen)1); + } else { + if (*trace) { + zprcn5_(ntra, &nc, sname, iorder, uplo, + trans, &n, &k, &alpha, &lda, &ldb, + &beta, &ldc, (ftnlen)12, (ftnlen) + 1, (ftnlen)1); + } + if (*rewi) { +/* al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1);*/ + } + czsyr2k_(iorder, uplo, trans, &n, &k, &alpha, + &aa[1], &lda, &bb[1], &ldb, &beta, & + cc[1], &ldc, (ftnlen)1, (ftnlen)1); + } + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); + *fatal = TRUE_; + goto L150; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)uplos == *(unsigned + char *)uplo; + isame[1] = *(unsigned char *)transs == *(unsigned + char *)trans; + isame[2] = ns == n; + isame[3] = ks == k; + isame[4] = als.r == alpha.r && als.i == alpha.i; + isame[5] = lze_(&as[1], &aa[1], &laa); + isame[6] = ldas == lda; + isame[7] = lze_(&bs[1], &bb[1], &lbb); + isame[8] = ldbs == ldb; + if (isconj) { + isame[9] = rbets == rbeta; + } else { + isame[9] = bets.r == beta.r && bets.i == + beta.i; + } + if (null) { + isame[10] = lze_(&cs[1], &cc[1], &lcc); + } else { + isame[10] = lzeres_("he", uplo, &n, &n, &cs[1] + , &cc[1], &ldc, (ftnlen)2, (ftnlen)1); + } + isame[11] = ldcs == ldc; + +/* If data was incorrectly changed, report and */ +/* return. */ + + same = TRUE_; + i__5 = nargs; + for (i__ = 1; i__ <= i__5; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); + } +/* L40: */ + } + if (! same) { + *fatal = TRUE_; + goto L150; + } + + if (! null) { + +/* Check the result column by column. */ + + if (isconj) { + *(unsigned char *)transt = 'C'; + } else { + *(unsigned char *)transt = 'T'; + } + jjab = 1; + jc = 1; + i__5 = n; + for (j = 1; j <= i__5; ++j) { + if (upper) { + jj = 1; + lj = j; + } else { + jj = j; + lj = n - j + 1; + } + if (tran) { + i__6 = k; + for (i__ = 1; i__ <= i__6; ++i__) { + i__7 = i__; + i__8 = ((j - 1) << 1) * *nmax + k + + i__; + z__1.r = alpha.r * ab[i__8].r - + alpha.i * ab[i__8].i, + z__1.i = alpha.r * ab[ + i__8].i + alpha.i * ab[ + i__8].r; + w[i__7].r = z__1.r, w[i__7].i = + z__1.i; + if (isconj) { + i__7 = k + i__; + d_cnjg(&z__2, &alpha); + i__8 = ((j - 1) << 1) * *nmax + i__; + z__1.r = z__2.r * ab[i__8].r - z__2.i * ab[i__8].i, + z__1.i = z__2.r * ab[i__8].i + z__2.i * ab[ + i__8].r; + w[i__7].r = z__1.r, w[i__7].i = z__1.i; + } else { + i__7 = k + i__; + i__8 = ((j - 1) << 1) * *nmax + i__; + z__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] + .i, z__1.i = alpha.r * ab[i__8].i + alpha.i + * ab[i__8].r; + w[i__7].r = z__1.r, w[i__7].i = z__1.i; + } +/* L50: */ + } + i__6 = k << 1; + i__7 = *nmax << 1; + i__8 = *nmax << 1; + zmmch_(transt, "N", &lj, &c__1, &i__6, + &c_b2, &ab[jjab], &i__7, &w[ + 1], &i__8, &beta, &c__[jj + j + * c_dim1], nmax, &ct[1], &g[1] + , &cc[jc], &ldc, eps, &err, + fatal, nout, &c_true, (ftnlen) + 1, (ftnlen)1); + } else { + i__6 = k; + for (i__ = 1; i__ <= i__6; ++i__) { + if (isconj) { + i__7 = i__; + d_cnjg(&z__2, &ab[(k + i__ - 1) * *nmax + j]); + z__1.r = alpha.r * z__2.r - alpha.i * z__2.i, + z__1.i = alpha.r * z__2.i + alpha.i * + z__2.r; + w[i__7].r = z__1.r, w[i__7].i = z__1.i; + i__7 = k + i__; + i__8 = (i__ - 1) * *nmax + j; + z__2.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] + .i, z__2.i = alpha.r * ab[i__8].i + alpha.i + * ab[i__8].r; + d_cnjg(&z__1, &z__2); + w[i__7].r = z__1.r, w[i__7].i = z__1.i; + } else { + i__7 = i__; + i__8 = (k + i__ - 1) * *nmax + j; + z__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] + .i, z__1.i = alpha.r * ab[i__8].i + alpha.i + * ab[i__8].r; + w[i__7].r = z__1.r, w[i__7].i = z__1.i; + i__7 = k + i__; + i__8 = (i__ - 1) * *nmax + j; + z__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] + .i, z__1.i = alpha.r * ab[i__8].i + alpha.i + * ab[i__8].r; + w[i__7].r = z__1.r, w[i__7].i = z__1.i; + } +/* L60: */ + } + i__6 = k << 1; + i__7 = *nmax << 1; + zmmch_("N", "N", &lj, &c__1, &i__6, & + c_b2, &ab[jj], nmax, &w[1], & + i__7, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, + fatal, nout, &c_true, (ftnlen) + 1, (ftnlen)1); + } + if (upper) { + jc += ldc; + } else { + jc = jc + ldc + 1; + if (tran) { + jjab += *nmax << 1; + } + } + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L140; + } +/* L70: */ + } + } + +/* L80: */ + } + +/* L90: */ + } + +/* L100: */ + } + +L110: + ; + } + +/* L120: */ + } + +L130: + ; + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + if (*iorder == 1) { + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); + } + } else { + if (*iorder == 0) { + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + if (*iorder == 1) { + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); + } + } + goto L160; + +L140: + if (n > 1) { + printf(" THESE ARE THE RESULTS FOR COLUMN %d:\n",j); + } + +L150: + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); + if (isconj) { + zprcn7_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, & + ldb, &rbeta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1); + } else { + zprcn5_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, & + ldb, &beta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1); + } + +L160: + return 0; + +/* 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, */ +/* $ ', C,', I3, ') .' ) */ +/* 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, */ +/* $ ',', F4.1, '), C,', I3, ') .' ) */ + +/* End of ZCHK5. */ + +} /* zchk5_ */ + + +/* Subroutine */ int zprcn5_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublecomplex* alpha, integer* lda, integer* ldb, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len) +{ + /* Local variables */ + static char ca[14], cu[14], crc[14]; + + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca); + printf("%d %d (%4.1lf,%4.1lf) , A, %d, B, %d, (%4.1lf,%4.1lf) , C, %d.\n",*n,*k,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc); + +return 0; +} /* zprcn5_ */ + + + +/* Subroutine */ int zprcn7_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublecomplex* alpha, integer* lda, integer* ldb, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len) +{ + + /* Local variables */ + static char ca[14], cu[14], crc[14]; + + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca); + printf("%d %d (%4.1lf,%4.1lf), A, %d, B, %d, %4.1lf, C, %d.\n",*n,*k,alpha->r,alpha->i,*lda,*ldb,*beta,*ldc); + +return 0; +} /* zprcn7_ */ + + +/* Subroutine */ int zmake_(char* type__, char* uplo, char* diag, integer* m, integer* n, doublecomplex* a, integer* nmax, doublecomplex* aa, integer* lda, logical* reset, doublecomplex* transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3, i__4; + doublereal d__1; + doublecomplex z__1, z__2; + + /* Local variables */ + static integer ibeg, iend; + extern /* Double Complex */ VOID zbeg_(doublecomplex*, logical*); + static logical unit; + static integer i__, j; + static logical lower, upper; + static integer jj; + static logical gen, her, tri, sym; + + +/* Generates values for an M by N matrix A. */ +/* Stores the values in the array AA in the data structure required */ +/* by the routine, with unwanted elements set to rogue value. */ + +/* TYPE is 'ge', 'he', 'sy' or 'tr'. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + +/* .. Parameters .. */ +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. External Functions .. */ +/* .. Intrinsic Functions .. */ +/* .. Executable Statements .. */ + /* Parameter adjustments */ + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + --aa; + + /* Function Body */ + gen = s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0; + her = s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0; + sym = s_cmp(type__, "sy", (ftnlen)2, (ftnlen)2) == 0; + tri = s_cmp(type__, "tr", (ftnlen)2, (ftnlen)2) == 0; + upper = (her || sym || tri) && *(unsigned char *)uplo == 'U'; + lower = (her || sym || tri) && *(unsigned char *)uplo == 'L'; + unit = tri && *(unsigned char *)diag == 'U'; + +/* Generate data in array A. */ + + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + if (gen || (upper && i__ <= j) || (lower && i__ >= j)) { + i__3 = i__ + j * a_dim1; + zbeg_(&z__2, reset); + z__1.r = z__2.r + transl->r, z__1.i = z__2.i + transl->i; + a[i__3].r = z__1.r, a[i__3].i = z__1.i; + if (i__ != j) { +/* Set some elements to zero */ + if (*n > 3 && j == *n / 2) { + i__3 = i__ + j * a_dim1; + a[i__3].r = 0., a[i__3].i = 0.; + } + if (her) { + i__3 = j + i__ * a_dim1; + d_cnjg(&z__1, &a[i__ + j * a_dim1]); + a[i__3].r = z__1.r, a[i__3].i = z__1.i; + } else if (sym) { + i__3 = j + i__ * a_dim1; + i__4 = i__ + j * a_dim1; + a[i__3].r = a[i__4].r, a[i__3].i = a[i__4].i; + } else if (tri) { + i__3 = j + i__ * a_dim1; + a[i__3].r = 0., a[i__3].i = 0.; + } + } + } +/* L10: */ + } + if (her) { + i__2 = j + j * a_dim1; + i__3 = j + j * a_dim1; + d__1 = a[i__3].r; + z__1.r = d__1, z__1.i = 0.; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; + } + if (tri) { + i__2 = j + j * a_dim1; + i__3 = j + j * a_dim1; + z__1.r = a[i__3].r + 1., z__1.i = a[i__3].i + 0.; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; + } + if (unit) { + i__2 = j + j * a_dim1; + a[i__2].r = 1., a[i__2].i = 0.; + } +/* L20: */ + } + +/* Store elements in array AS in data structure required by routine. */ + + if (s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + i__4 = i__ + j * a_dim1; + aa[i__3].r = a[i__4].r, aa[i__3].i = a[i__4].i; +/* L30: */ + } + i__2 = *lda; + for (i__ = *m + 1; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + aa[i__3].r = -1e10, aa[i__3].i = 1e10; +/* L40: */ + } +/* L50: */ + } + } else if (s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__, + "sy", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__, "tr", (ftnlen) + 2, (ftnlen)2) == 0) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (upper) { + ibeg = 1; + if (unit) { + iend = j - 1; + } else { + iend = j; + } + } else { + if (unit) { + ibeg = j + 1; + } else { + ibeg = j; + } + iend = *n; + } + i__2 = ibeg - 1; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + aa[i__3].r = -1e10, aa[i__3].i = 1e10; +/* L60: */ + } + i__2 = iend; + for (i__ = ibeg; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + i__4 = i__ + j * a_dim1; + aa[i__3].r = a[i__4].r, aa[i__3].i = a[i__4].i; +/* L70: */ + } + i__2 = *lda; + for (i__ = iend + 1; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + aa[i__3].r = -1e10, aa[i__3].i = 1e10; +/* L80: */ + } + if (her) { + jj = j + (j - 1) * *lda; + i__2 = jj; + i__3 = jj; + d__1 = aa[i__3].r; + z__1.r = d__1, z__1.i = -1e10; + aa[i__2].r = z__1.r, aa[i__2].i = z__1.i; + } +/* L90: */ + } + } + return 0; + +/* End of ZMAKE. */ + +} /* zmake_ */ + +/* Subroutine */ int zmmch_(char* transa, char* transb, integer* m, integer* n, integer* kk, doublecomplex* alpha, doublecomplex* a, integer* lda, doublecomplex* b, integer* ldb, doublecomplex* beta, doublecomplex* c__, integer* ldc, doublecomplex* ct, doublereal* g, doublecomplex* cc, integer* ldcc, doublereal* eps, doublereal* err, logical* fatal, integer* nout, logical* mv, ftnlen transa_len, ftnlen transb_len) +{ + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, + cc_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; + doublereal d__1, d__2, d__3, d__4, d__5, d__6; + doublecomplex z__1, z__2, z__3, z__4; + + double sqrt(double); + /* Local variables */ + static doublereal erri; + static integer i__, j, k; + static logical trana, tranb, ctrana, ctranb; + +/* Checks the results of the computational tests. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + +/* .. Parameters .. */ +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. Intrinsic Functions .. */ +/* .. Statement Functions .. */ +/* .. Statement Function definitions .. */ +/* .. Executable Statements .. */ + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --ct; + --g; + cc_dim1 = *ldcc; + cc_offset = 1 + cc_dim1 * 1; + cc -= cc_offset; + + /* Function Body */ + trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == + 'C'; + tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == + 'C'; + ctrana = *(unsigned char *)transa == 'C'; + ctranb = *(unsigned char *)transb == 'C'; + +/* Compute expected result, one column at a time, in CT using data */ +/* in A, B and C. */ +/* Compute gauges in G. */ + + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__; + ct[i__3].r = 0., ct[i__3].i = 0.; + g[i__] = 0.; +/* L10: */ + } + if (! trana && ! tranb) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = i__ + k * a_dim1; + i__7 = k + j * b_dim1; + z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7].i, + z__2.i = a[i__6].r * b[i__7].i + a[i__6].i * b[ + i__7].r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = i__ + k * a_dim1; + i__5 = k + j * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = d_imag( + &a[i__ + k * a_dim1]), abs(d__2))) * ((d__3 = b[ + i__5].r, abs(d__3)) + (d__4 = d_imag(&b[k + j * + b_dim1]), abs(d__4))); +/* L20: */ + } +/* L30: */ + } + } else if (trana && ! tranb) { + if (ctrana) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + d_cnjg(&z__3, &a[k + i__ * a_dim1]); + i__6 = k + j * b_dim1; + z__2.r = z__3.r * b[i__6].r - z__3.i * b[i__6].i, + z__2.i = z__3.r * b[i__6].i + z__3.i * b[i__6] + .r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = k + i__ * a_dim1; + i__5 = k + j * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + d_imag(&a[k + i__ * a_dim1]), abs(d__2))) * (( + d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag( + &b[k + j * b_dim1]), abs(d__4))); +/* L40: */ + } +/* L50: */ + } + } else { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = k + i__ * a_dim1; + i__7 = k + j * b_dim1; + z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7] + .i, z__2.i = a[i__6].r * b[i__7].i + a[i__6] + .i * b[i__7].r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = k + i__ * a_dim1; + i__5 = k + j * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + d_imag(&a[k + i__ * a_dim1]), abs(d__2))) * (( + d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag( + &b[k + j * b_dim1]), abs(d__4))); +/* L60: */ + } +/* L70: */ + } + } + } else if (! trana && tranb) { + if (ctranb) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = i__ + k * a_dim1; + d_cnjg(&z__3, &b[j + k * b_dim1]); + z__2.r = a[i__6].r * z__3.r - a[i__6].i * z__3.i, + z__2.i = a[i__6].r * z__3.i + a[i__6].i * + z__3.r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = i__ + k * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + d_imag(&a[i__ + k * a_dim1]), abs(d__2))) * (( + d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag( + &b[j + k * b_dim1]), abs(d__4))); +/* L80: */ + } +/* L90: */ + } + } else { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = i__ + k * a_dim1; + i__7 = j + k * b_dim1; + z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7] + .i, z__2.i = a[i__6].r * b[i__7].i + a[i__6] + .i * b[i__7].r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = i__ + k * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + d_imag(&a[i__ + k * a_dim1]), abs(d__2))) * (( + d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag( + &b[j + k * b_dim1]), abs(d__4))); +/* L100: */ + } +/* L110: */ + } + } + } else if (trana && tranb) { + if (ctrana) { + if (ctranb) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + d_cnjg(&z__3, &a[k + i__ * a_dim1]); + d_cnjg(&z__4, &b[j + k * b_dim1]); + z__2.r = z__3.r * z__4.r - z__3.i * z__4.i, + z__2.i = z__3.r * z__4.i + z__3.i * + z__4.r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = k + i__ * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + d_imag(&a[k + i__ * a_dim1]), abs(d__2))) + * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 + = d_imag(&b[j + k * b_dim1]), abs(d__4))); +/* L120: */ + } +/* L130: */ + } + } else { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + d_cnjg(&z__3, &a[k + i__ * a_dim1]); + i__6 = j + k * b_dim1; + z__2.r = z__3.r * b[i__6].r - z__3.i * b[i__6].i, + z__2.i = z__3.r * b[i__6].i + z__3.i * b[ + i__6].r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = k + i__ * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + d_imag(&a[k + i__ * a_dim1]), abs(d__2))) + * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 + = d_imag(&b[j + k * b_dim1]), abs(d__4))); +/* L140: */ + } +/* L150: */ + } + } + } else { + if (ctranb) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = k + i__ * a_dim1; + d_cnjg(&z__3, &b[j + k * b_dim1]); + z__2.r = a[i__6].r * z__3.r - a[i__6].i * z__3.i, + z__2.i = a[i__6].r * z__3.i + a[i__6].i * + z__3.r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = k + i__ * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + d_imag(&a[k + i__ * a_dim1]), abs(d__2))) + * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 + = d_imag(&b[j + k * b_dim1]), abs(d__4))); +/* L160: */ + } +/* L170: */ + } + } else { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = k + i__ * a_dim1; + i__7 = j + k * b_dim1; + z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[ + i__7].i, z__2.i = a[i__6].r * b[i__7].i + + a[i__6].i * b[i__7].r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = k + i__ * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + d_imag(&a[k + i__ * a_dim1]), abs(d__2))) + * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 + = d_imag(&b[j + k * b_dim1]), abs(d__4))); +/* L180: */ + } +/* L190: */ + } + } + } + } + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__; + i__4 = i__; + z__2.r = alpha->r * ct[i__4].r - alpha->i * ct[i__4].i, z__2.i = + alpha->r * ct[i__4].i + alpha->i * ct[i__4].r; + i__5 = i__ + j * c_dim1; + z__3.r = beta->r * c__[i__5].r - beta->i * c__[i__5].i, z__3.i = + beta->r * c__[i__5].i + beta->i * c__[i__5].r; + z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i; + ct[i__3].r = z__1.r, ct[i__3].i = z__1.i; + i__3 = i__ + j * c_dim1; + g[i__] = ((d__1 = alpha->r, abs(d__1)) + (d__2 = d_imag(alpha), + abs(d__2))) * g[i__] + ((d__3 = beta->r, abs(d__3)) + ( + d__4 = d_imag(beta), abs(d__4))) * ((d__5 = c__[i__3].r, + abs(d__5)) + (d__6 = d_imag(&c__[i__ + j * c_dim1]), abs( + d__6))); +/* L200: */ + } + +/* Compute the error ratio for this result. */ + + *err = 0.; + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__; + i__4 = i__ + j * cc_dim1; + z__2.r = ct[i__3].r - cc[i__4].r, z__2.i = ct[i__3].i - cc[i__4] + .i; + z__1.r = z__2.r, z__1.i = z__2.i; + erri = ((d__1 = z__1.r, abs(d__1)) + (d__2 = d_imag(&z__1), abs( + d__2))) / *eps; + if (g[i__] != 0.) { + erri /= g[i__]; + } + *err = f2cmax(*err,erri); + if (*err * sqrt(*eps) >= 1.) { + goto L230; + } +/* L210: */ + } + +/* L220: */ + } + +/* If the loop completes, all results are at least half accurate. */ + goto L250; + +/* Report fatal error. */ + +L230: + *fatal = TRUE_; + printf(" ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HALF ACCURATE *******\n"); + printf(" EXPECTED RESULT COMPUTED RESULT\n"); + i__1 = *m; + for (i__ = 1; i__ <= i__1; ++i__) { + if (*mv) { + printf("%7d (%15.6g,%15.6g) (%15.6g,%15.6g)\n",i__,ct[i__].r,ct[i__].i,cc[i__+j*cc_dim1].r,cc[i__+j*cc_dim1].i); + } else { + printf("%7d (%15.6g,%15.6g) (%15.6g,%15.6g)\n",i__,cc[i__+j*cc_dim1].r,cc[i__+j*cc_dim1].i,ct[i__].r,ct[i__].i); + } +/* L240: */ + } + if (*n > 1) { + printf(" THESE ARE THE RESULTS FOR COLUMN %d\n",j); + } + +L250: + return 0; + + +/* End of ZMMCH. */ + +} /* zmmch_ */ + +logical lze_(doublecomplex* ri, doublecomplex* rj, integer* lr) +{ + /* System generated locals */ + integer i__1, i__2, i__3; + logical ret_val; + + /* Local variables */ + static integer i__; + + +/* Tests if two arrays are identical. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. Executable Statements .. */ + /* Parameter adjustments */ + --rj; + --ri; + + /* Function Body */ + i__1 = *lr; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + i__3 = i__; + if (ri[i__2].r != rj[i__3].r || ri[i__2].i != rj[i__3].i) { + goto L20; + } +/* L10: */ + } + ret_val = TRUE_; + goto L30; +L20: + ret_val = FALSE_; +L30: + return ret_val; + +/* End of LZE. */ + +} /* lze_ */ + +logical lzeres_(char* type__, char* uplo, integer* m, integer* n, doublecomplex *aa, doublecomplex* as, integer* lda, ftnlen type_len, ftnlen uplo_len) +{ + /* System generated locals */ + integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2, i__3, i__4; + logical ret_val; + + /* Local variables */ + static integer ibeg, iend, i__, j; + static logical upper; + + +/* Tests if selected elements in two arrays are equal. */ + +/* TYPE is 'ge' or 'he' or 'sy'. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. Executable Statements .. */ + /* Parameter adjustments */ + as_dim1 = *lda; + as_offset = 1 + as_dim1 * 1; + as -= as_offset; + aa_dim1 = *lda; + aa_offset = 1 + aa_dim1 * 1; + aa -= aa_offset; + + /* Function Body */ + upper = *(unsigned char *)uplo == 'U'; + if (s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *lda; + for (i__ = *m + 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * aa_dim1; + i__4 = i__ + j * as_dim1; + if (aa[i__3].r != as[i__4].r || aa[i__3].i != as[i__4].i) { + goto L70; + } +/* L10: */ + } +/* L20: */ + } + } else if (s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__, + "sy", (ftnlen)2, (ftnlen)2) == 0) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (upper) { + ibeg = 1; + iend = j; + } else { + ibeg = j; + iend = *n; + } + i__2 = ibeg - 1; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * aa_dim1; + i__4 = i__ + j * as_dim1; + if (aa[i__3].r != as[i__4].r || aa[i__3].i != as[i__4].i) { + goto L70; + } +/* L30: */ + } + i__2 = *lda; + for (i__ = iend + 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * aa_dim1; + i__4 = i__ + j * as_dim1; + if (aa[i__3].r != as[i__4].r || aa[i__3].i != as[i__4].i) { + goto L70; + } +/* L40: */ + } +/* L50: */ + } + } + +/* 60 CONTINUE */ + ret_val = TRUE_; + goto L80; +L70: + ret_val = FALSE_; +L80: + return ret_val; + +/* End of LZERES. */ + +} /* lzeres_ */ + +/* Double Complex */ VOID zbeg_(doublecomplex* ret_val, logical* reset) +{ + /* System generated locals */ + doublereal d__1, d__2; + doublecomplex z__1; + + /* Local variables */ + static integer i__, j, ic, mi, mj; + + +/* Generates complex numbers as pairs of random numbers uniformly */ +/* distributed between -0.5 and 0.5. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + +/* .. Scalar Arguments .. */ +/* .. Local Scalars .. */ +/* .. Save statement .. */ +/* .. Intrinsic Functions .. */ +/* .. Executable Statements .. */ + if (*reset) { +/* Initialize local variables. */ + mi = 891; + mj = 457; + i__ = 7; + j = 7; + ic = 0; + *reset = FALSE_; + } + +/* The sequence of values of I or J is bounded between 1 and 999. */ +/* If initial I or J = 1,2,3,6,7 or 9, the period will be 50. */ +/* If initial I or J = 4 or 8, the period will be 25. */ +/* If initial I or J = 5, the period will be 10. */ +/* IC is used to break up the period by skipping 1 value of I or J */ +/* in 6. */ + + ++ic; +L10: + i__ *= mi; + j *= mj; + i__ -= i__ / 1000 * 1000; + j -= j / 1000 * 1000; + if (ic >= 5) { + ic = 0; + goto L10; + } + d__1 = (i__ - 500) / 1001.; + d__2 = (j - 500) / 1001.; + z__1.r = d__1, z__1.i = d__2; + ret_val->r = z__1.r, ret_val->i = z__1.i; + return ; + +/* End of ZBEG. */ + +} /* zbeg_ */ + +doublereal ddiff_(doublereal* x, doublereal* y) +{ + /* System generated locals */ + doublereal ret_val; + + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + +/* .. Scalar Arguments .. */ +/* .. Executable Statements .. */ + ret_val = *x - *y; + return ret_val; + +/* End of DDIFF. */ + +} /* ddiff_ */ + +/* Main program alias */ /*int zblat3_ () { MAIN__ (); }*/ diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index c7ef44035..5b54a2ada 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -85,6 +85,12 @@ ZSWAPKERNEL = cswap_lsx.S CSUMKERNEL = csum_lsx.S ZSUMKERNEL = csum_lsx.S +SGEMVNKERNEL = sgemv_n_lsx.S +SGEMVTKERNEL = sgemv_t_lsx.S + +DGEMVNKERNEL = dgemv_n_lsx.S +DGEMVTKERNEL = dgemv_t_lsx.S + DGEMMKERNEL = dgemm_kernel_8x4.S DGEMMINCOPY = dgemm_ncopy_8_lsx.S DGEMMITCOPY = dgemm_tcopy_8_lsx.S @@ -100,6 +106,9 @@ DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +CGEMVNKERNEL = cgemv_n_4_lsx.S +CGEMVTKERNEL = cgemv_t_4_lsx.S + CGEMMKERNEL = cgemm_kernel_8x4_lsx.S CGEMMINCOPY = cgemm_ncopy_8_lsx.S CGEMMITCOPY = cgemm_tcopy_8_lsx.S @@ -115,6 +124,9 @@ CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +ZGEMVNKERNEL = zgemv_n_2_lsx.S +ZGEMVTKERNEL = zgemv_t_2_lsx.S + ZGEMMKERNEL = zgemm_kernel_4x4_lsx.S ZGEMMONCOPY = zgemm_ncopy_4_lsx.S ZGEMMOTCOPY = zgemm_tcopy_4_lsx.S diff --git a/kernel/loongarch64/cgemv_n_4_lsx.S b/kernel/loongarch64/cgemv_n_4_lsx.S new file mode 100644 index 000000000..cf8273797 --- /dev/null +++ b/kernel/loongarch64/cgemv_n_4_lsx.S @@ -0,0 +1,323 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define Y_ORG $r15 +#define OFFSET $r16 +#define K_LDA $r17 +#define M8 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 + +#define VALPHA $vr1 +#define X0 $vr2 +#define X1 $vr3 +#define X2 $vr4 +#define X3 $vr5 +#define X4 $vr6 +#define X5 $vr7 +#define X6 $vr8 +#define X7 $vr9 +#define Y0 $vr10 +#define Y1 $vr11 +#define A0 $vr12 +#define A1 $vr13 +#define A2 $vr14 +#define A3 $vr15 +#define A4 $vr16 +#define A5 $vr17 +#define A6 $vr18 +#define A7 $vr19 +#define A8 $vr20 +#define A9 $vr21 +#define A10 $vr22 +#define A11 $vr23 +#define A12 $vr24 +#define A13 $vr25 +#define A14 $vr26 +#define A15 $vr27 +#define TMP0 $vr28 +#define TMP1 $vr29 +#define TMP2 $vr30 + +#if !defined(CONJ) +#if !defined(XCONJ) +#define GXCONJ 0 +#define GCONJ 0 +#else +#define GXCONJ 1 +#define GCONJ 0 +#endif +#else +#if !defined(XCONJ) +#define GXCONJ 0 +#define GCONJ 1 +#else +#define GXCONJ 1 +#define GCONJ 1 +#endif +#endif + +.macro CLOAD_X_4 + GLDREPL v, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18 + GCOMPLEXMUL GXCONJ, \ + vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ + X1, VALPHA, X1, TMP0, TMP1, TMP2, \ + X2, VALPHA, X2, TMP0, TMP1, TMP2, \ + X3, VALPHA, X3, TMP0, TMP1, TMP2 +.endm + +.macro CLOAD_X_4_GAP + vldrepl.d X0, X, 0x00 + PTR_ADD T0, X, INC_X + vldrepl.d X1, T0, 0x00 + PTR_ADD T0, T0, INC_X + vldrepl.d X2, T0, 0x00 + PTR_ADD T0, T0, INC_X + vldrepl.d X3, T0, 0x00 + + GCOMPLEXMUL GXCONJ, \ + vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ + X1, VALPHA, X1, TMP0, TMP1, TMP2, \ + X2, VALPHA, X2, TMP0, TMP1, TMP2, \ + X3, VALPHA, X3, TMP0, TMP1, TMP2 +.endm + +.macro CLOAD_X_1 + GLDREPL v, d, X0, X, 0x00 + GCOMPLEXMUL GXCONJ, \ + vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2 +.endm + +.macro CLOAD_Y_4 + GLD v, , Y0, Y, 0, Y1, Y, 0x10 +.endm + +.macro CLOAD_Y_4_GAP + fld.d $f10, Y, 0 + fldx.d $f13, Y, INC_Y + PTR_ALSL T0, INC_Y, Y, 1 + fld.d $f11, T0, 0 + fldx.d $f17, T0, INC_Y + vpackev.d Y0, A1, Y0 + vpackev.d Y1, A5, Y1 +.endm + +.macro CLOAD_Y_1 + fld.d $f10, Y, 0 +.endm + +.macro CSTORE_Y_4 + GST v, , Y0, Y, 0, Y1, Y, 0x10 +.endm + +.macro CSTORE_Y_4_GAP + vstelm.d Y0, Y, 0, 0 + PTR_ADD T0, Y, INC_Y + vstelm.d Y0, T0, 0, 1 + PTR_ADD T0, T0, INC_Y + vstelm.d Y1, T0, 0, 0 + PTR_ADD T0, T0, INC_Y + vstelm.d Y1, T0, 0, 1 +.endm + +.macro CSTORE_Y_1 + fst.d $f10, Y, 0 +.endm + +.macro CGEMV_N_4x4 + GLD_INC v, , 0x10, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0 + + GCOMPLEXMADD GXCONJ, GCONJ, \ + vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \ + Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2, \ + Y0, X2, A4, Y0, TMP0, TMP1, TMP2, Y1, X2, A5, Y1, TMP0, TMP1, TMP2, \ + Y0, X3, A6, Y0, TMP0, TMP1, TMP2, Y1, X3, A7, Y1, TMP0, TMP1, TMP2 +.endm + +.macro CGEMV_N_1x4 + GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0 + GCOMPLEXMADD GXCONJ, GCONJ, \ + vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \ + Y0, X1, A2, Y0, TMP0, TMP1, TMP2, \ + Y0, X2, A4, Y0, TMP0, TMP1, TMP2, \ + Y0, X3, A6, Y0, TMP0, TMP1, TMP2 +.endm + +.macro CGEMV_N_1x1 + fld.d $f12, PA0, 0 + PTR_ADDI PA0, PA0, 0x08 + GCOMPLEXMADD GXCONJ, GCONJ, \ + vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2 +.endm + +.macro CGEMV_N_LSX XW:req, X_4:req, X_1:req, Y_4:req, Y_1:req + PTR_SRLI J, N, 2 + beqz J, .L_\XW\()_N_3 + PTR_SLLI K_LDA, LDA, 2 + PTR_SUB K_LDA, K_LDA, M8 +.L_\XW\()_N_L4: + CLOAD_\X_4 + xor K, K, K + move Y, Y_ORG + PTR_SRLI I, M, 2 + beqz I, .L_\XW\()_M_3 +.align 5 +.L_\XW\()_M_L4: + CLOAD_\Y_4 + CGEMV_N_4x4 + CSTORE_\Y_4 + PTR_ADDI I, I, -1 + PTR_ALSL Y, INC_Y, Y, 2 + PTR_ADDI K, K, 4 + bnez I, .L_\XW\()_M_L4 +.L_\XW\()_M_3: + andi I, M, 3 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + CLOAD_\Y_1 + CGEMV_N_1x4 + CSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#endif + PTR_ALSL X, INC_X, X, 2 + bnez J, .L_\XW\()_N_L4 +.L_\XW\()_N_3: + andi J, N, 3 + beqz J, .L_END +.L_\XW\()_N_L1: + CLOAD_\X_1 + xor K, K, K + move Y, Y_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + CLOAD_\Y_1 + CGEMV_N_1x1 + CSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_N_1_M_L1 +.L_\XW\()_N_1_M_END: + PTR_ADDI J, J, -1 + PTR_SUB K_LDA, LDA, M8 + PTR_ADD PA0, PA0, K_LDA + PTR_ADD X, X, INC_X + bnez J, .L_\XW\()_N_L1 + + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 7, 31 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + PTR_SUB J, INC_Y, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */ + PTR_ALSL I, I, J, 1 + GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 + // Init VALPHA + vpackev.w $vr0, $vr1, $vr0 + vpackev.d VALPHA, $vr0, $vr0 + move Y_ORG, Y + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA +#elif __loongarch_grlen == 32 + GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA +#else + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 // Obtain the offset address + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0_0 - .L_GAP_TABLE + .hword .L_GAP_0_1 - .L_GAP_TABLE + .hword .L_GAP_1_0 - .L_GAP_TABLE + .hword .L_GAP_1_1 - .L_GAP_TABLE +.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ + CGEMV_N_LSX GAP_0_0, X_4, X_1, Y_4, Y_1 +.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */ + CGEMV_N_LSX GAP_0_1, X_4, X_1, Y_4_GAP, Y_1 +.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ + CGEMV_N_LSX GAP_1_0, X_4_GAP, X_1, Y_4, Y_1 +.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ + CGEMV_N_LSX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1 +.L_END: + pop_if_used 17 + 7, 31 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/cgemv_n_8_lasx.S b/kernel/loongarch64/cgemv_n_8_lasx.S index b078e3227..ba38a9573 100644 --- a/kernel/loongarch64/cgemv_n_8_lasx.S +++ b/kernel/loongarch64/cgemv_n_8_lasx.S @@ -122,14 +122,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \ X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38 GCOMPLEXMUL GXCONJ, \ - xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \ - X1, X1, VALPHA, TMP0, TMP1, TMP2, \ - X2, X2, VALPHA, TMP0, TMP1, TMP2, \ - X3, X3, VALPHA, TMP0, TMP1, TMP2, \ - X4, X4, VALPHA, TMP0, TMP1, TMP2, \ - X5, X5, VALPHA, TMP0, TMP1, TMP2, \ - X6, X6, VALPHA, TMP0, TMP1, TMP2, \ - X7, X7, VALPHA, TMP0, TMP1, TMP2 + xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ + X1, VALPHA, X1, TMP0, TMP1, TMP2, \ + X2, VALPHA, X2, TMP0, TMP1, TMP2, \ + X3, VALPHA, X3, TMP0, TMP1, TMP2, \ + X4, VALPHA, X4, TMP0, TMP1, TMP2, \ + X5, VALPHA, X5, TMP0, TMP1, TMP2, \ + X6, VALPHA, X6, TMP0, TMP1, TMP2, \ + X7, VALPHA, X7, TMP0, TMP1, TMP2 .endm .macro CLOAD_X_8_GAP @@ -150,14 +150,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvldrepl.d X7, T0, 0x00 GCOMPLEXMUL GXCONJ, \ - xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \ - X1, X1, VALPHA, TMP0, TMP1, TMP2, \ - X2, X2, VALPHA, TMP0, TMP1, TMP2, \ - X3, X3, VALPHA, TMP0, TMP1, TMP2, \ - X4, X4, VALPHA, TMP0, TMP1, TMP2, \ - X5, X5, VALPHA, TMP0, TMP1, TMP2, \ - X6, X6, VALPHA, TMP0, TMP1, TMP2, \ - X7, X7, VALPHA, TMP0, TMP1, TMP2 + xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ + X1, VALPHA, X1, TMP0, TMP1, TMP2, \ + X2, VALPHA, X2, TMP0, TMP1, TMP2, \ + X3, VALPHA, X3, TMP0, TMP1, TMP2, \ + X4, VALPHA, X4, TMP0, TMP1, TMP2, \ + X5, VALPHA, X5, TMP0, TMP1, TMP2, \ + X6, VALPHA, X6, TMP0, TMP1, TMP2, \ + X7, VALPHA, X7, TMP0, TMP1, TMP2 .endm .macro CLOAD_Y_8 @@ -228,7 +228,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro CLOAD_X_1 GLDREPL xv, d, X0, X, 0x00 GCOMPLEXMUL GXCONJ, \ - xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2 + xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2 .endm .macro CLOAD_Y_1 diff --git a/kernel/loongarch64/cgemv_t_4_lsx.S b/kernel/loongarch64/cgemv_t_4_lsx.S new file mode 100644 index 000000000..ada349364 --- /dev/null +++ b/kernel/loongarch64/cgemv_t_4_lsx.S @@ -0,0 +1,290 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define PY0 $r14 +#define X_ORG $r15 +#define PY1 $r16 +#define K_LDA $r17 +#define PY2 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 +#define M8 $r30 + +#define VALPHA $vr0 +#define X0 $vr1 +#define X1 $vr2 +#define A0 $vr3 +#define A1 $vr4 +#define A2 $vr5 +#define A3 $vr6 +#define A4 $vr7 +#define A5 $vr8 +#define A6 $vr9 +#define A7 $vr10 +#define A8 $vr11 +#define A9 $vr12 +#define A10 $vr13 +#define A11 $vr14 +#define A12 $vr15 +#define A13 $vr16 +#define A14 $vr17 +#define A15 $vr18 +#define TP0 $vr19 +#define TP1 $vr20 +#define TP2 $vr21 +#define TP3 $vr22 +#define TP4 $vr23 +#define TP5 $vr24 +#define TP6 $vr25 +#define TP7 $vr26 +#define TMP0 $vr27 +#define TMP1 $vr28 +#define TMP2 $vr29 +#define Y0 $vr3 +#define Y1 $vr4 +#define Y2 $vr5 +#define Y3 $vr6 +#define Y4 $vr7 +#define Y5 $vr8 +#define Y6 $vr9 +#define Y7 $vr10 + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) +#define GXCONJ1 0 +#define GCONJ1 0 +#else +#define GXCONJ1 1 +#define GCONJ1 0 +#endif + +#if !defined(XCONJ) +#define GXCONJ2 0 +#define GCONJ2 0 +#else +#define GXCONJ2 0 +#define GCONJ2 1 +#endif + +.macro ZERO_Y4 + GXOR v, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3 +.endm + +.macro ZERO_Y1 + GXOR v, v, TP0, TP0, TP0 +.endm + +.macro CLOAD_X4 + GLD v, , X0, X, 0x00, X1, X, 0x10 +.endm + +.macro CLOAD_X4_GAP + fld.d $f1, X, 0x00 + fldx.d $f3, X, INC_X + PTR_ALSL T0, INC_X, X, 1 + fld.d $f2, T0, 0x00 + fldx.d $f4, T0, INC_X + vpackev.d X0, A0, X0 + vpackev.d X1, A1, X1 +.endm + +.macro CGEMV_T_4x4 + GLD_INC v, , 0x10, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0 + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + vf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \ + TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2, \ + TP2, A4, X0, TP2, TMP0, TMP1, TMP2, TP2, A5, X1, TP2, TMP0, TMP1, TMP2, \ + TP3, A6, X0, TP3, TMP0, TMP1, TMP2, TP3, A7, X1, TP3, TMP0, TMP1, TMP2 +.endm + +.macro CGEMV_T_LSX XW:req, X4:req + PTR_SRLI J, N, 2 + beqz J, .L_\XW\()_N_3 + PTR_SLLI K_LDA, LDA, 2 + PTR_SUB K_LDA, K_LDA, M8 +.L_\XW\()_N_L4: + ZERO_Y4 + move X, X_ORG + PTR_SRLI I, M, 2 + beqz I, .L_\XW\()_M_3 +.align 5 +.L_\XW\()_M_L4: + CLOAD_\X4 + CGEMV_T_4x4 + PTR_ADDI I, I, -1 + PTR_ALSL X, INC_X, X, 2 + bnez I, .L_\XW\()_M_L4 +.L_\XW\()_M_3: + // Accumulated + GCOMPLEXACC vf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3 + andi I, M, 3 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + fld.d $f1, X, 0x00 + fld.d $f11, PA0, 0x00 + fld.d $f12, PA1, 0x00 + fld.d $f13, PA2, 0x00 + fld.d $f14, PA3, 0x00 +#if __loongarch_grlen == 64 + GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08 +#elif __loongarch_grlen == 32 + GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08 +#else + GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08 +#endif + + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + vf, s, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2, \ + A2, A10, X0, A2, TMP0, TMP1, TMP2, A3, A11, X0, A3, TMP0, TMP1, TMP2 + + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + fld.d $f11, Y, 0x00 + fldx.d $f12, Y, INC_Y + PTR_ALSL PY0, INC_Y, Y, 1 + fld.d $f13, PY0, 0x00 + fldx.d $f14, PY0, INC_Y + + GCOMPLEXMADD GXCONJ2, GCONJ2, \ + vf, s, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2,\ + A10, VALPHA, A2, A10, TMP0, TMP1, TMP2, A11, VALPHA, A3, A11, TMP0, TMP1, TMP2 + + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#endif + fst.d $f11, Y, 0x00 + fstx.d $f12, Y, INC_Y + fst.d $f13, PY0, 0x00 + fstx.d $f14, PY0, INC_Y + PTR_ALSL Y, INC_Y, Y, 2 + bnez J, .L_\XW\()_N_L4 +.L_\XW\()_N_3: + andi J, N, 3 + beqz J, .L_END + PTR_SUB K_LDA, LDA, M8 +.L_\XW\()_N_1: + ZERO_Y1 + move X, X_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + fld.d $f3, PA0, 0x00 + fld.d $f1, X, 0x00 + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + vf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + PTR_ADDI PA0, PA0, 0x08 + bnez I, .L_\XW\()_N_1_M_L1 +.L_\XW\()_N_1_M_END: + PTR_ADDI J, J, -1 + fld.d $f3, Y, 0x00 + GCOMPLEXMADD GXCONJ2, GCONJ2, \ + vf, s, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2 + fst.d $f3, Y, 0x00 + PTR_ADD PA0, PA0, K_LDA + PTR_ADD Y, Y, INC_Y + bnez J, .L_\XW\()_N_1 + + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 8, 30 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 + // Init VALPHA + vpackev.w $vr0, $vr1, $vr0 + vpackev.d VALPHA, $vr0, $vr0 + move X_ORG, X + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA +#elif __loongarch_grlen == 32 + GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA +#else + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0 - .L_GAP_TABLE + .hword .L_GAP_1 - .L_GAP_TABLE +.L_GAP_0: /* if (incx == 1) */ + CGEMV_T_LSX GAP_0, X4 +.L_GAP_1: /* if (incx != 1) */ + CGEMV_T_LSX GAP_1, X4_GAP +.L_END: + pop_if_used 17 + 8, 30 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/dgemv_n_lsx.S b/kernel/loongarch64/dgemv_n_lsx.S new file mode 100644 index 000000000..9a0141fb1 --- /dev/null +++ b/kernel/loongarch64/dgemv_n_lsx.S @@ -0,0 +1,229 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +/* Param */ +#define M $r4 +#define N $r5 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INCX $r10 +#define Y $r11 +#define INCY $r6 +#define BUFFER $r16 +#define ALPHA $f0 + +#define YORIG $r18 +#define T0 $r19 +#define T1 $r20 +#define XX $r12 +#define YY $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 +#define IX $r25 +#define IY $r26 +#define II $r27 +#define T2 $r28 +#define T3 $r29 +#define T4 $r30 + +/* LSX vectors */ +#define U0 $vr11 +#define U1 $vr12 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define VALPHA $vr10 + +#define a1 $f3 +#define a2 $f4 +#define a3 $f5 +#define a4 $f6 +#define a5 $f7 +#define a6 $f8 +#define a7 $f9 +#define a8 $f10 + + + PROLOGUE + + LDARG INCY, $sp, 0 + LDARG BUFFER, $sp, 8 + + addi.d $sp, $sp, -80 + + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 32 + SDARG $r27, $sp, 40 + SDARG $r28, $sp, 48 + SDARG $r29, $sp, 56 + SDARG $r30, $sp, 64 + ST ALPHA, $sp, 72 + + vldrepl.d VALPHA, $sp, 72 + + slli.d LDA, LDA, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + + bge $r0, M, .L999 + bge $r0, N, .L999 + + move J, $r0 + move IX, $r0 + + move AO1, A //a_ptr + move XX, X + move YY, Y + + beq J, M, .L999 + +.L01: + vldx U0, XX, IX + vshuf4i.d U0, U0, 0x00 + + vfmul.d U1, VALPHA, U0 //temp1 + + move IY, $r0 + move II, $r0 + move I, $r0 + + srai.d T0, M, 2 //n/4 + beq I, T0, .L03 + +.L02: + vldx U2, AO1, II + addi.d II, II, 16 + vldx U7, AO1, II + + move T1, IY + add.d T2, T1, INCY + add.d T3, T2, INCY + add.d T4, T3, INCY + + fldx.d a1, YY, T1 + fldx.d a2, YY, T2 + fldx.d a3, YY, T3 + fldx.d a4, YY, T4 + + vextrins.d U3, U4, 0x10 + vextrins.d U5, U6, 0x10 + + vfmadd.d U3, U1, U2, U3 + vfmadd.d U5, U1, U7, U5 + + vextrins.d U4, U3, 0x01 + vextrins.d U6, U5, 0x01 + + fstx.d a1, YY, T1 + fstx.d a2, YY, T2 + fstx.d a3, YY, T3 + fstx.d a4, YY, T4 + + add.d IY, T4, INCY + addi.d II, II, 16 + addi.d I, I, 1 + blt I, T0, .L02 + +.L03: + andi T0, M, 2 + beq $r0, T0, .L04 + + addi.d T1, $r0, 4 + mod.d T1, M, T1 + sub.d II, M, T1 + slli.d II, II, BASE_SHIFT + + move T1, IY + add.d T2, T1, INCY + + vldx U2, AO1, II + + fldx.d a1, YY, T1 + fldx.d a2, YY, T2 + + vextrins.d U3, U4, 0x10 + + vfmadd.d U3, U1, U2, U3 + + vextrins.d U4, U3, 0x01 + + fstx.d a1, YY, T1 + fstx.d a2, YY, T2 + + add.d IY, T2, INCY + +.L04: + andi T0, M, 1 + beq $r0, T0, .L05 + + addi.d II, M, -1 + slli.d II, II, BASE_SHIFT + + fldx.d a1, AO1, II + fldx.d a3, YY, IY + + fmadd.d a3, $f12, a1, a3 + + fstx.d a3, YY, IY + + add.d IY, IY, INCY + +.L05: + add.d AO1, AO1, LDA + add.d IX, IX, INCX + + addi.d J, J, 1 + blt J, N, .L01 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 32 + LDARG $r27, $sp, 40 + LDARG $r28, $sp, 48 + LDARG $r29, $sp, 56 + LDARG $r30, $sp, 64 + LD ALPHA, $sp, 72 + addi.d $sp, $sp, 80 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/dgemv_t_lsx.S b/kernel/loongarch64/dgemv_t_lsx.S new file mode 100644 index 000000000..76f0d9bdc --- /dev/null +++ b/kernel/loongarch64/dgemv_t_lsx.S @@ -0,0 +1,279 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +/* Param */ +#define M $r4 +#define N $r5 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INCX $r10 +#define Y $r11 +#define INCY $r6 +#define BUFFER $r16 +#define ALPHA $f0 + +#define YORIG $r18 +#define T0 $r19 +#define T1 $r20 +#define AO3 $r12 +#define AO4 $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 +#define IX $r25 +#define IY $r26 +#define II $r27 +#define T2 $r28 +#define T3 $r29 +#define T4 $r30 + +/* LSX vectors */ +#define U0 $vr11 +#define U1 $vr12 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define VALPHA $vr10 + +#define a1 $f3 +#define a2 $f4 +#define a3 $f5 +#define a4 $f6 +#define a5 $f7 +#define a6 $f8 +#define a7 $f9 +#define a8 $f10 + + + PROLOGUE + + LDARG INCY, $sp, 0 + LDARG BUFFER, $sp, 8 + + addi.d $sp, $sp, -80 + + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 32 + SDARG $r27, $sp, 40 + SDARG $r28, $sp, 48 + SDARG $r29, $sp, 56 + SDARG $r30, $sp, 64 + ST ALPHA, $sp, 72 + + vldrepl.d VALPHA, $sp, 72 + + slli.d LDA, LDA, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + + bge $r0, M, .L999 + bge $r0, N, .L999 + + move J, $r0 + move IY, $r0 + + move AO1, A //a_ptr1 + + srai.d T0, N, 2 //n/4 + beq J, T0, .L04 + +.L01: /* j + +typedef __vector unsigned char vec_t; +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); +typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); + +#define SET_ACC_ZERO() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); \ + __builtin_mma_xxsetaccz (&acc4); \ + __builtin_mma_xxsetaccz (&acc5); \ + __builtin_mma_xxsetaccz (&acc6); \ + __builtin_mma_xxsetaccz (&acc7); + +#if (defined(NN) || defined(NT) || defined(TN) || defined(TT)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = _arbi + _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += _arbi + _aibr; } +#endif + +#if (defined(NR) || defined(NC) || defined(TR) || defined(TC)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr + _aibi; _imag = -_arbi + _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += -_arbi + _aibr; } +#endif + +#if (defined(RN) || defined(RT) || defined(CN) || defined(CT)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr + _aibi; _imag = _arbi - _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += _arbi - _aibr; } +#endif + +#if (defined(RR) || defined(RC) || defined(CR) || defined(CC)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = -_arbi - _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += -_arbi - _aibr; } +#endif + +#if defined (TRMMKERNEL) +#define A_OP = +#else +#define A_OP += +#endif + +#define BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + __builtin_mma_disassemble_acc ((void *)result, &acc0); \ + __builtin_mma_disassemble_acc ((void *)&result[ 4], &acc1); \ + __builtin_mma_disassemble_acc ((void *)&result[ 8], &acc2); \ + __builtin_mma_disassemble_acc ((void *)&result[12], &acc3); \ + __builtin_mma_disassemble_acc ((void *)&result[16], &acc4); \ + __builtin_mma_disassemble_acc ((void *)&result[20], &acc5); \ + __builtin_mma_disassemble_acc ((void *)&result[24], &acc6); \ + __builtin_mma_disassemble_acc ((void *)&result[28], &acc7); + +#define COMP_MUL_1 \ + COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) + +#define COMP_MAC_1(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ +} + +#define COMP_MUL_2A \ + COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) \ + COMP_MUL(tr[1], res[ 2], res[ 7], ti[1], res[ 3], res[ 6]) + +#define COMP_MAC_2A(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MAC(tr[1], _ro[ 2], _ro[ 7], ti[1], _ro[ 3], _ro[ 6]) \ +} + +#define COMP_MUL_2B \ + COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) \ + COMP_MUL(tr[1], res[ 8], res[13], ti[1], res[ 9], res[12]) + +#define COMP_MAC_2B(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ +} + +#define COMP_MUL_4A(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MUL(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MUL(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ + COMP_MUL(tr[2], _ro[16], _ro[21], ti[2], _ro[17], _ro[20]) \ + COMP_MUL(tr[3], _ro[24], _ro[29], ti[3], _ro[25], _ro[28]) \ +} + +#define COMP_MAC_4A(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ + COMP_MAC(tr[2], _ro[16], _ro[21], ti[2], _ro[17], _ro[20]) \ + COMP_MAC(tr[3], _ro[24], _ro[29], ti[3], _ro[25], _ro[28]) \ +} + +#define COMP_MUL_4B(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MUL(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MUL(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ + COMP_MUL(tr[2], _ro[ 2], _ro[ 7], ti[2], _ro[ 3], _ro[ 6]) \ + COMP_MUL(tr[3], _ro[10], _ro[15], ti[3], _ro[11], _ro[14]) \ +} + +#define COMP_MAC_4B(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ + COMP_MAC(tr[2], _ro[ 2], _ro[ 7], ti[2], _ro[ 3], _ro[ 6]) \ + COMP_MAC(tr[3], _ro[10], _ro[15], ti[3], _ro[11], _ro[14]) \ +} + + +#define SAVE_ACC_COMPLEX_11 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_1 \ + COMP_MAC_1(16) \ + COMP_MAC_1(32) \ + COMP_MAC_1(48) \ + COMP_MAC_1(64) \ + COMP_MAC_1(80) \ + COMP_MAC_1(96) \ + COMP_MAC_1(112) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; + +#define SAVE_ACC_COMPLEX_12 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_2A \ + COMP_MAC_2A(16) \ + COMP_MAC_2A(32) \ + COMP_MAC_2A(48) \ + COMP_MAC_2A(64) \ + COMP_MAC_2A(80) \ + COMP_MAC_2A(96) \ + COMP_MAC_2A(112) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_1 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_2B \ + COMP_MAC_2B(16) \ + COMP_MAC_2B(32) \ + COMP_MAC_2B(48) \ + COMP_MAC_2B(64) \ + COMP_MAC_2B(80) \ + COMP_MAC_2B(96) \ + COMP_MAC_2B(112) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_2 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4A(0) \ + COMP_MAC_4A(32) \ + COMP_MAC_4A(64) \ + COMP_MAC_4A(96) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_4 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4A(0) \ + COMP_MAC_4A(64) \ + CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4A(32) \ + COMP_MAC_4A(96) \ + CO[ 8] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 9] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[10] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[11] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[12] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[13] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[14] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[15] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_22_4 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4B(0) \ + CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+ 0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+ 1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+ 2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+ 3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4B(16) \ + CO[ 4] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 5] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[ 6] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[ 7] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4B(32) \ + CO[ 8] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 9] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[10] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[11] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+ 8] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+ 9] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+10] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+11] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4B(48) \ + CO[12] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[13] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[14] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[15] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+12] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+13] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+14] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+15] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_22_2 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4B(0) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4B(16) \ + CO[4] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[5] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[6] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[7] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_22_1 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4B(0) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_24_ALL \ + __builtin_mma_disassemble_acc ((void *)result, &acc0); \ + __builtin_mma_disassemble_acc ((void *)(&result[4]), &acc4); \ + __builtin_mma_disassemble_acc ((void *)(&result[8]), &acc1); \ + __builtin_mma_disassemble_acc ((void *)(&result[12]), &acc5); \ + __builtin_mma_disassemble_acc ((void *)(&result[16]), &acc2); \ + __builtin_mma_disassemble_acc ((void *)(&result[20]), &acc6); \ + __builtin_mma_disassemble_acc ((void *)(&result[24]), &acc3); \ + __builtin_mma_disassemble_acc ((void *)(&result[28]), &acc7); \ + COMP_MUL(tr[ 0], res[ 0], res[ 5], ti[ 0], res[ 1], res[ 4]) \ + COMP_MUL(tr[ 1], res[ 8], res[ 13], ti[ 1], res[ 9], res[ 12]) \ + COMP_MUL(tr[ 2], res[ 2], res[ 7], ti[ 2], res[ 3], res[ 6]) \ + COMP_MUL(tr[ 3], res[ 10], res[ 15], ti[ 3], res[ 11], res[ 14]) \ + COMP_MUL(tr[ 4], res[ 16], res[ 21], ti[ 4], res[ 17], res[ 20]) \ + COMP_MUL(tr[ 5], res[ 24], res[ 29], ti[ 5], res[ 25], res[ 28]) \ + COMP_MUL(tr[ 6], res[ 18], res[ 23], ti[ 6], res[ 19], res[ 22]) \ + COMP_MUL(tr[ 7], res[ 26], res[ 31], ti[ 7], res[ 27], res[ 30]) \ + COMP_MUL(tr[ 8], res[ 32], res[ 37], ti[ 8], res[ 33], res[ 36]) \ + COMP_MUL(tr[ 9], res[ 40], res[ 45], ti[ 9], res[ 41], res[ 44]) \ + COMP_MUL(tr[10], res[ 34], res[ 39], ti[10], res[ 35], res[ 38]) \ + COMP_MUL(tr[11], res[ 42], res[ 47], ti[11], res[ 43], res[ 46]) \ + COMP_MUL(tr[12], res[ 48], res[ 53], ti[12], res[ 49], res[ 52]) \ + COMP_MUL(tr[13], res[ 56], res[ 61], ti[13], res[ 57], res[ 60]) \ + COMP_MUL(tr[14], res[ 50], res[ 55], ti[14], res[ 51], res[ 54]) \ + COMP_MUL(tr[15], res[ 58], res[ 63], ti[15], res[ 59], res[ 62]) \ + COMP_MUL(tr[16], res[ 64], res[ 69], ti[16], res[ 65], res[ 68]) \ + COMP_MUL(tr[17], res[ 72], res[ 77], ti[17], res[ 73], res[ 76]) \ + COMP_MUL(tr[18], res[ 66], res[ 71], ti[18], res[ 67], res[ 70]) \ + COMP_MUL(tr[19], res[ 74], res[ 79], ti[19], res[ 75], res[ 78]) \ + COMP_MUL(tr[20], res[ 80], res[ 85], ti[20], res[ 81], res[ 84]) \ + COMP_MUL(tr[21], res[ 88], res[ 93], ti[21], res[ 89], res[ 92]) \ + COMP_MUL(tr[22], res[ 82], res[ 87], ti[22], res[ 83], res[ 86]) \ + COMP_MUL(tr[23], res[ 90], res[ 95], ti[23], res[ 91], res[ 94]) \ + COMP_MUL(tr[24], res[ 96], res[101], ti[24], res[ 97], res[100]) \ + COMP_MUL(tr[25], res[104], res[109], ti[25], res[105], res[108]) \ + COMP_MUL(tr[26], res[ 98], res[103], ti[26], res[ 99], res[102]) \ + COMP_MUL(tr[27], res[106], res[111], ti[27], res[107], res[110]) \ + COMP_MUL(tr[28], res[112], res[117], ti[28], res[113], res[116]) \ + COMP_MUL(tr[29], res[120], res[125], ti[29], res[121], res[124]) \ + COMP_MUL(tr[30], res[114], res[119], ti[30], res[115], res[118]) \ + COMP_MUL(tr[31], res[122], res[127], ti[31], res[123], res[126]) \ + CO[ 0] A_OP tr[ 0] * alpha_r - ti[ 0] * alpha_i; \ + CO[ 1] A_OP ti[ 0] * alpha_r + tr[ 0] * alpha_i; \ + CO[ 2] A_OP tr[ 1] * alpha_r - ti[ 1] * alpha_i; \ + CO[ 3] A_OP ti[ 1] * alpha_r + tr[ 1] * alpha_i; \ + CO[2*ldc+ 0] A_OP tr[ 2] * alpha_r - ti[ 2] * alpha_i; \ + CO[2*ldc+ 1] A_OP ti[ 2] * alpha_r + tr[ 2] * alpha_i; \ + CO[2*ldc+ 2] A_OP tr[ 3] * alpha_r - ti[ 3] * alpha_i; \ + CO[2*ldc+ 3] A_OP ti[ 3] * alpha_r + tr[ 3] * alpha_i; \ + CO[4*ldc+ 0] A_OP tr[ 4] * alpha_r - ti[ 4] * alpha_i; \ + CO[4*ldc+ 1] A_OP ti[ 4] * alpha_r + tr[ 4] * alpha_i; \ + CO[4*ldc+ 2] A_OP tr[ 5] * alpha_r - ti[ 5] * alpha_i; \ + CO[4*ldc+ 3] A_OP ti[ 5] * alpha_r + tr[ 5] * alpha_i; \ + CO[6*ldc+ 0] A_OP tr[ 6] * alpha_r - ti[ 6] * alpha_i; \ + CO[6*ldc+ 1] A_OP ti[ 6] * alpha_r + tr[ 6] * alpha_i; \ + CO[6*ldc+ 2] A_OP tr[ 7] * alpha_r - ti[ 7] * alpha_i; \ + CO[6*ldc+ 3] A_OP ti[ 7] * alpha_r + tr[ 7] * alpha_i; \ + CO[ 4] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i; \ + CO[ 5] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i; \ + CO[ 6] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i; \ + CO[ 7] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i; \ + CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i; \ + CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i; \ + CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i; \ + CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i; \ + CO[4*ldc+ 4] A_OP tr[12] * alpha_r - ti[12] * alpha_i; \ + CO[4*ldc+ 5] A_OP ti[12] * alpha_r + tr[12] * alpha_i; \ + CO[4*ldc+ 6] A_OP tr[13] * alpha_r - ti[13] * alpha_i; \ + CO[4*ldc+ 7] A_OP ti[13] * alpha_r + tr[13] * alpha_i; \ + CO[6*ldc+ 4] A_OP tr[14] * alpha_r - ti[14] * alpha_i; \ + CO[6*ldc+ 5] A_OP ti[14] * alpha_r + tr[14] * alpha_i; \ + CO[6*ldc+ 6] A_OP tr[15] * alpha_r - ti[15] * alpha_i; \ + CO[6*ldc+ 7] A_OP ti[15] * alpha_r + tr[15] * alpha_i; \ + CO[ 8] A_OP tr[16] * alpha_r - ti[16] * alpha_i; \ + CO[ 9] A_OP ti[16] * alpha_r + tr[16] * alpha_i; \ + CO[ 10] A_OP tr[17] * alpha_r - ti[17] * alpha_i; \ + CO[ 11] A_OP ti[17] * alpha_r + tr[17] * alpha_i; \ + CO[2*ldc+ 8] A_OP tr[18] * alpha_r - ti[18] * alpha_i; \ + CO[2*ldc+ 9] A_OP ti[18] * alpha_r + tr[18] * alpha_i; \ + CO[2*ldc+10] A_OP tr[19] * alpha_r - ti[19] * alpha_i; \ + CO[2*ldc+11] A_OP ti[19] * alpha_r + tr[19] * alpha_i; \ + CO[4*ldc+ 8] A_OP tr[20] * alpha_r - ti[20] * alpha_i; \ + CO[4*ldc+ 9] A_OP ti[20] * alpha_r + tr[20] * alpha_i; \ + CO[4*ldc+10] A_OP tr[21] * alpha_r - ti[21] * alpha_i; \ + CO[4*ldc+11] A_OP ti[21] * alpha_r + tr[21] * alpha_i; \ + CO[6*ldc+ 8] A_OP tr[22] * alpha_r - ti[22] * alpha_i; \ + CO[6*ldc+ 9] A_OP ti[22] * alpha_r + tr[22] * alpha_i; \ + CO[6*ldc+10] A_OP tr[23] * alpha_r - ti[23] * alpha_i; \ + CO[6*ldc+11] A_OP ti[23] * alpha_r + tr[23] * alpha_i; \ + CO[ 12] A_OP tr[24] * alpha_r - ti[24] * alpha_i; \ + CO[ 13] A_OP ti[24] * alpha_r + tr[24] * alpha_i; \ + CO[ 14] A_OP tr[25] * alpha_r - ti[25] * alpha_i; \ + CO[ 15] A_OP ti[25] * alpha_r + tr[25] * alpha_i; \ + CO[2*ldc+12] A_OP tr[26] * alpha_r - ti[26] * alpha_i; \ + CO[2*ldc+13] A_OP ti[26] * alpha_r + tr[26] * alpha_i; \ + CO[2*ldc+14] A_OP tr[27] * alpha_r - ti[27] * alpha_i; \ + CO[2*ldc+15] A_OP ti[27] * alpha_r + tr[27] * alpha_i; \ + CO[4*ldc+12] A_OP tr[28] * alpha_r - ti[28] * alpha_i; \ + CO[4*ldc+13] A_OP ti[28] * alpha_r + tr[28] * alpha_i; \ + CO[4*ldc+14] A_OP tr[29] * alpha_r - ti[29] * alpha_i; \ + CO[4*ldc+15] A_OP ti[29] * alpha_r + tr[29] * alpha_i; \ + CO[6*ldc+12] A_OP tr[30] * alpha_r - ti[30] * alpha_i; \ + CO[6*ldc+13] A_OP ti[30] * alpha_r + tr[30] * alpha_i; \ + CO[6*ldc+14] A_OP tr[31] * alpha_r - ti[31] * alpha_i; \ + CO[6*ldc+15] A_OP ti[31] * alpha_r + tr[31] * alpha_i; + +#define SAVE_ACC_COMPLEX_24(ACC1, ACC2, CI) \ + __builtin_mma_disassemble_acc ((void *)result, ACC1); \ + __builtin_mma_disassemble_acc ((void *)(&result[4]), ACC2); \ + COMP_MUL(tr[0], res[0], res[5], ti[0], res[1], res[4]) \ + COMP_MUL(tr[1], res[8], res[13], ti[1], res[9], res[12]) \ + COMP_MUL(tr[2], res[2], res[7], ti[2], res[3], res[6]) \ + COMP_MUL(tr[3], res[10], res[15], ti[3], res[11], res[14]) \ + COMP_MUL(tr[4], res[16], res[21], ti[4], res[17], res[20]) \ + COMP_MUL(tr[5], res[24], res[29], ti[5], res[25], res[28]) \ + COMP_MUL(tr[6], res[18], res[23], ti[6], res[19], res[22]) \ + COMP_MUL(tr[7], res[26], res[31], ti[7], res[27], res[30]) \ + CO[CI+0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[CI+1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[CI+2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[CI+3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[CI+2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[CI+2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[CI+2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[CI+2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + CO[CI+4*ldc+0] A_OP tr[4] * alpha_r - ti[4] * alpha_i; \ + CO[CI+4*ldc+1] A_OP ti[4] * alpha_r + tr[4] * alpha_i; \ + CO[CI+4*ldc+2] A_OP tr[5] * alpha_r - ti[5] * alpha_i; \ + CO[CI+4*ldc+3] A_OP ti[5] * alpha_r + tr[5] * alpha_i; \ + CO[CI+6*ldc+0] A_OP tr[6] * alpha_r - ti[6] * alpha_i; \ + CO[CI+6*ldc+1] A_OP ti[6] * alpha_r + tr[6] * alpha_i; \ + CO[CI+6*ldc+2] A_OP tr[7] * alpha_r - ti[7] * alpha_i; \ + CO[CI+6*ldc+3] A_OP ti[7] * alpha_r + tr[7] * alpha_i; + +#define SAVE_ACC_COMPLEX_14 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) \ + COMP_MUL(tr[1], res[ 2], res[ 7], ti[1], res[ 3], res[ 6]) \ + COMP_MUL(tr[2], res[ 16], res[ 21], ti[2], res[ 17], res[ 20]) \ + COMP_MUL(tr[3], res[ 18], res[ 23], ti[3], res[ 19], res[ 22]) \ + COMP_MAC(tr[0], res[ 32], res[ 37], ti[0], res[ 33], res[ 36]) \ + COMP_MAC(tr[1], res[ 34], res[ 39], ti[1], res[ 35], res[ 38]) \ + COMP_MAC(tr[2], res[ 48], res[ 53], ti[2], res[ 49], res[ 52]) \ + COMP_MAC(tr[3], res[ 50], res[ 55], ti[3], res[ 51], res[ 54]) \ + COMP_MAC(tr[0], res[ 64], res[ 69], ti[0], res[ 65], res[ 68]) \ + COMP_MAC(tr[1], res[ 66], res[ 71], ti[1], res[ 67], res[ 70]) \ + COMP_MAC(tr[2], res[ 80], res[ 85], ti[2], res[ 81], res[ 84]) \ + COMP_MAC(tr[3], res[ 82], res[ 87], ti[3], res[ 83], res[ 86]) \ + COMP_MAC(tr[0], res[ 96], res[101], ti[0], res[ 97], res[100]) \ + COMP_MAC(tr[1], res[ 98], res[103], ti[1], res[ 99], res[102]) \ + COMP_MAC(tr[2], res[112], res[117], ti[2], res[113], res[116]) \ + COMP_MAC(tr[3], res[114], res[119], ti[3], res[115], res[118]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[4*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[4*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[6*ldc+0] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[6*ldc+1] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +#define REFRESH_TEMP_BK(x, y) \ + temp = k - off; +#elif defined(LEFT) +#define REFRESH_TEMP_BK(x, y) \ + temp = off + x; +#else +#define REFRESH_TEMP_BK(x, y) \ + temp = off + y; +#endif +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_POINTERS(x, y) \ + BO = B; \ + REFRESH_TEMP_BK(x, y) +#else +#define REFRESH_POINTERS(x, y) \ + AO += off * (2*x); \ + BO = B + off * (2*y); \ + REFRESH_TEMP_BK(x, y) +#endif + +#ifdef LEFT +#define REFRESH_OFF(x) \ + off += x; +#else +#define REFRESH_OFF(x) +#endif + +#ifdef LEFT +#define UPDATE_TEMP(x, y) \ + temp -= x; +#else +#define UPDATE_TEMP(x, y) \ + temp -= y; +#endif + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_TMP_AFTER_SAVE(x, y) \ + temp = k - off; \ + UPDATE_TEMP(x, y) \ + AO += temp * (2*x); \ + BO += temp * (2*y); +#else +#define REFRESH_TMP_AFTER_SAVE(x, y) +#endif + +#define REFRESH_AFTER_SAVE(x,y) \ + REFRESH_TMP_AFTER_SAVE(x, y) \ + REFRESH_OFF(x) +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ +int +#ifdef TRMMKERNEL +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, + FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc, BLASLONG offset) +#else +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, + FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG i1, i, l, temp; + FLOAT *AO, *BO, *CO; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + v4sf_t result[32]; + FLOAT *res, tr[64], ti[64]; + res = (FLOAT *) result; + + for (i1 = 0; i1 < (n >> 2); i1++) { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + AO = A; + CO = C; + C += ldc << 3; + + for (i = 0; i < (m >> 3); i++) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 4); +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf32gerpp(&acc4, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc5, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc6, rowA3, rowB2); + __builtin_mma_xvf32gerpp(&acc7, rowA4, rowB2); + } + SAVE_ACC_COMPLEX_24_ALL + CO += 16; + AO += temp << 4; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 4) +#endif + } + if (m & 4) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 4); +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB3); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB4); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); + } + for (l = (temp & (~1)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB2); + } + SAVE_ACC_COMPLEX_24(&acc0, &acc2, 0) + SAVE_ACC_COMPLEX_24(&acc1, &acc3, 4) + CO += 8; + AO += temp << 3; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 4) +#endif + } + if (m & 2) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 4); +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12]; + vec_t rowB5 = *(vec_t *) & BO[(l<<3)+16]; + vec_t rowB6 = *(vec_t *) & BO[(l<<3)+20]; + vec_t rowB7 = *(vec_t *) & BO[(l<<3)+24]; + vec_t rowB8 = *(vec_t *) & BO[(l<<3)+28]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc0, rowA2, rowB3); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB4); + __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB5); + __builtin_mma_xvf32gerpp(&acc1, rowA3, rowB6); + __builtin_mma_xvf32gerpp(&acc0, rowA4, rowB7); + __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB8); + } + for (l = (temp & (~3)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); + } + SAVE_ACC_COMPLEX_24(&acc0, &acc1, 0) + CO += 4; + AO += temp << 2; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 4) +#endif + } + if (m & 1) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 4) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2]; + vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4]; + vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12]; + vec_t rowB5 = *(vec_t *) & BO[(l<<3)+16]; + vec_t rowB6 = *(vec_t *) & BO[(l<<3)+20]; + vec_t rowB7 = *(vec_t *) & BO[(l<<3)+24]; + vec_t rowB8 = *(vec_t *) & BO[(l<<3)+28]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA2, rowB3); + __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB4); + __builtin_mma_xvf32gerpp(&acc4, rowA3, rowB5); + __builtin_mma_xvf32gerpp(&acc5, rowA3, rowB6); + __builtin_mma_xvf32gerpp(&acc6, rowA4, rowB7); + __builtin_mma_xvf32gerpp(&acc7, rowA4, rowB8); + } + for (l = (temp & (~3)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); + } + SAVE_ACC_COMPLEX_14 + CO += 2; + AO += temp << 1; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 4) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; // number of values in A +#endif + + B += k << 3; + } + + if (n & 2) { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + AO = A; + CO = C; + C += ldc << 2; + + for (i = 0; i < (m >> 3); i++) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<4)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<4)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<4)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<4)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB2); + __builtin_mma_xvf32gerpp(&acc1, rowA6, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA7, rowB2); + __builtin_mma_xvf32gerpp(&acc3, rowA8, rowB2); + } + for (l = (temp & (~1)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + } + SAVE_ACC_COMPLEX_22_4 + AO += temp << 4; + BO += temp << 2; + CO += 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 2) +#endif + } + if (m & 4) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<3)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<3)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<3)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<3)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB2); + __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB2); + __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB3); + __builtin_mma_xvf32gerpp(&acc1, rowA6, rowB3); + __builtin_mma_xvf32gerpp(&acc0, rowA7, rowB4); + __builtin_mma_xvf32gerpp(&acc1, rowA8, rowB4); + } + for (l = (temp & (~3)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + } + SAVE_ACC_COMPLEX_22_2 + AO += temp << 3; + BO += temp << 2; + CO += 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 2) +#endif + } + if (m & 2) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<2)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<2)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<2)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<2)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12]; + vec_t rowB5 = *(vec_t *) & BO[(l<<2)+16]; + vec_t rowB6 = *(vec_t *) & BO[(l<<2)+20]; + vec_t rowB7 = *(vec_t *) & BO[(l<<2)+24]; + vec_t rowB8 = *(vec_t *) & BO[(l<<2)+28]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc0, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc0, rowA4, rowB4); + __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB5); + __builtin_mma_xvf32gerpp(&acc0, rowA6, rowB6); + __builtin_mma_xvf32gerpp(&acc0, rowA7, rowB7); + __builtin_mma_xvf32gerpp(&acc0, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_22_1 + AO += temp << 2; + BO += temp << 2; + CO += 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 2) +#endif + } + if (m & 1) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2]; + vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4]; + vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6]; + vec_t rowA5 = *(vec_t *) & AO[(l<<1)+8]; + vec_t rowA6 = *(vec_t *) & AO[(l<<1)+10]; + vec_t rowA7 = *(vec_t *) & AO[(l<<1)+12]; + vec_t rowA8 = *(vec_t *) & AO[(l<<1)+14]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12]; + vec_t rowB5 = *(vec_t *) & BO[(l<<2)+16]; + vec_t rowB6 = *(vec_t *) & BO[(l<<2)+20]; + vec_t rowB7 = *(vec_t *) & BO[(l<<2)+24]; + vec_t rowB8 = *(vec_t *) & BO[(l<<2)+28]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_12 + AO += temp<<1; + BO += temp<<2; + CO += 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 2) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + B += k << 2; + } + + if (n & 1) { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + AO = A; + CO = C; + C += ldc << 1; + + for (i = 0; i < (m >> 3); i++) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<4)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<4)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<4)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<4)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB2); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB2); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB2); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB2); + } + for (l = (temp & (~1)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + } + SAVE_ACC_COMPLEX_21_4 + AO += temp << 4; + BO += temp << 1; + CO += 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 1) +#endif + } + if (m & 4) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<3)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<3)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<3)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<3)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB2); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB2); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB3); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB3); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB4); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB4); + } + for (l = (temp & (~3)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + } + SAVE_ACC_COMPLEX_21_2 + AO += temp << 3; + BO += temp << 1; + CO += 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 1) +#endif + } + if (m & 2) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<2)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<2)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<2)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<2)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_21_1 + AO += temp << 2; + BO += temp << 1; + CO += 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 1) +#endif + } + if (m & 1) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2]; + vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4]; + vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6]; + vec_t rowA5 = *(vec_t *) & AO[(l<<1)+8]; + vec_t rowA6 = *(vec_t *) & AO[(l<<1)+10]; + vec_t rowA7 = *(vec_t *) & AO[(l<<1)+12]; + vec_t rowA8 = *(vec_t *) & AO[(l<<1)+14]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_11 + AO += temp<<1; + BO += temp<<1; + CO += 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 1) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + B += k << 1; + } + return 0; +} diff --git a/kernel/power/zgemm_kernel_power10.c b/kernel/power/zgemm_kernel_power10.c new file mode 100644 index 000000000..370d12af3 --- /dev/null +++ b/kernel/power/zgemm_kernel_power10.c @@ -0,0 +1,736 @@ +/********************************************************************************* +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ +#include "common.h" +#include + +typedef __vector unsigned char vec_t; +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); + +#define SET_ACC_ZERO() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); \ + __builtin_mma_xxsetaccz (&acc4); \ + __builtin_mma_xxsetaccz (&acc5); \ + __builtin_mma_xxsetaccz (&acc6); \ + __builtin_mma_xxsetaccz (&acc7); + +#if (defined(NN) || defined(NT) || defined(TN) || defined(TT)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = _arbi + _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += _arbi + _aibr; } +#endif + +#if (defined(NR) || defined(NC) || defined(TR) || defined(TC)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr + _aibi; _imag = -_arbi + _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += -_arbi + _aibr; } +#endif + +#if (defined(RN) || defined(RT) || defined(CN) || defined(CT)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr + _aibi; _imag = _arbi - _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += _arbi - _aibr; } +#endif + +#if (defined(RR) || defined(RC) || defined(CR) || defined(CC)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = -_arbi - _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += -_arbi - _aibr; } +#endif + +#if defined(TRMMKERNEL) +#define A_OP = +#else +#define A_OP += +#endif + +#define BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + __builtin_mma_disassemble_acc ((void *)result, &acc0); \ + __builtin_mma_disassemble_acc ((void *)&result[4], &acc1); \ + __builtin_mma_disassemble_acc ((void *)&result[8], &acc2); \ + __builtin_mma_disassemble_acc ((void *)&result[12], &acc3); \ + __builtin_mma_disassemble_acc ((void *)&result[16], &acc4); \ + __builtin_mma_disassemble_acc ((void *)&result[20], &acc5); \ + __builtin_mma_disassemble_acc ((void *)&result[24], &acc6); \ + __builtin_mma_disassemble_acc ((void *)&result[28], &acc7); + +#define SAVE_ACC_COMPLEX_11 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10]) \ + COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ + COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42]) \ + COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ + COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; + +#define SAVE_ACC_COMPLEX_12 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MUL(tr[1], res[ 8], res[11], ti[1], res[ 9], res[10]) \ + COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ + COMP_MAC(tr[1], res[24], res[27], ti[1], res[25], res[26]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[1], res[40], res[43], ti[1], res[41], res[42]) \ + COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ + COMP_MAC(tr[1], res[56], res[59], ti[1], res[57], res[58]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_1 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ + COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10]) \ + COMP_MAC(tr[1], res[12], res[15], ti[1], res[13], res[14]) \ + COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ + COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22]) \ + COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26]) \ + COMP_MAC(tr[1], res[28], res[31], ti[1], res[29], res[30]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ + COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42]) \ + COMP_MAC(tr[1], res[44], res[47], ti[1], res[45], res[46]) \ + COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ + COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54]) \ + COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58]) \ + COMP_MAC(tr[1], res[60], res[63], ti[1], res[61], res[62]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_2 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ + COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10]) \ + COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ + COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ + COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22]) \ + COMP_MAC(tr[2], res[24], res[27], ti[2], res[25], res[26]) \ + COMP_MAC(tr[3], res[28], res[31], ti[3], res[29], res[30]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ + COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42]) \ + COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46]) \ + COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ + COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54]) \ + COMP_MAC(tr[2], res[56], res[59], ti[2], res[57], res[58]) \ + COMP_MAC(tr[3], res[60], res[63], ti[3], res[61], res[62]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_4 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ + COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10]) \ + COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ + COMP_MUL(tr[4], res[16], res[19], ti[4], res[17], res[18]) \ + COMP_MUL(tr[5], res[20], res[23], ti[5], res[21], res[22]) \ + COMP_MUL(tr[6], res[24], res[27], ti[6], res[25], res[26]) \ + COMP_MUL(tr[7], res[28], res[31], ti[7], res[29], res[30]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ + COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42]) \ + COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46]) \ + COMP_MAC(tr[4], res[48], res[51], ti[4], res[49], res[50]) \ + COMP_MAC(tr[5], res[52], res[55], ti[5], res[53], res[54]) \ + COMP_MAC(tr[6], res[56], res[59], ti[6], res[57], res[58]) \ + COMP_MAC(tr[7], res[60], res[63], ti[7], res[61], res[62]) \ + CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i; \ + CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i; \ + CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i; \ + CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i; \ + CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i; \ + CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i; \ + CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i; \ + CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i; + +#define SAVE_ACC_COMPLEX_22_1 \ + __builtin_mma_disassemble_acc ((void *)result, &acc0); \ + __builtin_mma_disassemble_acc ((void *)(&result[4]), &acc1); \ + COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2]) \ + COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6]) \ + COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10]) \ + COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14] ) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_22_2(ACC1, ACC2, CI) \ + __builtin_mma_disassemble_acc ((void *)result, ACC1); \ + __builtin_mma_disassemble_acc ((void *)(&result[4]), ACC2); \ + COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2]) \ + COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6]) \ + COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10]) \ + COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ + CO[CI+0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[CI+1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[CI+2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[CI+3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+CI+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+CI+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+CI+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+CI+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +#define REFRESH_TEMP_BK(x, y) \ + temp = k - off; +#elif defined(LEFT) +#define REFRESH_TEMP_BK(x, y) \ + temp = off + x; +#else +#define REFRESH_TEMP_BK(x, y) \ + temp = off + y; +#endif +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_POINTERS(x, y) \ + BO = B; \ + REFRESH_TEMP_BK(x, y) +#else +#define REFRESH_POINTERS(x, y) \ + AO += off * (2*x); \ + BO = B + off * (2*y); \ + REFRESH_TEMP_BK(x, y) +#endif + +#ifdef LEFT +#define REFRESH_OFF(x) \ + off += x; +#else +#define REFRESH_OFF(x) +#endif + +#ifdef LEFT +#define UPDATE_TEMP(x, y) \ + temp -= x; +#else +#define UPDATE_TEMP(x, y) \ + temp -= y; +#endif + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_TMP_AFTER_SAVE(x, y) \ + temp = k - off; \ + UPDATE_TEMP(x, y) \ + AO += temp * (2*x); \ + BO += temp * (2*y); +#else +#define REFRESH_TMP_AFTER_SAVE(x, y) +#endif + +#define REFRESH_AFTER_SAVE(x,y) \ + REFRESH_TMP_AFTER_SAVE(x, y) \ + REFRESH_OFF(x) +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ +int +#ifdef TRMMKERNEL +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, + FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc, BLASLONG offset) +#else +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, + FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG i1, i, l, temp; + FLOAT *AO, *BO, *CO; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + v4sf_t result[32]; + FLOAT *res, tr[16], ti[16]; + res = (FLOAT *) result; + + for (i1 = 0; i1 < (n >> 1); i1++) { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + AO = A; + CO = C; + C += ldc<<2; + for (i = 0; i < (m >> 3); i++) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < temp; ++l) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf64gerpp(&acc4, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc5, rowA2, rowB2); + __builtin_mma_xvf64gerpp(&acc6, rowA3, rowB2); + __builtin_mma_xvf64gerpp(&acc7, rowA4, rowB2); + } + __builtin_mma_disassemble_acc ((void *)result, &acc0); + __builtin_mma_disassemble_acc ((void *)(&result[ 4]), &acc1); + __builtin_mma_disassemble_acc ((void *)(&result[ 8]), &acc2); + __builtin_mma_disassemble_acc ((void *)(&result[12]), &acc3); + __builtin_mma_disassemble_acc ((void *)(&result[16]), &acc4); + __builtin_mma_disassemble_acc ((void *)(&result[20]), &acc5); + __builtin_mma_disassemble_acc ((void *)(&result[24]), &acc6); + __builtin_mma_disassemble_acc ((void *)(&result[28]), &acc7); + COMP_MUL(tr[ 0], res[ 0], res[ 3], ti[ 0], res[ 1], res[ 2]) + COMP_MUL(tr[ 1], res[ 4], res[ 7], ti[ 1], res[ 5], res[ 6]) + COMP_MUL(tr[ 2], res[ 8], res[11], ti[ 2], res[ 9], res[10]) + COMP_MUL(tr[ 3], res[12], res[15], ti[ 3], res[13], res[14]) + COMP_MUL(tr[ 4], res[16], res[19], ti[ 4], res[17], res[18]) + COMP_MUL(tr[ 5], res[20], res[23], ti[ 5], res[21], res[22]) + COMP_MUL(tr[ 6], res[24], res[27], ti[ 6], res[25], res[26]) + COMP_MUL(tr[ 7], res[28], res[31], ti[ 7], res[29], res[30]) + COMP_MUL(tr[ 8], res[32], res[35], ti[ 8], res[33], res[34]) + COMP_MUL(tr[ 9], res[36], res[39], ti[ 9], res[37], res[38]) + COMP_MUL(tr[10], res[40], res[43], ti[10], res[41], res[42]) + COMP_MUL(tr[11], res[44], res[47], ti[11], res[45], res[46]) + COMP_MUL(tr[12], res[48], res[51], ti[12], res[49], res[50]) + COMP_MUL(tr[13], res[52], res[55], ti[13], res[53], res[54]) + COMP_MUL(tr[14], res[56], res[59], ti[14], res[57], res[58]) + COMP_MUL(tr[15], res[60], res[63], ti[15], res[61], res[62]) + CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; + CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; + CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; + CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; + CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; + CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; + CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; + CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i; + CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i; + CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i; + CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i; + CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i; + CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i; + CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i; + CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i; + CO[2*ldc+ 0] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i; + CO[2*ldc+ 1] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i; + CO[2*ldc+ 2] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i; + CO[2*ldc+ 3] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i; + CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i; + CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i; + CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i; + CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i; + CO[2*ldc+ 8] A_OP tr[12] * alpha_r - ti[12] * alpha_i; + CO[2*ldc+ 9] A_OP ti[12] * alpha_r + tr[12] * alpha_i; + CO[2*ldc+10] A_OP tr[13] * alpha_r - ti[13] * alpha_i; + CO[2*ldc+11] A_OP ti[13] * alpha_r + tr[13] * alpha_i; + CO[2*ldc+12] A_OP tr[14] * alpha_r - ti[14] * alpha_i; + CO[2*ldc+13] A_OP ti[14] * alpha_r + tr[14] * alpha_i; + CO[2*ldc+14] A_OP tr[15] * alpha_r - ti[15] * alpha_i; + CO[2*ldc+15] A_OP ti[15] * alpha_r + tr[15] * alpha_i; + + AO += temp << 4; + BO += temp << 2; + CO += 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 2) +#endif + } + if (m & 4) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2); + __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB3); + __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB3); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB4); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4); + } + for (l = (temp & (~1)); l < temp; ++l) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2); + } + SAVE_ACC_COMPLEX_22_2(&acc0, &acc2, 0) + SAVE_ACC_COMPLEX_22_2(&acc1, &acc3, 4) + AO += temp << 3; + BO += temp << 2; + CO += 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 2) +#endif + } + if (m & 2) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4); + __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5); + __builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6); + __builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7); + __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8); + } + for (l = (temp & (~3)); l < temp; ++l) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); + } + SAVE_ACC_COMPLEX_22_1 + AO += temp << 2; + BO += temp << 2; + CO += 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 2) +#endif + } + if (m & 1) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4); + __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5); + __builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6); + __builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7); + __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8); + } + for (l = (temp & (~3)); l < temp; ++l) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); + } + SAVE_ACC_COMPLEX_12 + AO += temp << 1; + BO += temp << 2; + CO += 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 2) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + B += k << 2; + } + if (n & 1) { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + AO = A; + CO = C; + C += ldc<<1; + for (i = 0; i < (m >> 3); i++) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); + __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<4)+16])); + __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<4)+20])); + __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<4)+24])); + __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<4)+28])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf64gerpp(&acc0, rowA5, rowB2); + __builtin_mma_xvf64gerpp(&acc1, rowA6, rowB2); + __builtin_mma_xvf64gerpp(&acc2, rowA7, rowB2); + __builtin_mma_xvf64gerpp(&acc3, rowA8, rowB2); + } + for (l = (temp & (~1)); l < temp; ++l) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1); + } + SAVE_ACC_COMPLEX_21_4 + + AO += temp << 4; + BO += temp << 1; + CO += 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 1) +#endif + } + if (m & 4) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12])); + __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<3)+16])); + __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<3)+20])); + __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<3)+24])); + __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<3)+28])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB2); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB2); + __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB3); + __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB3); + __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB4); + __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB4); + } + for (l = (temp & (~3)); l < temp; ++l) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + } + SAVE_ACC_COMPLEX_21_2 + AO += temp << 3; + BO += temp << 1; + CO += 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 1) +#endif + } + if (m & 2) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12])); + __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<2)+16])); + __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<2)+20])); + __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<2)+24])); + __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<2)+28])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_21_1 + AO += temp << 2; + BO += temp << 1; + CO += 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 1) +#endif + } + if (m & 1) { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6])); + __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<1)+8])); + __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<1)+10])); + __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<1)+12])); + __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<1)+14])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_11 + AO += temp << 1; + BO += temp << 1; + CO += 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 1) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + B += k << 1; + } + return 0; +} diff --git a/kernel/zarch/KERNEL.Z13 b/kernel/zarch/KERNEL.Z13 index 3bcc32197..fe82d81e6 100644 --- a/kernel/zarch/KERNEL.Z13 +++ b/kernel/zarch/KERNEL.Z13 @@ -35,10 +35,10 @@ DASUMKERNEL = dasum.c CASUMKERNEL = ../arm/zasum.c ZASUMKERNEL = zasum.c -SSUMKERNEL = ../arm/asum.c -DSUMKERNEL = dasum.c -CSUMKERNEL = ../arm/zasum.c -ZSUMKERNEL = zasum.c +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = dsum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = zsum.c SAXPYKERNEL = ../arm/axpy.c DAXPYKERNEL = daxpy.c diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index ace20dffc..4ebd5348c 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -21,7 +21,16 @@ endif() if (BUILD_COMPLEX16) list (APPEND OpenBLAS_Tests zblat1 zblat2 zblat3) endif() -message (STATUS CCOMP ${CMAKE_C_COMPILER_ID} FCOMP ${CMAKE_Fortran_COMPILER_ID}) + +if (USE_GEMM3M) + if (BUILD_COMPLEX) + list (APPEND OpenBLAS_Tests cblat3_3m) + endif () + if (BUILD_COMPLEX16) + list (APPEND OpenBLAS_Tests zblat3_3m) + endif () +endif () + foreach(test_bin ${OpenBLAS_Tests}) add_executable(${test_bin} ${test_bin}.f) target_link_libraries(${test_bin} ${OpenBLAS_LIBNAME}) @@ -82,4 +91,10 @@ add_test(NAME "${float_type}blas2" COMMAND ${helper_prefix} $ "${PROJECT_SOURCE_DIR}/test/${float_type}blat2.dat" ${float_type_upper}BLAT2.SUMM) add_test(NAME "${float_type}blas3" COMMAND ${helper_prefix} $ "${PROJECT_SOURCE_DIR}/test/${float_type}blat3.dat" ${float_type_upper}BLAT3.SUMM) +if (USE_GEMM3M) +if ((${float_type} STREQUAL "c") OR (${float_type} STREQUAL "z")) +add_test(NAME "${float_type}blas3_3m" + COMMAND ${helper_prefix} $ "${PROJECT_SOURCE_DIR}/test/${float_type}blat3_3m.dat" ${float_type_upper}BLAT3_3M.SUMM) +endif() +endif() endforeach() diff --git a/test/Makefile b/test/Makefile index 5a4694ce6..6a50b6c98 100644 --- a/test/Makefile +++ b/test/Makefile @@ -4,6 +4,24 @@ ifeq ($(F_COMPILER),GFORTRAN) override FFLAGS += -fno-tree-vectorize endif +SUPPORT_GEMM3M = 0 + +ifeq ($(ARCH), x86) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), x86_64) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), ia64) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), MIPS) +SUPPORT_GEMM3M = 1 +endif + ifeq ($(NOFORTRAN),1) all :: else @@ -153,11 +171,20 @@ ifeq ($(BUILD_DOUBLE),1) D3=dblat3 endif ifeq ($(BUILD_COMPLEX),1) +ifeq ($(SUPPORT_GEMM3M),1) +C3=cblat3 cblat3_3m +else C3=cblat3 endif +endif ifeq ($(BUILD_COMPLEX16),1) +ifeq ($(SUPPORT_GEMM3M),1) +Z3=zblat3 zblat3_3m +else Z3=zblat3 endif +endif + level3: $(B3) $(S3) $(D3) $(C3) $(Z3) diff --git a/utest/test_extensions/test_cgemv_t.c b/utest/test_extensions/test_cgemv_t.c index aa3281e66..cb4e5ad9e 100644 --- a/utest/test_extensions/test_cgemv_t.c +++ b/utest/test_extensions/test_cgemv_t.c @@ -126,7 +126,7 @@ static float check_cgemv(char api, char order, char trans, blasint m, blasint n, srand_generate(data_cgemv_t.y_test, m * inc_y * 2); // Copy vector y for reference funcs - for (int i = 0; i < m * inc_y * 2; i++) { + for (i = 0; i < m * inc_y * 2; i++) { data_cgemv_t.y_verify[i] = data_cgemv_t.y_test[i]; } @@ -1129,4 +1129,4 @@ CTEST(cgemv, c_api_xerbla_invalid_order_col_major) int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); ASSERT_EQUAL(TRUE, passed); } -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/test_csbmv.c b/utest/test_extensions/test_csbmv.c index 8e8ce4530..41c24a2b7 100644 --- a/utest/test_extensions/test_csbmv.c +++ b/utest/test_extensions/test_csbmv.c @@ -188,7 +188,7 @@ static float check_csbmv(char uplo, blasint n, blasint k, float *alpha, blasint char trans = 'N'; // Symmetric band packed matrix for sbmv - float a[lda * n * 2]; + float *a = (float*) malloc(lda * n * 2 * sizeof(float)); // Fill symmetric packed matrix sp_matrix, vector b_test, vector c_test srand_generate(data_csbmv.sp_matrix, n * (n + 1)); @@ -216,7 +216,7 @@ static float check_csbmv(char uplo, blasint n, blasint k, float *alpha, blasint // Find the differences between output vector caculated by csbmv and cgemv for (i = 0; i < n * inc_c * 2; i++) data_csbmv.c_test[i] -= data_csbmv.c_verify[i]; - + free(a); // Find the norm of differences return BLASFUNC(scnrm2)(&n, data_csbmv.c_test, &inc_c); } @@ -603,4 +603,4 @@ CTEST(csbmv, xerbla_lda_invalid) int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info); ASSERT_EQUAL(TRUE, passed); } -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/test_idamin.c b/utest/test_extensions/test_idamin.c index 6a7ed9d1e..bebe76dba 100644 --- a/utest/test_extensions/test_idamin.c +++ b/utest/test_extensions/test_idamin.c @@ -402,13 +402,14 @@ CTEST(idamin, min_idx_in_vec_tail){ CTEST(idamin, min_idx_in_vec_tail_inc_1){ blasint i; blasint N = ELEMENTS, inc = 1; - double x[ELEMENTS * inc]; + double *x = (double*)malloc(ELEMENTS * inc * sizeof(double)); for (i = 0; i < N * inc; i ++) { x[i] = i + 1000; } x[(N - 1) * inc] = 0.0f; blasint index = BLASFUNC(idamin)(&N, x, &inc); + free(x); ASSERT_EQUAL(N, index); } @@ -775,13 +776,14 @@ CTEST(idamin, c_api_min_idx_in_vec_tail){ CTEST(idamin, c_api_min_idx_in_vec_tail_inc_1){ blasint i; blasint N = ELEMENTS, inc = 1; - double x[ELEMENTS * inc]; + double *x = (double*) malloc(ELEMENTS * inc * sizeof(double)); for (i = 0; i < N * inc; i ++) { x[i] = i + 1000; } x[(N - 1) * inc] = 0.0; blasint index = cblas_idamin(N, x, inc); + free(x); ASSERT_EQUAL(N - 1, index); } -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/test_isamin.c b/utest/test_extensions/test_isamin.c index 4ff235b83..d93813e6f 100644 --- a/utest/test_extensions/test_isamin.c +++ b/utest/test_extensions/test_isamin.c @@ -402,13 +402,14 @@ CTEST(isamin, min_idx_in_vec_tail){ CTEST(isamin, min_idx_in_vec_tail_inc_1){ blasint i; blasint N = ELEMENTS, inc = 1; - float x[ELEMENTS * inc]; + float *x = (float*) malloc(ELEMENTS * inc * sizeof(float)); for (i = 0; i < N * inc; i ++) { x[i] = i + 1000; } x[(N - 1) * inc] = 0.0f; blasint index = BLASFUNC(isamin)(&N, x, &inc); + free(x); ASSERT_EQUAL(N, index); } @@ -775,13 +776,14 @@ CTEST(isamin, c_api_min_idx_in_vec_tail){ CTEST(isamin, c_api_min_idx_in_vec_tail_inc_1){ blasint i; blasint N = ELEMENTS, inc = 1; - float x[ELEMENTS * inc]; + float *x = (float*)malloc(ELEMENTS * inc * sizeof(float)); for (i = 0; i < N * inc; i ++) { x[i] = i + 1000; } x[(N - 1) * inc] = 0.0f; blasint index = cblas_isamin(N, x, inc); + free(x); ASSERT_EQUAL(N - 1, index); } -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/test_zgemv_t.c b/utest/test_extensions/test_zgemv_t.c index 2e0ee65f0..b2d0b2713 100644 --- a/utest/test_extensions/test_zgemv_t.c +++ b/utest/test_extensions/test_zgemv_t.c @@ -126,7 +126,7 @@ static double check_zgemv(char api, char order, char trans, blasint m, blasint n drand_generate(data_zgemv_t.y_test, m * inc_y * 2); // Copy vector y for reference funcs - for (int i = 0; i < m * inc_y * 2; i++) + for (i = 0; i < m * inc_y * 2; i++) { data_zgemv_t.y_verify[i] = data_zgemv_t.y_test[i]; } @@ -1133,4 +1133,4 @@ CTEST(zgemv, c_api_xerbla_invalid_order_col_major) int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); ASSERT_EQUAL(TRUE, passed); } -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/test_zsbmv.c b/utest/test_extensions/test_zsbmv.c index afdb208c1..0e79dc0d8 100644 --- a/utest/test_extensions/test_zsbmv.c +++ b/utest/test_extensions/test_zsbmv.c @@ -188,7 +188,7 @@ static double check_zsbmv(char uplo, blasint n, blasint k, double *alpha, blasin char trans = 'N'; // Symmetric band packed matrix for sbmv - double a[lda * n * 2]; + double *a = (double*) malloc(lda * n * 2 * sizeof(double)); // Fill symmetric packed matrix sp_matrix, vector b_test, vector c_test drand_generate(data_zsbmv.sp_matrix, n * (n + 1)); @@ -213,6 +213,7 @@ static double check_zsbmv(char uplo, blasint n, blasint k, double *alpha, blasin BLASFUNC(zsbmv)(&uplo, &n, &k, alpha, a, &lda, data_zsbmv.b_test, &inc_b, beta, data_zsbmv.c_test, &inc_c); + free(a); // Find the differences between output vector caculated by zsbmv and zgemv for (i = 0; i < n * inc_c * 2; i++) data_zsbmv.c_test[i] -= data_zsbmv.c_verify[i]; @@ -603,4 +604,4 @@ CTEST(zsbmv, xerbla_lda_invalid) int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info); ASSERT_EQUAL(TRUE, passed); } -#endif \ No newline at end of file +#endif