From e7fd8d21a6f88b098a25ab76d3360efa1d38f830 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 26 Oct 2022 15:33:58 +0200 Subject: [PATCH] Add GEMMT based on looped GEMV --- interface/CMakeLists.txt | 2 +- interface/Makefile | 57 +++- interface/gemmt.c | 589 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 637 insertions(+), 11 deletions(-) create mode 100644 interface/gemmt.c diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 0b2998237..654684b71 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -53,7 +53,7 @@ set(BLAS2_COMPLEX_ONLY_MANGLED_SOURCES # these do not have separate 'z' sources set(BLAS3_SOURCES gemm.c symm.c - trsm.c syrk.c syr2k.c + trsm.c syrk.c syr2k.c gemmt.c ) set(BLAS3_MANGLED_SOURCES diff --git a/interface/Makefile b/interface/Makefile index abdac96e1..a1f4f66da 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -44,12 +44,12 @@ SBLAS3OBJS = \ sgemm.$(SUFFIX) ssymm.$(SUFFIX) strmm.$(SUFFIX) \ strsm.$(SUFFIX) ssyrk.$(SUFFIX) ssyr2k.$(SUFFIX) \ somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\ - sgeadd.$(SUFFIX) + sgeadd.$(SUFFIX) sgemmt.$(SUFFIX) ifeq ($(BUILD_BFLOAT16),1) SBBLAS1OBJS = sbdot.$(SUFFIX) SBBLAS2OBJS = sbgemv.$(SUFFIX) -SBBLAS3OBJS = sbgemm.$(SUFFIX) +SBBLAS3OBJS = sbgemm.$(SUFFIX) sbgemmt.$(SUFFIX) SBEXTOBJS = sbstobf16.$(SUFFIX) sbdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX) endif @@ -76,7 +76,7 @@ DBLAS3OBJS = \ dgemm.$(SUFFIX) dsymm.$(SUFFIX) dtrmm.$(SUFFIX) \ dtrsm.$(SUFFIX) dsyrk.$(SUFFIX) dsyr2k.$(SUFFIX) \ domatcopy.$(SUFFIX) dimatcopy.$(SUFFIX)\ - dgeadd.$(SUFFIX) + dgeadd.$(SUFFIX) dgemmt.$(SUFFIX) CBLAS1OBJS = \ caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \ @@ -105,7 +105,7 @@ CBLAS3OBJS = \ ctrsm.$(SUFFIX) csyrk.$(SUFFIX) csyr2k.$(SUFFIX) \ chemm.$(SUFFIX) cherk.$(SUFFIX) cher2k.$(SUFFIX) \ comatcopy.$(SUFFIX) cimatcopy.$(SUFFIX)\ - cgeadd.$(SUFFIX) + cgeadd.$(SUFFIX) cgemmt.$(SUFFIX) ZBLAS1OBJS = \ zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \ @@ -134,7 +134,7 @@ ZBLAS3OBJS = \ ztrsm.$(SUFFIX) zsyrk.$(SUFFIX) zsyr2k.$(SUFFIX) \ zhemm.$(SUFFIX) zherk.$(SUFFIX) zher2k.$(SUFFIX) \ zomatcopy.$(SUFFIX) zimatcopy.$(SUFFIX)\ - zgeadd.$(SUFFIX) + zgeadd.$(SUFFIX) zgemmt.$(SUFFIX) ifeq ($(SUPPORT_GEMM3M), 1) @@ -281,12 +281,12 @@ CSBLAS2OBJS = \ CSBLAS3OBJS = \ cblas_sgemm.$(SUFFIX) cblas_ssymm.$(SUFFIX) cblas_strmm.$(SUFFIX) cblas_strsm.$(SUFFIX) \ cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\ - cblas_sgeadd.$(SUFFIX) + cblas_sgeadd.$(SUFFIX) cblas_sgemmt.$(SUFFIX) ifeq ($(BUILD_BFLOAT16),1) CSBBLAS1OBJS = cblas_sbdot.$(SUFFIX) CSBBLAS2OBJS = cblas_sbgemv.$(SUFFIX) -CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) +CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) cblas_sbgemmt.$(SUFFIX) CSBEXTOBJS = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX) endif @@ -306,7 +306,7 @@ CDBLAS2OBJS = \ CDBLAS3OBJS += \ cblas_dgemm.$(SUFFIX) cblas_dsymm.$(SUFFIX) cblas_dtrmm.$(SUFFIX) cblas_dtrsm.$(SUFFIX) \ cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) cblas_domatcopy.$(SUFFIX) cblas_dimatcopy.$(SUFFIX) \ - cblas_dgeadd.$(SUFFIX) + cblas_dgeadd.$(SUFFIX) cblas_dgemmt.$(SUFFIX) CCBLAS1OBJS = \ cblas_icamax.$(SUFFIX) cblas_icamin.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ @@ -331,7 +331,7 @@ CCBLAS3OBJS = \ cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \ cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \ cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)\ - cblas_cgeadd.$(SUFFIX) + cblas_cgeadd.$(SUFFIX) cblas_cgemmt.$(SUFFIX) CXERBLAOBJ = \ cblas_xerbla.$(SUFFIX) @@ -362,7 +362,7 @@ CZBLAS3OBJS = \ cblas_zsyrk.$(SUFFIX) cblas_zsyr2k.$(SUFFIX) \ cblas_zhemm.$(SUFFIX) cblas_zherk.$(SUFFIX) cblas_zher2k.$(SUFFIX)\ cblas_zomatcopy.$(SUFFIX) cblas_zimatcopy.$(SUFFIX) \ - cblas_zgeadd.$(SUFFIX) + cblas_zgeadd.$(SUFFIX) cblas_zgemmt.$(SUFFIX) ifeq ($(SUPPORT_GEMM3M), 1) @@ -1300,6 +1300,8 @@ xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c ifeq ($(BUILD_BFLOAT16),1) sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) +sbgemmt.$(SUFFIX) sbgemm.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) endif sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h @@ -1320,6 +1322,24 @@ zgemm.$(SUFFIX) zgemm.$(PSUFFIX) : gemm.c ../param.h xgemm.$(SUFFIX) xgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) +sgemmt.$(SUFFIX) sgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +dgemmt.$(SUFFIX) dgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +qgemmt.$(SUFFIX) qgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +cgemmt.$(SUFFIX) cgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +zgemmt.$(SUFFIX) zgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +xgemmt.$(SUFFIX) xgemm.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + ssymm.$(SUFFIX) ssymm.$(PSUFFIX) : symm.c $(CC) -c $(CFLAGS) $< -o $(@F) @@ -1907,6 +1927,23 @@ cblas_cgemm.$(SUFFIX) cblas_cgemm.$(PSUFFIX) : gemm.c ../param.h cblas_zgemm.$(SUFFIX) cblas_zgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) +cblas_sgemmt.$(SUFFIX) cblas_sgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +ifeq ($(BUILD_BFLOAT16),1) +cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) +endif + +cblas_dgemmt.$(SUFFIX) cblas_dgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_cgemmt.$(SUFFIX) cblas_cgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zgemmt.$(SUFFIX) cblas_zgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + cblas_ssymm.$(SUFFIX) cblas_ssymm.$(PSUFFIX) : symm.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) diff --git a/interface/gemmt.c b/interface/gemmt.c new file mode 100644 index 000000000..3eed1dfe4 --- /dev/null +++ b/interface/gemmt.c @@ -0,0 +1,589 @@ +/*********************************************************************/ +/* Copyright 2022, The OpenBLAS Project. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef COMPLEX +#define SMP_THRESHOLD_MIN 65536.0 +#ifdef XDOUBLE +#define ERROR_NAME "QGEMT " +#elif defined(DOUBLE) +#define ERROR_NAME "DGEMT " +#elif defined(BFLOAT16) +#define ERROR_NAME "SBGEMT " +#else +#define ERROR_NAME "SGEMT " +#endif +#else +#define SMP_THRESHOLD_MIN 8192.0 +#ifdef XDOUBLE +#define ERROR_NAME "XGEMT " +#elif defined(DOUBLE) +#define ERROR_NAME "ZGEMT " +#else +#define ERROR_NAME "CGEMT " +#endif +#endif + +#ifndef GEMM_MULTITHREAD_THRESHOLD +#define GEMM_MULTITHREAD_THRESHOLD 4 +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANSA, char *TRANSB, + blasint * M, blasint * N, blasint * K, + FLOAT * Alpha, + IFLOAT * a, blasint * ldA, + IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC) +{ + + blasint m, n, k; + blasint lda, ldb, ldc; + int transa, transb, uplo; + blasint info; + + char transA, transB, Uplo; + IFLOAT *buffer; + IFLOAT *aa, *bb; + FLOAT *cc; +#if defined(COMPLEX) + FLOAT alpha_r, alpha_i, beta_r, beta_i; +#else + FLOAT alpha, beta; +#endif + + PRINT_DEBUG_NAME; + + m = *M; + n = *N; + k = *K; + +#if defined(COMPLEX) + FLOAT *alpha = Alpha; + alpha_r = *(Alpha + 0); + alpha_i = *(Alpha + 1); + + beta_r = *(Beta + 0); + beta_i = *(Beta + 1); +#else + alpha = *Alpha; + beta = *Beta; +#endif + + lda = *ldA; + ldb = *ldB; + ldc = *ldC; + + transA = *TRANSA; + transB = *TRANSB; + Uplo = *UPLO; + TOUPPER(transA); + TOUPPER(transB); + TOUPPER(Uplo); + + transa = -1; + transb = -1; + uplo = -1; + + if (transA == 'N') + transa = 0; + if (transA == 'T') + transa = 1; +#ifndef COMPLEX + if (transA == 'R') + transa = 0; + if (transA == 'C') + transa = 1; +#else + if (transA == 'R') + transa = 2; + if (transA == 'C') + transa = 3; +#endif + + if (transB == 'N') + transb = 0; + if (transB == 'T') + transb = 1; +#ifndef COMPLEX + if (transB == 'R') + transb = 0; + if (transB == 'C') + transb = 1; +#else + if (transB == 'R') + transb = 2; + if (transB == 'C') + transb = 3; +#endif + + if (Uplo == 'U') + uplo = 0; + if (Uplo == 'L') + uplo = 1; + + info = 0; + + if (uplo < 0) + info = 14; + if (ldc < m) + info = 13; + if (k < 0) + info = 5; + if (n < 0) + info = 4; + if (m < 0) + info = 3; + if (transb < 0) + info = 2; + if (transa < 0) + info = 1; + + if (info) { + BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, + blasint N, blasint k, +#ifndef COMPLEX + FLOAT alpha, + IFLOAT * A, blasint LDA, + IFLOAT * B, blasint LDB, FLOAT beta, FLOAT * c, blasint ldc) +{ +#else + void *valpha, + void *va, blasint LDA, + void *vb, blasint LDB, void *vbeta, void *vc, blasint ldc) +{ + FLOAT *alpha = (FLOAT *) valpha; + FLOAT *beta = (FLOAT *) vbeta; + FLOAT *A = (FLOAT *) va; + FLOAT *B = (FLOAT *) vb; + FLOAT *c = (FLOAT *) vc; +#endif + FLOAT *aa, *bb, *cc; + + int transa, transb, uplo; + blasint info; + blasint m, n, lda, ldb; + FLOAT *a, *b; + XFLOAT *buffer; + + PRINT_DEBUG_CNAME; + + transa = -1; + transb = -1; + info = 0; + + if (order == CblasColMajor) { + + if (TransA == CblasNoTrans) + transa = 0; + if (TransA == CblasTrans) + transa = 1; +#ifndef COMPLEX + if (TransA == CblasConjNoTrans) + transa = 0; + if (TransA == CblasConjTrans) + transa = 1; +#else + if (TransA == CblasConjNoTrans) + transa = 2; + if (TransA == CblasConjTrans) + transa = 3; +#endif + if (TransB == CblasNoTrans) + transb = 0; + if (TransB == CblasTrans) + transb = 1; +#ifndef COMPLEX + if (TransB == CblasConjNoTrans) + transb = 0; + if (TransB == CblasConjTrans) + transb = 1; +#else + if (TransB == CblasConjNoTrans) + transb = 2; + if (TransB == CblasConjTrans) + transb = 3; +#endif + + m = M; + n = N; + + a = (void *)A; + b = (void *)B; + lda = LDA; + ldb = LDB; + + info = -1; + + if (ldc < m) + info = 13; + if (k < 0) + info = 5; + if (n < 0) + info = 4; + if (m < 0) + info = 3; + if (transb < 0) + info = 2; + if (transa < 0) + info = 1; + } + + if (order == CblasRowMajor) { + m = N; + n = M; + + a = (void *)B; + b = (void *)A; + + lda = LDB; + ldb = LDA; + + if (TransB == CblasNoTrans) + transa = 0; + if (TransB == CblasTrans) + transa = 1; +#ifndef COMPLEX + if (TransB == CblasConjNoTrans) + transa = 0; + if (TransB == CblasConjTrans) + transa = 1; +#else + if (TransB == CblasConjNoTrans) + transa = 2; + if (TransB == CblasConjTrans) + transa = 3; +#endif + if (TransA == CblasNoTrans) + transb = 0; + if (TransA == CblasTrans) + transb = 1; +#ifndef COMPLEX + if (TransA == CblasConjNoTrans) + transb = 0; + if (TransA == CblasConjTrans) + transb = 1; +#else + if (TransA == CblasConjNoTrans) + transb = 2; + if (TransA == CblasConjTrans) + transb = 3; +#endif + + info = -1; + + if (ldc < m) + info = 13; + if (k < 0) + info = 5; + if (n < 0) + info = 4; + if (m < 0) + info = 3; + if (transb < 0) + info = 2; + if (transa < 0) + info = 1; + + } + + uplo = -1; + if (Uplo == CblasUpper) + uplo = 0; + if (Uplo == CblasLower) + uplo = 1; + if (uplo < 0) + info = 14; + + if (info >= 0) { + BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } +#if defined(COMPLEX) + FLOAT alpha_r = *(alpha + 0); + FLOAT alpha_i = *(alpha + 1); + + FLOAT beta_r = *(beta + 0); + FLOAT beta_i = *(beta + 1); +#endif + +#endif + int buffer_size; + blasint l; + blasint i, j; + +#ifdef SMP + int nthreads; +#endif + +#if defined(COMPLEX) + +#ifdef SMP + static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT *, FLOAT *, + BLASLONG, FLOAT *, BLASLONG, FLOAT *, + BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + xgemv_thread_n, xgemv_thread_t, xgemv_thread_r, xgemv_thread_c, + xgemv_thread_o, xgemv_thread_u, xgemv_thread_s, + xgemv_thread_d, +#elif defined DOUBLE + zgemv_thread_n, zgemv_thread_t, zgemv_thread_r, zgemv_thread_c, + zgemv_thread_o, zgemv_thread_u, zgemv_thread_s, + zgemv_thread_d, +#else + cgemv_thread_n, cgemv_thread_t, cgemv_thread_r, cgemv_thread_c, + cgemv_thread_o, cgemv_thread_u, cgemv_thread_s, + cgemv_thread_d, +#endif + }; +#endif + + int (*gemv[]) (BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, + BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, + FLOAT *) = { + GEMV_N, GEMV_T, GEMV_R, GEMV_C, GEMV_O, GEMV_U, GEMV_S, GEMV_D,}; + +#else + +#ifdef SMP + static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT, FLOAT *, + BLASLONG, FLOAT *, BLASLONG, FLOAT *, + BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + qgemv_thread_n, qgemv_thread_t, +#elif defined DOUBLE + dgemv_thread_n, dgemv_thread_t, +#else + sgemv_thread_n, sgemv_thread_t, +#endif + }; +#endif + int (*gemv[]) (BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, + FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { + GEMV_N, GEMV_T,}; + +#endif + + if ((m == 0) || (n == 0)) + return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + const blasint incb = (transb == 0) ? 1 : ldb; + + if (uplo == 1) { + for (i = 0; i < n; i++) { + j = n - i; + + l = j; +#if defined(COMPLEX) + aa = a + i * 2; + bb = b + i * ldb * 2; + if (transa) { + l = k; + aa = a + lda * i * 2; + bb = b + i * 2; + } + cc = c + i * 2 * ldc + i * 2; +#else + aa = a + i; + bb = b + i * ldb; + if (transa) { + l = k; + aa = a + lda * i; + bb = b + i; + } + cc = c + i * ldc + i; +#endif + +#if defined(COMPLEX) + if (beta_r != ONE || beta_i != ZERO) + SCAL_K(l, 0, 0, beta_r, beta_i, cc, 1, NULL, 0, + NULL, 0); + + if (alpha_r == ZERO && alpha_i == ZERO) + return; +#else + if (beta != ONE) + SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); + + if (alpha == ZERO) + continue; +#endif + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + buffer_size = j + k + 128 / sizeof(FLOAT); +#ifdef WINDOWS_ABI + buffer_size += 160 / sizeof(FLOAT); +#endif + // for alignment + buffer_size = (buffer_size + 3) & ~3; + STACK_ALLOC(buffer_size, FLOAT, buffer); + +#ifdef SMP + + if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD) + nthreads = 1; + else + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + +#if defined(COMPLEX) + (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, + aa, lda, bb, incb, cc, 1, + buffer); +#else + (gemv[(int)transa]) (j, k, 0, alpha, aa, lda, + bb, incb, cc, 1, buffer); +#endif +#ifdef SMP + } else { + + (gemv_thread[(int)transa]) (j, k, alpha, aa, + lda, bb, incb, cc, + 1, buffer, + nthreads); + + } +#endif + + STACK_FREE(buffer); + } + } else { + + for (i = 0; i < n; i++) { + j = i + 1; + + l = j; +#if defined COMPLEX + bb = b + i * ldb * 2; + if (transa) { + l = k; + bb = b + i * 2; + } + cc = c + i * 2 * ldc; +#else + bb = b + i * ldb; + if (transa) { + l = k; + bb = b + i; + } + cc = c + i * ldc; +#endif + +#if defined(COMPLEX) + if (beta_r != ONE || beta_i != ZERO) + SCAL_K(l, 0, 0, beta_r, beta_i, cc, 1, NULL, 0, + NULL, 0); + + if (alpha_r == ZERO && alpha_i == ZERO) + return; +#else + if (beta != ONE) + SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); + + if (alpha == ZERO) + continue; +#endif + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + buffer_size = j + k + 128 / sizeof(FLOAT); +#ifdef WINDOWS_ABI + buffer_size += 160 / sizeof(FLOAT); +#endif + // for alignment + buffer_size = (buffer_size + 3) & ~3; + STACK_ALLOC(buffer_size, FLOAT, buffer); + +#ifdef SMP + + if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD) + nthreads = 1; + else + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + +#if defined(COMPLEX) + (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, + a, lda, bb, incb, cc, 1, + buffer); +#else + (gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb, + incb, cc, 1, buffer); +#endif + +#ifdef SMP + } else { + + (gemv_thread[(int)transa]) (j, k, alpha, a, lda, + bb, incb, cc, 1, + buffer, nthreads); + + } +#endif + + STACK_FREE(buffer); + } + } + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, + args.m * args.k + args.k * args.n + + args.m * args.n, 2 * args.m * args.n * args.k); + + IDEBUG_END; + + return; +}