621 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			C
		
	
	
	
			
		
		
	
	
			621 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			C
		
	
	
	
| /*********************************************************************/
 | |
| /* Copyright 2022, The OpenBLAS Project.                             */
 | |
| /* All rights reserved.                                              */
 | |
| /*                                                                   */
 | |
| /* Redistribution and use in source and binary forms, with or        */
 | |
| /* without modification, are permitted provided that the following   */
 | |
| /* conditions are met:                                               */
 | |
| /*                                                                   */
 | |
| /*   1. Redistributions of source code must retain the above         */
 | |
| /*      copyright notice, this list of conditions and the following  */
 | |
| /*      disclaimer.                                                  */
 | |
| /*                                                                   */
 | |
| /*   2. Redistributions in binary form must reproduce the above      */
 | |
| /*      copyright notice, this list of conditions and the following  */
 | |
| /*      disclaimer in the documentation and/or other materials       */
 | |
| /*      provided with the distribution.                              */
 | |
| /*                                                                   */
 | |
| /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
 | |
| /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
 | |
| /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
 | |
| /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
 | |
| /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
 | |
| /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
 | |
| /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
 | |
| /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
 | |
| /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
 | |
| /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
 | |
| /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
 | |
| /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
 | |
| /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
 | |
| /*    POSSIBILITY OF SUCH DAMAGE.                                    */
 | |
| /*                                                                   */
 | |
| /*********************************************************************/
 | |
| 
 | |
| #include <stdio.h>
 | |
| #include <stdlib.h>
 | |
| #include "common.h"
 | |
| 
 | |
/*
 * Build-time configuration selected by the precision macros.
 *
 *   SMP_THRESHOLD_MIN           8192.0 for COMPLEX builds, 65536.0 otherwise.
 *   ERROR_NAME                  routine name string handed to xerbla; chosen
 *                               from COMPLEX / XDOUBLE / DOUBLE / BFLOAT16.
 *   GEMM_MULTITHREAD_THRESHOLD  defaults to 4 unless the build already
 *                               provides a value.
 */
#if defined(COMPLEX)

#define SMP_THRESHOLD_MIN 8192.0

#if defined(XDOUBLE)
#define ERROR_NAME "XGEMMT "
#elif defined(DOUBLE)
#define ERROR_NAME "ZGEMMT "
#else
#define ERROR_NAME "CGEMMT "
#endif

#else				/* !COMPLEX */

#define SMP_THRESHOLD_MIN 65536.0

#if defined(XDOUBLE)
#define ERROR_NAME "QGEMMT "
#elif defined(DOUBLE)
#define ERROR_NAME "DGEMMT "
#elif defined(BFLOAT16)
#define ERROR_NAME "SBGEMMT "
#else
#define ERROR_NAME "SGEMMT "
#endif

#endif				/* COMPLEX */

#if !defined(GEMM_MULTITHREAD_THRESHOLD)
#define GEMM_MULTITHREAD_THRESHOLD 4
#endif
 | |
| 
 | |
| #ifndef CBLAS
 | |
| 
 | |
| void NAME(char *UPLO, char *TRANSA, char *TRANSB,
 | |
| 	  blasint * M, blasint * K,
 | |
| 	  FLOAT * Alpha,
 | |
| 	  IFLOAT * a, blasint * ldA,
 | |
| 	  IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC)
 | |
| {
 | |
| 
 | |
| 	blasint m, k;
 | |
| 	blasint lda, ldb, ldc;
 | |
| 	int transa, transb, uplo;
 | |
| 	blasint info;
 | |
| 
 | |
| 	char transA, transB, Uplo;
 | |
| 	blasint nrowa, nrowb;
 | |
| 	IFLOAT *buffer;
 | |
| 	IFLOAT *aa, *bb;
 | |
| 	FLOAT *cc;
 | |
| #if defined(COMPLEX)
 | |
| 	FLOAT alpha_r, alpha_i, beta_r, beta_i;
 | |
| #else
 | |
| 	FLOAT alpha, beta;
 | |
| #endif
 | |
| 
 | |
| 	PRINT_DEBUG_NAME;
 | |
| 
 | |
| 	m = *M;
 | |
| 	k = *K;
 | |
| 
 | |
| #if defined(COMPLEX)
 | |
| 	FLOAT *alpha = Alpha;
 | |
| 	alpha_r = *(Alpha + 0);
 | |
| 	alpha_i = *(Alpha + 1);
 | |
| 
 | |
| 	beta_r = *(Beta + 0);
 | |
| 	beta_i = *(Beta + 1);
 | |
| #else
 | |
| 	alpha = *Alpha;
 | |
| 	beta = *Beta;
 | |
| #endif
 | |
| 
 | |
| 	lda = *ldA;
 | |
| 	ldb = *ldB;
 | |
| 	ldc = *ldC;
 | |
| 
 | |
| 	transA = *TRANSA;
 | |
| 	transB = *TRANSB;
 | |
| 	Uplo = *UPLO;
 | |
| 	TOUPPER(transA);
 | |
| 	TOUPPER(transB);
 | |
| 	TOUPPER(Uplo);
 | |
| 
 | |
| 	transa = -1;
 | |
| 	transb = -1;
 | |
| 	uplo = -1;
 | |
| 
 | |
| 	if (transA == 'N')
 | |
| 		transa = 0;
 | |
| 	if (transA == 'T')
 | |
| 		transa = 1;
 | |
| #ifndef COMPLEX
 | |
| 	if (transA == 'R')
 | |
| 		transa = 0;
 | |
| 	if (transA == 'C')
 | |
| 		transa = 1;
 | |
| #else
 | |
| 	if (transA == 'R')
 | |
| 		transa = 2;
 | |
| 	if (transA == 'C')
 | |
| 		transa = 3;
 | |
| #endif
 | |
| 
 | |
| 	if (transB == 'N')
 | |
| 		transb = 0;
 | |
| 	if (transB == 'T')
 | |
| 		transb = 1;
 | |
| #ifndef COMPLEX
 | |
| 	if (transB == 'R')
 | |
| 		transb = 0;
 | |
| 	if (transB == 'C')
 | |
| 		transb = 1;
 | |
| #else
 | |
| 	if (transB == 'R')
 | |
| 		transb = 2;
 | |
| 	if (transB == 'C')
 | |
| 		transb = 3;
 | |
| #endif
 | |
| 
 | |
| 	if (Uplo == 'U')
 | |
| 		uplo = 0;
 | |
| 	if (Uplo == 'L')
 | |
| 		uplo = 1;
 | |
| 
 | |
| 	nrowa = m;
 | |
| 	if (transa) nrowa = k;
 | |
| 	nrowb = k;
 | |
| 	if (transb) nrowb = m;
 | |
| 
 | |
| 	info = 0;
 | |
| 
 | |
| 	if (ldc < MAX(1, m))
 | |
| 		info = 13;
 | |
| 	if (ldb < MAX(1, nrowa))
 | |
| 		info = 10;
 | |
| 	if (lda < MAX(1, nrowb))
 | |
| 		info = 8;
 | |
| 	if (k < 0)
 | |
| 		info = 5;
 | |
| 	if (m < 0)
 | |
| 		info = 4;
 | |
| 	if (transb < 0)
 | |
| 		info = 3;
 | |
| 	if (transa < 0)
 | |
| 		info = 2;
 | |
| 	if (uplo < 0)
 | |
| 		info = 1;
 | |
| 
 | |
| 	if (info != 0) {
 | |
| 		BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
 | |
| 		return;
 | |
| 	}
 | |
| #else
 | |
| 
 | |
| void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 | |
| 	   enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint m,
 | |
| 	   blasint k,
 | |
| #ifndef COMPLEX
 | |
| 	   FLOAT alpha,
 | |
| 	   IFLOAT * A, blasint LDA,
 | |
| 	   IFLOAT * B, blasint LDB, FLOAT beta, FLOAT * c, blasint ldc)
 | |
| {
 | |
| #else
 | |
| 	   void *valpha,
 | |
| 	   void *va, blasint LDA,
 | |
| 	   void *vb, blasint LDB, void *vbeta, void *vc, blasint ldc)
 | |
| {
 | |
| 	FLOAT *alpha = (FLOAT *) valpha;
 | |
| 	FLOAT *beta = (FLOAT *) vbeta;
 | |
| 	FLOAT *A = (FLOAT *) va;
 | |
| 	FLOAT *B = (FLOAT *) vb;
 | |
| 	FLOAT *c = (FLOAT *) vc;
 | |
| #endif
 | |
| 	FLOAT *aa, *bb, *cc;
 | |
| 
 | |
| 	int transa, transb, uplo;
 | |
| 	blasint info;
 | |
| 	blasint lda, ldb;
 | |
| 	FLOAT *a, *b;
 | |
| 	XFLOAT *buffer;
 | |
| 
 | |
| 	PRINT_DEBUG_CNAME;
 | |
| 
 | |
| 	uplo = -1;
 | |
| 	transa = -1;
 | |
| 	transb = -1;
 | |
| 	info = 0;
 | |
| 
 | |
| 	if (order == CblasColMajor) {
 | |
| 		if (Uplo == CblasUpper) uplo = 0;
 | |
| 		if (Uplo == CblasLower) uplo = 1;
 | |
| 
 | |
| 		if (TransA == CblasNoTrans)
 | |
| 			transa = 0;
 | |
| 		if (TransA == CblasTrans)
 | |
| 			transa = 1;
 | |
| #ifndef COMPLEX
 | |
| 		if (TransA == CblasConjNoTrans)
 | |
| 			transa = 0;
 | |
| 		if (TransA == CblasConjTrans)
 | |
| 			transa = 1;
 | |
| #else
 | |
| 		if (TransA == CblasConjNoTrans)
 | |
| 			transa = 2;
 | |
| 		if (TransA == CblasConjTrans)
 | |
| 			transa = 3;
 | |
| #endif
 | |
| 		if (TransB == CblasNoTrans)
 | |
| 			transb = 0;
 | |
| 		if (TransB == CblasTrans)
 | |
| 			transb = 1;
 | |
| #ifndef COMPLEX
 | |
| 		if (TransB == CblasConjNoTrans)
 | |
| 			transb = 0;
 | |
| 		if (TransB == CblasConjTrans)
 | |
| 			transb = 1;
 | |
| #else
 | |
| 		if (TransB == CblasConjNoTrans)
 | |
| 			transb = 2;
 | |
| 		if (TransB == CblasConjTrans)
 | |
| 			transb = 3;
 | |
| #endif
 | |
| 
 | |
| 		a = (void *)A;
 | |
| 		b = (void *)B;
 | |
| 		lda = LDA;
 | |
| 		ldb = LDB;
 | |
| 
 | |
| 		info = -1;
 | |
| 
 | |
| 		blasint nrowa, nrowb;
 | |
| 		nrowa = m;
 | |
| 		if (transa) nrowa = k;
 | |
| 		nrowb = k;
 | |
| 		if (transb) nrowb = m;
 | |
| 
 | |
| 		if (ldc < MAX(1, m))
 | |
| 			info = 13;
 | |
| 		if (ldb < MAX(1, nrowb))
 | |
| 			info = 10;
 | |
| 		if (lda < MAX(1, nrowa))
 | |
| 			info = 8;
 | |
| 		if (k < 0)
 | |
| 			info = 5;
 | |
| 		if (m < 0)
 | |
| 			info = 4;
 | |
| 		if (transb < 0)
 | |
| 			info = 3;
 | |
| 		if (transa < 0)
 | |
| 			info = 2;
 | |
| 		if (uplo < 0)
 | |
| 			info = 1;
 | |
| 	}
 | |
| 
 | |
| 	if (order == CblasRowMajor) {
 | |
| 
 | |
| 		a = (void *)B;
 | |
| 		b = (void *)A;
 | |
| 
 | |
| 		lda = LDB;
 | |
| 		ldb = LDA;
 | |
| 
 | |
| 		if (Uplo == CblasUpper) uplo = 0;
 | |
| 		if (Uplo == CblasLower) uplo = 1;
 | |
| 
 | |
| 		if (TransB == CblasNoTrans)
 | |
| 			transa = 0;
 | |
| 		if (TransB == CblasTrans)
 | |
| 			transa = 1;
 | |
| #ifndef COMPLEX
 | |
| 		if (TransB == CblasConjNoTrans)
 | |
| 			transa = 0;
 | |
| 		if (TransB == CblasConjTrans)
 | |
| 			transa = 1;
 | |
| #else
 | |
| 		if (TransB == CblasConjNoTrans)
 | |
| 			transa = 2;
 | |
| 		if (TransB == CblasConjTrans)
 | |
| 			transa = 3;
 | |
| #endif
 | |
| 		if (TransA == CblasNoTrans)
 | |
| 			transb = 0;
 | |
| 		if (TransA == CblasTrans)
 | |
| 			transb = 1;
 | |
| #ifndef COMPLEX
 | |
| 		if (TransA == CblasConjNoTrans)
 | |
| 			transb = 0;
 | |
| 		if (TransA == CblasConjTrans)
 | |
| 			transb = 1;
 | |
| #else
 | |
| 		if (TransA == CblasConjNoTrans)
 | |
| 			transb = 2;
 | |
| 		if (TransA == CblasConjTrans)
 | |
| 			transb = 3;
 | |
| #endif
 | |
| 
 | |
| 		info = -1;
 | |
| 
 | |
| 		blasint ncola, ncolb;
 | |
| 		ncola = k;
 | |
| 		if (transa) ncola = m;
 | |
| 		ncolb = m;
 | |
| 		if (transb) ncolb = k;
 | |
| 
 | |
| 		if (ldc < MAX(1,m))
 | |
| 			info = 13;
 | |
| 		if (ldb < MAX(1, ncolb))
 | |
| 			info = 10;
 | |
| 		if (lda < MAX(1, ncola))
 | |
| 			info = 8;
 | |
| 		if (k < 0)
 | |
| 			info = 5;
 | |
| 		if (m < 0)
 | |
| 			info = 4;
 | |
| 		if (transb < 0)
 | |
| 			info = 3;
 | |
| 		if (transa < 0)
 | |
| 			info = 2;
 | |
| 		if (uplo < 0)
 | |
| 			info = 1;
 | |
| 	}
 | |
| 
 | |
| 	if (info >= 0) {
 | |
| 		BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
 | |
| 		return;
 | |
| 	}
 | |
| #if defined(COMPLEX)
 | |
| 	FLOAT alpha_r = *(alpha + 0);
 | |
| 	FLOAT alpha_i = *(alpha + 1);
 | |
| 
 | |
| 	FLOAT beta_r = *(beta + 0);
 | |
| 	FLOAT beta_i = *(beta + 1);
 | |
| #endif
 | |
| 
 | |
| #endif
 | |
| 	int buffer_size;
 | |
| 	blasint l;
 | |
| 	blasint i, j;
 | |
| 
 | |
| #ifdef SMP
 | |
| 	int nthreads;
 | |
| #endif
 | |
| 
 | |
| #if defined(COMPLEX)
 | |
| 
 | |
| #ifdef SMP
 | |
| 	static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT *, FLOAT *,
 | |
| 				     BLASLONG, FLOAT *, BLASLONG, FLOAT *,
 | |
| 				     BLASLONG, FLOAT *, int) = {
 | |
| #ifdef XDOUBLE
 | |
| 		xgemv_thread_n, xgemv_thread_t, xgemv_thread_r, xgemv_thread_c,
 | |
| 		    xgemv_thread_o, xgemv_thread_u, xgemv_thread_s,
 | |
| 		    xgemv_thread_d,
 | |
| #elif defined DOUBLE
 | |
| 		zgemv_thread_n, zgemv_thread_t, zgemv_thread_r, zgemv_thread_c,
 | |
| 		    zgemv_thread_o, zgemv_thread_u, zgemv_thread_s,
 | |
| 		    zgemv_thread_d,
 | |
| #else
 | |
| 		cgemv_thread_n, cgemv_thread_t, cgemv_thread_r, cgemv_thread_c,
 | |
| 		    cgemv_thread_o, cgemv_thread_u, cgemv_thread_s,
 | |
| 		    cgemv_thread_d,
 | |
| #endif
 | |
| 	};
 | |
| #endif
 | |
| 
 | |
| 	int (*gemv[]) (BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *,
 | |
| 		       BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG,
 | |
| 		       FLOAT *) = {
 | |
| 	GEMV_N, GEMV_T, GEMV_R, GEMV_C, GEMV_O, GEMV_U, GEMV_S, GEMV_D,};
 | |
| 
 | |
| #else
 | |
| 
 | |
| #ifdef SMP
 | |
| 	static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT, FLOAT *,
 | |
| 				     BLASLONG, FLOAT *, BLASLONG, FLOAT *,
 | |
| 				     BLASLONG, FLOAT *, int) = {
 | |
| #ifdef XDOUBLE
 | |
| 		qgemv_thread_n, qgemv_thread_t,
 | |
| #elif defined DOUBLE
 | |
| 		dgemv_thread_n, dgemv_thread_t,
 | |
| #else
 | |
| 		sgemv_thread_n, sgemv_thread_t,
 | |
| #endif
 | |
| 	};
 | |
| #endif
 | |
| 	int (*gemv[]) (BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG,
 | |
| 		       FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = {
 | |
| 	GEMV_N, GEMV_T,};
 | |
| 
 | |
| #endif
 | |
| 
 | |
| 	if (m == 0)
 | |
| 		return;
 | |
| 
 | |
| 	IDEBUG_START;
 | |
| 
 | |
| 	const blasint incb = (transb == 0) ? 1 : ldb;
 | |
| 
 | |
| 	if (uplo == 1) {
 | |
| 		for (i = 0; i < m; i++) {
 | |
| 			j = m - i;
 | |
| 
 | |
| 			l = j;
 | |
| #if defined(COMPLEX)
 | |
| 			aa = a + i * 2;
 | |
| 			bb = b + i * ldb * 2;
 | |
| 			if (transa) {
 | |
| 				aa = a + lda * i * 2;
 | |
| 			}
 | |
| 			if (transb)
 | |
| 				bb = b + i * 2;
 | |
| 			cc = c + i * 2 * ldc + i * 2;
 | |
| #else
 | |
| 			aa = a + i;
 | |
| 			bb = b + i * ldb;
 | |
| 			if (transa) {
 | |
| 				aa = a + lda * i;
 | |
| 			}
 | |
| 			if (transb)
 | |
| 				bb = b + i;
 | |
| 			cc = c + i * ldc + i;
 | |
| #endif
 | |
| 
 | |
| #if defined(COMPLEX)
 | |
| 			if (beta_r != ONE || beta_i != ZERO)
 | |
| 				SCAL_K(l, 0, 0, beta_r, beta_i, cc, 1, NULL, 0,
 | |
| 				       NULL, 0);
 | |
| 
 | |
| 			if (alpha_r == ZERO && alpha_i == ZERO)
 | |
| 				return;
 | |
| #else
 | |
| 			if (beta != ONE)
 | |
| 				SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
 | |
| 
 | |
| 			if (alpha == ZERO)
 | |
| 				continue;
 | |
| #endif
 | |
| 
 | |
| 			IDEBUG_START;
 | |
| 
 | |
| 			buffer_size = j + k + 128 / sizeof(FLOAT);
 | |
| #ifdef WINDOWS_ABI
 | |
| 			buffer_size += 160 / sizeof(FLOAT);
 | |
| #endif
 | |
| 			// for alignment
 | |
| 			buffer_size = (buffer_size + 3) & ~3;
 | |
| 			STACK_ALLOC(buffer_size, FLOAT, buffer);
 | |
| 
 | |
| #ifdef SMP
 | |
| 
 | |
| 			if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD)
 | |
| 				nthreads = 1;
 | |
| 			else
 | |
| 				nthreads = num_cpu_avail(2);
 | |
| 
 | |
| 			if (nthreads == 1) {
 | |
| #endif
 | |
| 
 | |
| #if defined(COMPLEX)
 | |
| 				if (!transa)
 | |
| 				(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
 | |
| 						     aa, lda, bb, incb, cc, 1,
 | |
| 						     buffer);
 | |
| 				else
 | |
| 				(gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i,
 | |
| 						     aa, lda, bb, incb, cc, 1,
 | |
| 						     buffer);
 | |
| #else
 | |
| 				if (!transa)
 | |
| 				(gemv[(int)transa]) (j, k, 0, alpha, aa, lda,
 | |
| 						     bb, incb, cc, 1, buffer);
 | |
| 				else
 | |
| 				(gemv[(int)transa]) (k, j, 0, alpha, aa, lda,
 | |
| 						     bb, incb, cc, 1, buffer);
 | |
| #endif
 | |
| #ifdef SMP
 | |
| 			} else {
 | |
| 				if (!transa)
 | |
| 				(gemv_thread[(int)transa]) (j, k, alpha, aa,
 | |
| 							    lda, bb, incb, cc,
 | |
| 							    1, buffer,
 | |
| 							    nthreads);
 | |
| 				else
 | |
| 				(gemv_thread[(int)transa]) (k, j, alpha, aa,
 | |
| 							    lda, bb, incb, cc,
 | |
| 							    1, buffer,
 | |
| 							    nthreads);
 | |
| 
 | |
| 			}
 | |
| #endif
 | |
| 
 | |
| 			STACK_FREE(buffer);
 | |
| 		}
 | |
| 	} else {
 | |
| 
 | |
| 		for (i = 0; i < m; i++) {
 | |
| 			j = i + 1;
 | |
| 
 | |
| 			l = j;
 | |
| #if defined COMPLEX
 | |
| 			bb = b + i * ldb * 2;
 | |
| 			if (transb) {
 | |
| 				bb = b + i * 2;
 | |
| 			}
 | |
| 			cc = c + i * 2 * ldc;
 | |
| #else
 | |
| 			bb = b + i * ldb;
 | |
| 			if (transb) {
 | |
| 				bb = b + i;
 | |
| 			}
 | |
| 			cc = c + i * ldc;
 | |
| #endif
 | |
| 
 | |
| #if defined(COMPLEX)
 | |
| 			if (beta_r != ONE || beta_i != ZERO)
 | |
| 				SCAL_K(l, 0, 0, beta_r, beta_i, cc, 1, NULL, 0,
 | |
| 				       NULL, 0);
 | |
| 
 | |
| 			if (alpha_r == ZERO && alpha_i == ZERO)
 | |
| 				return;
 | |
| #else
 | |
| 			if (beta != ONE)
 | |
| 				SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
 | |
| 
 | |
| 			if (alpha == ZERO)
 | |
| 				continue;
 | |
| #endif
 | |
| 			IDEBUG_START;
 | |
| 
 | |
| 			buffer_size = j + k + 128 / sizeof(FLOAT);
 | |
| #ifdef WINDOWS_ABI
 | |
| 			buffer_size += 160 / sizeof(FLOAT);
 | |
| #endif
 | |
| 			// for alignment
 | |
| 			buffer_size = (buffer_size + 3) & ~3;
 | |
| 			STACK_ALLOC(buffer_size, FLOAT, buffer);
 | |
| 
 | |
| #ifdef SMP
 | |
| 
 | |
| 			if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD)
 | |
| 				nthreads = 1;
 | |
| 			else
 | |
| 				nthreads = num_cpu_avail(2);
 | |
| 
 | |
| 			if (nthreads == 1) {
 | |
| #endif
 | |
| 
 | |
| #if defined(COMPLEX)
 | |
| 				if (!transa)
 | |
| 				(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
 | |
| 						     a, lda, bb, incb, cc, 1,
 | |
| 						     buffer);
 | |
| 				else
 | |
| 				(gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i,
 | |
| 						     a, lda, bb, incb, cc, 1,
 | |
| 						     buffer);
 | |
| #else
 | |
| 				if (!transa)
 | |
| 				(gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb,
 | |
| 						     incb, cc, 1, buffer);
 | |
| 				else
 | |
| 				(gemv[(int)transa]) (k, j, 0, alpha, a, lda, bb,
 | |
| 						     incb, cc, 1, buffer);
 | |
| #endif
 | |
| 
 | |
| #ifdef SMP
 | |
| 			} else {
 | |
| 				if (!transa)
 | |
| 				(gemv_thread[(int)transa]) (j, k, alpha, a, lda,
 | |
| 							    bb, incb, cc, 1,
 | |
| 							    buffer, nthreads);
 | |
| 				else
 | |
| 				(gemv_thread[(int)transa]) (k, j, alpha, a, lda,
 | |
| 							    bb, incb, cc, 1,
 | |
| 							    buffer, nthreads);
 | |
| 			}
 | |
| #endif
 | |
| 
 | |
| 			STACK_FREE(buffer);
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	IDEBUG_END;
 | |
| 
 | |
| 	return;
 | |
| }
 |