Merge pull request #4006 from martin-frbg/issue4005

Fix ?GEMMT implementation
This commit is contained in:
Martin Kroeker 2023-04-16 13:30:17 +02:00 committed by GitHub
commit 73e6fcb925
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 50 additions and 50 deletions

View File

@@ -35,29 +35,26 @@
#include <stdio.h>
#include <stdlib.h>
#include "common.h"
#ifdef FUNCTION_PROFILE
#include "functable.h"
#endif
#ifndef COMPLEX
#define SMP_THRESHOLD_MIN 65536.0
#ifdef XDOUBLE
#define ERROR_NAME "QGEMT "
#define ERROR_NAME "QGEMMT "
#elif defined(DOUBLE)
#define ERROR_NAME "DGEMT "
#define ERROR_NAME "DGEMMT "
#elif defined(BFLOAT16)
#define ERROR_NAME "SBGEMT "
#define ERROR_NAME "SBGEMMT "
#else
#define ERROR_NAME "SGEMT "
#define ERROR_NAME "SGEMMT "
#endif
#else
#define SMP_THRESHOLD_MIN 8192.0
#ifdef XDOUBLE
#define ERROR_NAME "XGEMT "
#define ERROR_NAME "XGEMMT "
#elif defined(DOUBLE)
#define ERROR_NAME "ZGEMT "
#define ERROR_NAME "ZGEMMT "
#else
#define ERROR_NAME "CGEMT "
#define ERROR_NAME "CGEMMT "
#endif
#endif
@@ -68,13 +65,13 @@
#ifndef CBLAS
void NAME(char *UPLO, char *TRANSA, char *TRANSB,
blasint * M, blasint * N, blasint * K,
blasint * M, blasint * K,
FLOAT * Alpha,
IFLOAT * a, blasint * ldA,
IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC)
{
blasint m, n, k;
blasint m, k;
blasint lda, ldb, ldc;
int transa, transb, uplo;
blasint info;
@@ -92,7 +89,6 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
PRINT_DEBUG_NAME;
m = *M;
n = *N;
k = *K;
#if defined(COMPLEX)
@@ -167,8 +163,6 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
info = 13;
if (k < 0)
info = 5;
if (n < 0)
info = 4;
if (m < 0)
info = 3;
if (transb < 0)
@@ -184,7 +178,7 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M,
blasint N, blasint k,
blasint k,
#ifndef COMPLEX
FLOAT alpha,
IFLOAT * A, blasint LDA,
@@ -205,7 +199,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
int transa, transb, uplo;
blasint info;
blasint m, n, lda, ldb;
blasint m, lda, ldb;
FLOAT *a, *b;
XFLOAT *buffer;
@@ -248,9 +242,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
transb = 3;
#endif
m = M;
n = N;
a = (void *)A;
b = (void *)B;
lda = LDA;
@@ -262,8 +253,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
info = 13;
if (k < 0)
info = 5;
if (n < 0)
info = 4;
if (m < 0)
info = 3;
if (transb < 0)
@@ -273,8 +262,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
}
if (order == CblasRowMajor) {
m = N;
n = M;
a = (void *)B;
b = (void *)A;
@@ -319,8 +306,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
info = 13;
if (k < 0)
info = 5;
if (n < 0)
info = 4;
if (m < 0)
info = 3;
if (transb < 0)
@@ -407,37 +392,35 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
#endif
if ((m == 0) || (n == 0))
if ((m == 0) )
return;
IDEBUG_START;
FUNCTION_PROFILE_START();
const blasint incb = (transb == 0) ? 1 : ldb;
if (uplo == 1) {
for (i = 0; i < n; i++) {
j = n - i;
for (i = 0; i < m; i++) {
j = m - i;
l = j;
#if defined(COMPLEX)
aa = a + i * 2;
bb = b + i * ldb * 2;
if (transa) {
l = k;
aa = a + lda * i * 2;
bb = b + i * 2;
}
if (transb)
bb = b + i * 2;
cc = c + i * 2 * ldc + i * 2;
#else
aa = a + i;
bb = b + i * ldb;
if (transa) {
l = k;
aa = a + lda * i;
bb = b + i;
}
if (transb)
bb = b + i;
cc = c + i * ldc + i;
#endif
@@ -458,8 +441,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
IDEBUG_START;
FUNCTION_PROFILE_START();
buffer_size = j + k + 128 / sizeof(FLOAT);
#ifdef WINDOWS_ABI
buffer_size += 160 / sizeof(FLOAT);
@@ -479,20 +460,34 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
#endif
#if defined(COMPLEX)
if (!transa)
(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
aa, lda, bb, incb, cc, 1,
buffer);
else
(gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i,
aa, lda, bb, incb, cc, 1,
buffer);
#else
if (!transa)
(gemv[(int)transa]) (j, k, 0, alpha, aa, lda,
bb, incb, cc, 1, buffer);
else
(gemv[(int)transa]) (k, j, 0, alpha, aa, lda,
bb, incb, cc, 1, buffer);
#endif
#ifdef SMP
} else {
if (!transa)
(gemv_thread[(int)transa]) (j, k, alpha, aa,
lda, bb, incb, cc,
1, buffer,
nthreads);
else
(gemv_thread[(int)transa]) (k, j, alpha, aa,
lda, bb, incb, cc,
1, buffer,
nthreads);
}
#endif
@@ -501,21 +496,19 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
}
} else {
for (i = 0; i < n; i++) {
for (i = 0; i < m; i++) {
j = i + 1;
l = j;
#if defined COMPLEX
bb = b + i * ldb * 2;
if (transa) {
l = k;
if (transb) {
bb = b + i * 2;
}
cc = c + i * 2 * ldc;
#else
bb = b + i * ldb;
if (transa) {
l = k;
if (transb) {
bb = b + i;
}
cc = c + i * ldc;
@@ -537,8 +530,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
#endif
IDEBUG_START;
FUNCTION_PROFILE_START();
buffer_size = j + k + 128 / sizeof(FLOAT);
#ifdef WINDOWS_ABI
buffer_size += 160 / sizeof(FLOAT);
@@ -558,30 +549,39 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
#endif
#if defined(COMPLEX)
if (!transa)
(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
a, lda, bb, incb, cc, 1,
buffer);
else
(gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i,
a, lda, bb, incb, cc, 1,
buffer);
#else
if (!transa)
(gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb,
incb, cc, 1, buffer);
else
(gemv[(int)transa]) (k, j, 0, alpha, a, lda, bb,
incb, cc, 1, buffer);
#endif
#ifdef SMP
} else {
if (!transa)
(gemv_thread[(int)transa]) (j, k, alpha, a, lda,
bb, incb, cc, 1,
buffer, nthreads);
else
(gemv_thread[(int)transa]) (k, j, alpha, a, lda,
bb, incb, cc, 1,
buffer, nthreads);
}
#endif
STACK_FREE(buffer);
}
}
FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE,
args.m * args.k + args.k * args.n +
args.m * args.n, 2 * args.m * args.n * args.k);
IDEBUG_END;