Merge branch 'develop' into betterPowerGEMVTail
This commit is contained in:
commit
083faf7556
|
@ -263,7 +263,7 @@ if (DEFINED TARGET)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (${TARGET} STREQUAL POWER10)
|
if (${TARGET} STREQUAL POWER10)
|
||||||
if (CMAKE_C_COMPILER VERSION VERSION_GREATER 10.2 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 10.2)
|
if (CMAKE_C_COMPILER_VERSION VERSION_GREATER 10.2 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 10.2)
|
||||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math")
|
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math")
|
||||||
else ()
|
else ()
|
||||||
message(FATAL_ERROR "Compiler GCC ${CMAKE_C_COMPILER_VERSION} does not support Power10.")
|
message(FATAL_ERROR "Compiler GCC ${CMAKE_C_COMPILER_VERSION} does not support Power10.")
|
||||||
|
|
|
@ -26,7 +26,7 @@ endif
|
||||||
override CFLAGS += -DADD$(BU) -DCBLAS
|
override CFLAGS += -DADD$(BU) -DCBLAS
|
||||||
ifeq ($(F_COMPILER),GFORTRAN)
|
ifeq ($(F_COMPILER),GFORTRAN)
|
||||||
ifneq (, $(filter $(CORE),LOONGSON3R3 LOONGSON3R4))
|
ifneq (, $(filter $(CORE),LOONGSON3R3 LOONGSON3R4))
|
||||||
override FFLAGS = $(filter_out(-O2 -O3,$(FFLAGS))) -O0
|
override FFLAGS := $(filter_out(-O2 -O3,$(FFLAGS))) -O0
|
||||||
endif
|
endif
|
||||||
override FFLAGS += -fno-tree-vectorize
|
override FFLAGS += -fno-tree-vectorize
|
||||||
endif
|
endif
|
||||||
|
|
|
@ -2680,7 +2680,7 @@ static int sbgemv_kernel_1x128_lda_direct(BLASLONG m, BLASLONG n, float alpha, b
|
||||||
BLASLONG tag_n_32x = n & (~31);
|
BLASLONG tag_n_32x = n & (~31);
|
||||||
BLASLONG tag_n_128x = n & (~127);
|
BLASLONG tag_n_128x = n & (~127);
|
||||||
|
|
||||||
__m512 accum512_bridge[8];
|
__m512 accum512_bridge[16];
|
||||||
__m512 accum512_t_0, accum512_t_1, accum512_t_2, accum512_t_3;
|
__m512 accum512_t_0, accum512_t_1, accum512_t_2, accum512_t_3;
|
||||||
__m256 accum256_0;
|
__m256 accum256_0;
|
||||||
__m128 accum128;
|
__m128 accum128;
|
||||||
|
|
|
@ -5,5 +5,5 @@ Data file for testing REAL LAPACK linear equation routines RFP format
|
||||||
1 2 15 Values of NRHS (number of right hand sides)
|
1 2 15 Values of NRHS (number of right hand sides)
|
||||||
9 Number of matrix types (list types on next line if 0 < NTYPES < 9)
|
9 Number of matrix types (list types on next line if 0 < NTYPES < 9)
|
||||||
1 2 3 4 5 6 7 8 9 Matrix Types
|
1 2 3 4 5 6 7 8 9 Matrix Types
|
||||||
42.0 Threshold value of test ratio
|
45.0 Threshold value of test ratio
|
||||||
T Put T to test the error exits
|
T Put T to test the error exits
|
||||||
|
|
|
@ -121,7 +121,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
||||||
HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0);
|
HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0);
|
||||||
#else
|
#else
|
||||||
syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO,
|
syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO,
|
||||||
&newarg, NULL, NULL, (int (*)(void))HERK_LN, sa, sb, args -> nthreads);
|
&newarg, NULL, NULL, (int (*)(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG))HERK_LN, sa, sb, args -> nthreads);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -121,7 +121,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
||||||
HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0);
|
HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0);
|
||||||
#else
|
#else
|
||||||
syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T,
|
syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T,
|
||||||
&newarg, NULL, NULL, (int (*)(void))HERK_UC, sa, sb, args -> nthreads);
|
&newarg, NULL, NULL, (int (*)(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG))HERK_UC, sa, sb, args -> nthreads);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
4
param.h
4
param.h
|
@ -2637,8 +2637,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#undef SBGEMM_DEFAULT_Q
|
#undef SBGEMM_DEFAULT_Q
|
||||||
#define SBGEMM_DEFAULT_UNROLL_M 16
|
#define SBGEMM_DEFAULT_UNROLL_M 16
|
||||||
#define SBGEMM_DEFAULT_UNROLL_N 8
|
#define SBGEMM_DEFAULT_UNROLL_N 8
|
||||||
#define SBGEMM_DEFAULT_P 832
|
#define SBGEMM_DEFAULT_P 512
|
||||||
#define SBGEMM_DEFAULT_Q 1026
|
#define SBGEMM_DEFAULT_Q 1024
|
||||||
#define SBGEMM_DEFAULT_R 4096
|
#define SBGEMM_DEFAULT_R 4096
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -2,7 +2,7 @@ TOPDIR = ..
|
||||||
include ../Makefile.system
|
include ../Makefile.system
|
||||||
ifeq ($(F_COMPILER),GFORTRAN)
|
ifeq ($(F_COMPILER),GFORTRAN)
|
||||||
ifneq (, $(filter $(CORE),LOONGSON3R3 LOONGSON3R4))
|
ifneq (, $(filter $(CORE),LOONGSON3R3 LOONGSON3R4))
|
||||||
override FFLAGS = $(filter_out(-O2 -O3,$(FFLAGS))) -O0
|
override FFLAGS := $(filter_out(-O2 -O3,$(FFLAGS))) -O0
|
||||||
endif
|
endif
|
||||||
override FFLAGS += -fno-tree-vectorize
|
override FFLAGS += -fno-tree-vectorize
|
||||||
endif
|
endif
|
||||||
|
|
|
@ -81,6 +81,8 @@ float16to32 (bfloat16_bits f16)
|
||||||
return f32.v;
|
return f32.v;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define SBGEMM_LARGEST 256
|
||||||
|
|
||||||
int
|
int
|
||||||
main (int argc, char *argv[])
|
main (int argc, char *argv[])
|
||||||
{
|
{
|
||||||
|
@ -88,12 +90,45 @@ main (int argc, char *argv[])
|
||||||
int i, j, l;
|
int i, j, l;
|
||||||
blasint x, y;
|
blasint x, y;
|
||||||
int ret = 0;
|
int ret = 0;
|
||||||
int loop = 100;
|
int loop = SBGEMM_LARGEST;
|
||||||
char transA = 'N', transB = 'N';
|
char transA = 'N', transB = 'N';
|
||||||
float alpha = 1.0, beta = 0.0;
|
float alpha = 1.0, beta = 0.0;
|
||||||
|
|
||||||
for (x = 0; x <= loop; x++)
|
for (x = 0; x <= loop; x++)
|
||||||
{
|
{
|
||||||
|
if ((x > 100) && (x != SBGEMM_LARGEST)) continue;
|
||||||
|
m = k = n = x;
|
||||||
|
float *A = (float *)malloc(m * k * sizeof(FLOAT));
|
||||||
|
float *B = (float *)malloc(k * n * sizeof(FLOAT));
|
||||||
|
float *C = (float *)malloc(m * n * sizeof(FLOAT));
|
||||||
|
bfloat16_bits *AA = (bfloat16_bits *)malloc(m * k * sizeof(bfloat16_bits));
|
||||||
|
bfloat16_bits *BB = (bfloat16_bits *)malloc(k * n * sizeof(bfloat16_bits));
|
||||||
|
float *DD = (float *)malloc(m * n * sizeof(FLOAT));
|
||||||
|
float *CC = (float *)malloc(m * n * sizeof(FLOAT));
|
||||||
|
if ((A == NULL) || (B == NULL) || (C == NULL) || (AA == NULL) || (BB == NULL) ||
|
||||||
|
(DD == NULL) || (CC == NULL))
|
||||||
|
return 1;
|
||||||
|
bfloat16 atmp,btmp;
|
||||||
|
blasint one=1;
|
||||||
|
|
||||||
|
for (j = 0; j < m; j++)
|
||||||
|
{
|
||||||
|
for (i = 0; i < k; i++)
|
||||||
|
{
|
||||||
|
A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5;
|
||||||
|
sbstobf16_(&one, &A[j*k+i], &one, &atmp, &one);
|
||||||
|
AA[j * k + i].v = atmp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (j = 0; j < n; j++)
|
||||||
|
{
|
||||||
|
for (i = 0; i < k; i++)
|
||||||
|
{
|
||||||
|
B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5;
|
||||||
|
sbstobf16_(&one, &B[j*k+i], &one, &btmp, &one);
|
||||||
|
BB[j * k + i].v = btmp;
|
||||||
|
}
|
||||||
|
}
|
||||||
for (y = 0; y < 4; y++)
|
for (y = 0; y < 4; y++)
|
||||||
{
|
{
|
||||||
if ((y == 0) || (y == 2)) {
|
if ((y == 0) || (y == 2)) {
|
||||||
|
@ -106,40 +141,19 @@ main (int argc, char *argv[])
|
||||||
} else {
|
} else {
|
||||||
transB = 'T';
|
transB = 'T';
|
||||||
}
|
}
|
||||||
m = k = n = x;
|
|
||||||
float A[m * k];
|
|
||||||
float B[k * n];
|
|
||||||
float C[m * n];
|
|
||||||
bfloat16_bits AA[m * k], BB[k * n];
|
|
||||||
float DD[m * n], CC[m * n];
|
|
||||||
bfloat16 atmp,btmp;
|
|
||||||
blasint one=1;
|
|
||||||
|
|
||||||
for (j = 0; j < m; j++)
|
memset(CC, 0, m * n * sizeof(FLOAT));
|
||||||
{
|
memset(DD, 0, m * n * sizeof(FLOAT));
|
||||||
for (i = 0; i < m; i++)
|
memset(C, 0, m * n * sizeof(FLOAT));
|
||||||
{
|
|
||||||
A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5;
|
|
||||||
B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5;
|
|
||||||
C[j * k + i] = 0;
|
|
||||||
sbstobf16_(&one, &A[j*k+i], &one, &atmp, &one);
|
|
||||||
sbstobf16_(&one, &B[j*k+i], &one, &btmp, &one);
|
|
||||||
AA[j * k + i].v = atmp;
|
|
||||||
BB[j * k + i].v = btmp;
|
|
||||||
CC[j * k + i] = 0;
|
|
||||||
DD[j * k + i] = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
SGEMM (&transA, &transB, &m, &n, &k, &alpha, A,
|
SGEMM (&transA, &transB, &m, &n, &k, &alpha, A,
|
||||||
&m, B, &k, &beta, C, &m);
|
&m, B, &k, &beta, C, &m);
|
||||||
SBGEMM (&transA, &transB, &m, &n, &k, &alpha, (bfloat16*) AA,
|
SBGEMM (&transA, &transB, &m, &n, &k, &alpha, (bfloat16*) AA,
|
||||||
&m, (bfloat16*)BB, &k, &beta, CC, &m);
|
&m, (bfloat16*)BB, &k, &beta, CC, &m);
|
||||||
|
|
||||||
for (i = 0; i < n; i++)
|
for (i = 0; i < n; i++)
|
||||||
for (j = 0; j < m; j++)
|
for (j = 0; j < m; j++)
|
||||||
if (fabs (CC[i * m + j] - C[i * m + j]) > 1.0)
|
{
|
||||||
ret++;
|
|
||||||
for (i = 0; i < n; i++)
|
|
||||||
for (j = 0; j < m; j++)
|
|
||||||
for (l = 0; l < k; l++)
|
for (l = 0; l < k; l++)
|
||||||
if (transA == 'N' && transB == 'N')
|
if (transA == 'N' && transB == 'N')
|
||||||
{
|
{
|
||||||
|
@ -158,12 +172,20 @@ main (int argc, char *argv[])
|
||||||
DD[i * m + j] +=
|
DD[i * m + j] +=
|
||||||
float16to32 (AA[k * j + l]) * float16to32 (BB[i + l * n]);
|
float16to32 (AA[k * j + l]) * float16to32 (BB[i + l * n]);
|
||||||
}
|
}
|
||||||
for (i = 0; i < n; i++)
|
if (fabs (CC[i * m + j] - C[i * m + j]) > 1.0)
|
||||||
for (j = 0; j < m; j++)
|
ret++;
|
||||||
if (CC[i * m + j] != DD[i * m + j])
|
if (fabs (CC[i * m + j] - DD[i * m + j]) > 1.0)
|
||||||
ret++;
|
ret++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
free(A);
|
||||||
|
free(B);
|
||||||
|
free(C);
|
||||||
|
free(AA);
|
||||||
|
free(BB);
|
||||||
|
free(DD);
|
||||||
|
free(CC);
|
||||||
|
}
|
||||||
|
|
||||||
if (ret != 0)
|
if (ret != 0)
|
||||||
fprintf (stderr, "FATAL ERROR SBGEMM - Return code: %d\n", ret);
|
fprintf (stderr, "FATAL ERROR SBGEMM - Return code: %d\n", ret);
|
||||||
|
|
Loading…
Reference in New Issue