commit
9f0ef9cdfc
17
.travis.yml
17
.travis.yml
|
@ -75,6 +75,23 @@ matrix:
|
||||||
- TARGET_BOX=LINUX32
|
- TARGET_BOX=LINUX32
|
||||||
- BTYPE="BINARY=32"
|
- BTYPE="BINARY=32"
|
||||||
|
|
||||||
|
- os: linux
|
||||||
|
arch: ppc64le
|
||||||
|
dist: bionic
|
||||||
|
compiler: gcc
|
||||||
|
before_script:
|
||||||
|
- sudo add-apt-repository 'ppa:ubuntu-toolchain-r/test' -y
|
||||||
|
- sudo apt-get update
|
||||||
|
- sudo apt-get install gcc-9 gfortran-9 -y
|
||||||
|
script:
|
||||||
|
- make QUIET_MAKE=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9
|
||||||
|
- make -C test $COMMON_FLAGS $BTYPE
|
||||||
|
- make -C ctest $COMMON_FLAGS $BTYPE
|
||||||
|
- make -C utest $COMMON_FLAGS $BTYPE
|
||||||
|
env:
|
||||||
|
# for matrix annotation only
|
||||||
|
- TARGET_BOX=PPC64LE_LINUX_P9
|
||||||
|
|
||||||
- os: linux
|
- os: linux
|
||||||
compiler: gcc
|
compiler: gcc
|
||||||
addons:
|
addons:
|
||||||
|
|
4
Makefile
4
Makefile
|
@ -141,7 +141,7 @@ ifndef NO_FBLAS
|
||||||
$(MAKE) -C test all
|
$(MAKE) -C test all
|
||||||
endif
|
endif
|
||||||
$(MAKE) -C utest all
|
$(MAKE) -C utest all
|
||||||
ifndef NO_CBLAS
|
ifneq ($(NO_CBLAS), 1)
|
||||||
$(MAKE) -C ctest all
|
$(MAKE) -C ctest all
|
||||||
ifeq ($(CPP_THREAD_SAFETY_TEST), 1)
|
ifeq ($(CPP_THREAD_SAFETY_TEST), 1)
|
||||||
$(MAKE) -C cpp_thread_test all
|
$(MAKE) -C cpp_thread_test all
|
||||||
|
@ -244,7 +244,7 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||||
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
|
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
|
||||||
@$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib
|
@$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib
|
||||||
endif
|
endif
|
||||||
ifndef NO_LAPACKE
|
ifneq ($(NO_LAPACKE), 1)
|
||||||
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib
|
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
|
@ -105,6 +105,7 @@ static void INLINE blas_lock(volatile unsigned long *address){
|
||||||
" bne- 1f\n"
|
" bne- 1f\n"
|
||||||
" stwcx. %2,0, %1\n"
|
" stwcx. %2,0, %1\n"
|
||||||
" bne- 0b\n"
|
" bne- 0b\n"
|
||||||
|
" isync\n"
|
||||||
"1: "
|
"1: "
|
||||||
: "=&r"(ret)
|
: "=&r"(ret)
|
||||||
: "r"(address), "r" (val)
|
: "r"(address), "r" (val)
|
||||||
|
|
|
@ -367,7 +367,7 @@ CZBLAS3OBJS += cblas_zgemm3m.$(SUFFIX)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
||||||
ifndef NO_CBLAS
|
ifneq ($(NO_CBLAS), 1)
|
||||||
|
|
||||||
override CFLAGS += -I.
|
override CFLAGS += -I.
|
||||||
|
|
||||||
|
|
|
@ -249,7 +249,6 @@ static inline vector_float vec_load_hinted(FLOAT const *restrict a) {
|
||||||
|
|
||||||
|
|
||||||
#if UNROLL_M == 16
|
#if UNROLL_M == 16
|
||||||
VECTOR_BLOCK(16, 4)
|
|
||||||
VECTOR_BLOCK(16, 2)
|
VECTOR_BLOCK(16, 2)
|
||||||
VECTOR_BLOCK(16, 1)
|
VECTOR_BLOCK(16, 1)
|
||||||
#endif
|
#endif
|
||||||
|
@ -257,18 +256,276 @@ VECTOR_BLOCK(16, 1)
|
||||||
VECTOR_BLOCK(8, 8)
|
VECTOR_BLOCK(8, 8)
|
||||||
VECTOR_BLOCK(4, 8)
|
VECTOR_BLOCK(4, 8)
|
||||||
#endif
|
#endif
|
||||||
|
#ifndef DOUBLE
|
||||||
VECTOR_BLOCK(8, 4)
|
VECTOR_BLOCK(8, 4)
|
||||||
|
#endif
|
||||||
VECTOR_BLOCK(8, 2)
|
VECTOR_BLOCK(8, 2)
|
||||||
VECTOR_BLOCK(8, 1)
|
VECTOR_BLOCK(8, 1)
|
||||||
VECTOR_BLOCK(4, 4)
|
VECTOR_BLOCK(4, 4)
|
||||||
VECTOR_BLOCK(4, 2)
|
VECTOR_BLOCK(4, 2)
|
||||||
VECTOR_BLOCK(4, 1)
|
VECTOR_BLOCK(4, 1)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculate for a row-block in C_i of size ROWSxCOLS using scalar operations.
|
||||||
|
* Simple implementation for smaller block sizes
|
||||||
|
*
|
||||||
|
* @param[in] A Pointer current block of input matrix A.
|
||||||
|
* @param[in] k Number of columns in A.
|
||||||
|
* @param[in] B Pointer current block of input matrix B.
|
||||||
|
* @param[inout] C Pointer current block of output matrix C.
|
||||||
|
* @param[in] ldc Offset between elements in adjacent columns in C.
|
||||||
|
* @param[in] alpha Scalar factor.
|
||||||
|
*/
|
||||||
|
#define SCALAR_BLOCK(ROWS, COLS) \
|
||||||
|
static inline void GEBP_block_##ROWS##_##COLS( \
|
||||||
|
FLOAT const *restrict A, BLASLONG k, FLOAT const *restrict B, \
|
||||||
|
FLOAT *restrict C, BLASLONG ldc, FLOAT alpha) { \
|
||||||
|
FLOAT Caux[ROWS][COLS] __attribute__((aligned(16))); \
|
||||||
|
\
|
||||||
|
/* \
|
||||||
|
* Peel off first iteration (i.e., column of A) for \
|
||||||
|
* initializing Caux \
|
||||||
|
*/ \
|
||||||
|
for (BLASLONG i = 0; i < ROWS; i++) \
|
||||||
|
for (BLASLONG j = 0; j < COLS; j++) Caux[i][j] = A[i] * B[j]; \
|
||||||
|
\
|
||||||
|
for (BLASLONG kk = 1; kk < k; kk++) \
|
||||||
|
for (BLASLONG i = 0; i < ROWS; i++) \
|
||||||
|
for (BLASLONG j = 0; j < COLS; j++) \
|
||||||
|
Caux[i][j] += A[i + kk * ROWS] * B[j + kk * COLS]; \
|
||||||
|
\
|
||||||
|
for (BLASLONG i = 0; i < ROWS; i++) \
|
||||||
|
for (BLASLONG j = 0; j < COLS; j++) \
|
||||||
|
if (trmm) { \
|
||||||
|
C[i + j * ldc] = alpha * Caux[i][j]; \
|
||||||
|
} else { \
|
||||||
|
C[i + j * ldc] += alpha * Caux[i][j]; \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef DOUBLE
|
#ifdef DOUBLE
|
||||||
VECTOR_BLOCK(2, 4)
|
VECTOR_BLOCK(2, 4)
|
||||||
VECTOR_BLOCK(2, 2)
|
VECTOR_BLOCK(2, 2)
|
||||||
|
VECTOR_BLOCK(2, 1)
|
||||||
|
#else
|
||||||
|
SCALAR_BLOCK(2, 4)
|
||||||
|
SCALAR_BLOCK(2, 2)
|
||||||
|
SCALAR_BLOCK(2, 1)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
SCALAR_BLOCK(1, 4)
|
||||||
|
SCALAR_BLOCK(1, 2)
|
||||||
|
SCALAR_BLOCK(1, 1)
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculate a row-block that fits 4x4 vector registers using a loop
|
||||||
|
* unrolled-by-2 with explicit interleaving to better overlap loads and
|
||||||
|
* computation.
|
||||||
|
* This function fits 16x4 blocks for SGEMM and 8x4 blocks for DGEMM.
|
||||||
|
*/
|
||||||
|
#ifdef DOUBLE
|
||||||
|
static inline void GEBP_block_8_4(
|
||||||
|
#else // float
|
||||||
|
static inline void GEBP_block_16_4(
|
||||||
|
#endif
|
||||||
|
FLOAT const *restrict A, BLASLONG bk, FLOAT const *restrict B,
|
||||||
|
FLOAT *restrict C, BLASLONG ldc, FLOAT alpha) {
|
||||||
|
#define VEC_ROWS 4
|
||||||
|
#define VEC_COLS 4
|
||||||
|
#define ROWS VEC_ROWS * VLEN_FLOATS
|
||||||
|
#define COLS (VEC_COLS)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Hold intermediate results in vector registers.
|
||||||
|
* Since we need to force the compiler's hand in places, we need to use
|
||||||
|
* individual variables in contrast to the generic implementation's
|
||||||
|
* arrays.
|
||||||
|
*/
|
||||||
|
#define INIT_ROW_OF_C(ROW) \
|
||||||
|
vector_float A##ROW = vec_load_hinted(A + ROW * VLEN_FLOATS); \
|
||||||
|
vector_float C_##ROW##_0 = A##ROW * B[0]; \
|
||||||
|
vector_float C_##ROW##_1 = A##ROW * B[1]; \
|
||||||
|
vector_float C_##ROW##_2 = A##ROW * B[2]; \
|
||||||
|
vector_float C_##ROW##_3 = A##ROW * B[3];
|
||||||
|
|
||||||
|
INIT_ROW_OF_C(0)
|
||||||
|
INIT_ROW_OF_C(1)
|
||||||
|
INIT_ROW_OF_C(2)
|
||||||
|
INIT_ROW_OF_C(3)
|
||||||
|
#undef INIT_ROW_OF_C
|
||||||
|
|
||||||
|
if (bk > 1) {
|
||||||
|
BLASLONG k = 1;
|
||||||
|
vector_float Ak[VEC_ROWS], Aknext[VEC_ROWS];
|
||||||
|
vector_float Bk[VEC_COLS], Bknext[VEC_COLS];
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Note that in several places, we enforce an instruction
|
||||||
|
* sequence that we identified empirically by utilizing dummy
|
||||||
|
* asm statements.
|
||||||
|
*/
|
||||||
|
|
||||||
|
for (BLASLONG j = 0; j < VEC_COLS; j++)
|
||||||
|
Bk[j] = vec_splats(B[j + k * COLS]);
|
||||||
|
asm("");
|
||||||
|
|
||||||
|
for (BLASLONG i = 0; i < VEC_ROWS; i++)
|
||||||
|
Ak[i] = vec_load_hinted(A + i * VLEN_FLOATS + k * ROWS);
|
||||||
|
|
||||||
|
for (; k < (bk - 2); k += 2) {
|
||||||
|
/*
|
||||||
|
* Load inputs for (k+1) into registers.
|
||||||
|
* Loading from B first is advantageous.
|
||||||
|
*/
|
||||||
|
for (BLASLONG j = 0; j < VEC_COLS; j++)
|
||||||
|
Bknext[j] = vec_splats(B[j + (k + 1) * COLS]);
|
||||||
|
asm("");
|
||||||
|
for (BLASLONG i = 0; i < VEC_ROWS; i++)
|
||||||
|
Aknext[i] = vec_load_hinted(A + i * VLEN_FLOATS +
|
||||||
|
(k + 1) * ROWS);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* To achieve better instruction-level parallelism,
|
||||||
|
* make sure to first load input data for (k+1) before
|
||||||
|
* initiating compute for k. We enforce that ordering
|
||||||
|
* with a pseudo asm statement.
|
||||||
|
* Note that we need to massage this particular "barrier"
|
||||||
|
* depending on the gcc version.
|
||||||
|
*/
|
||||||
|
#if __GNUC__ > 7
|
||||||
|
#define BARRIER_READ_BEFORE_COMPUTE(SUFFIX) \
|
||||||
|
do { \
|
||||||
|
asm("" \
|
||||||
|
: "+v"(C_0_0), "+v"(C_0_1), "+v"(C_0_2), "+v"(C_0_3), "+v"(C_1_0), \
|
||||||
|
"+v"(C_1_1), "+v"(C_1_2), "+v"(C_1_3) \
|
||||||
|
: "v"(B##SUFFIX[0]), "v"(B##SUFFIX[1]), "v"(B##SUFFIX[2]), \
|
||||||
|
"v"(B##SUFFIX[3]), "v"(A##SUFFIX[0]), "v"(A##SUFFIX[1]), \
|
||||||
|
"v"(A##SUFFIX[2]), "v"(A##SUFFIX[3])); \
|
||||||
|
asm("" \
|
||||||
|
: "+v"(C_2_0), "+v"(C_2_1), "+v"(C_2_2), "+v"(C_2_3), "+v"(C_3_0), \
|
||||||
|
"+v"(C_3_1), "+v"(C_3_2), "+v"(C_3_3) \
|
||||||
|
: "v"(B##SUFFIX[0]), "v"(B##SUFFIX[1]), "v"(B##SUFFIX[2]), \
|
||||||
|
"v"(B##SUFFIX[3]), "v"(A##SUFFIX[0]), "v"(A##SUFFIX[1]), \
|
||||||
|
"v"(A##SUFFIX[2]), "v"(A##SUFFIX[3])); \
|
||||||
|
} while (0)
|
||||||
|
#else // __GNUC__ <= 7
|
||||||
|
#define BARRIER_READ_BEFORE_COMPUTE(SUFFIX) \
|
||||||
|
do { \
|
||||||
|
asm(""); \
|
||||||
|
} while (0)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
BARRIER_READ_BEFORE_COMPUTE(knext);
|
||||||
|
|
||||||
|
/* Compute for (k) */
|
||||||
|
C_0_0 += Ak[0] * Bk[0];
|
||||||
|
C_1_0 += Ak[1] * Bk[0];
|
||||||
|
C_2_0 += Ak[2] * Bk[0];
|
||||||
|
C_3_0 += Ak[3] * Bk[0];
|
||||||
|
|
||||||
|
C_0_1 += Ak[0] * Bk[1];
|
||||||
|
C_1_1 += Ak[1] * Bk[1];
|
||||||
|
C_2_1 += Ak[2] * Bk[1];
|
||||||
|
C_3_1 += Ak[3] * Bk[1];
|
||||||
|
|
||||||
|
C_0_2 += Ak[0] * Bk[2];
|
||||||
|
C_1_2 += Ak[1] * Bk[2];
|
||||||
|
C_2_2 += Ak[2] * Bk[2];
|
||||||
|
C_3_2 += Ak[3] * Bk[2];
|
||||||
|
|
||||||
|
C_0_3 += Ak[0] * Bk[3];
|
||||||
|
C_1_3 += Ak[1] * Bk[3];
|
||||||
|
C_2_3 += Ak[2] * Bk[3];
|
||||||
|
C_3_3 += Ak[3] * Bk[3];
|
||||||
|
|
||||||
|
asm("");
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Load inputs for (k+2) into registers.
|
||||||
|
* First load from B.
|
||||||
|
*/
|
||||||
|
for (BLASLONG j = 0; j < VEC_COLS; j++)
|
||||||
|
Bk[j] = vec_splats(B[j + (k + 2) * COLS]);
|
||||||
|
asm("");
|
||||||
|
for (BLASLONG i = 0; i < VEC_ROWS; i++)
|
||||||
|
Ak[i] = vec_load_hinted(A + i * VLEN_FLOATS + (k + 2) * ROWS);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* As above, make sure to first schedule the loads for (k+2)
|
||||||
|
* before compute for (k+1).
|
||||||
|
*/
|
||||||
|
BARRIER_READ_BEFORE_COMPUTE(k);
|
||||||
|
|
||||||
|
/* Compute on (k+1) */
|
||||||
|
C_0_0 += Aknext[0] * Bknext[0];
|
||||||
|
C_1_0 += Aknext[1] * Bknext[0];
|
||||||
|
C_2_0 += Aknext[2] * Bknext[0];
|
||||||
|
C_3_0 += Aknext[3] * Bknext[0];
|
||||||
|
|
||||||
|
C_0_1 += Aknext[0] * Bknext[1];
|
||||||
|
C_1_1 += Aknext[1] * Bknext[1];
|
||||||
|
C_2_1 += Aknext[2] * Bknext[1];
|
||||||
|
C_3_1 += Aknext[3] * Bknext[1];
|
||||||
|
|
||||||
|
C_0_2 += Aknext[0] * Bknext[2];
|
||||||
|
C_1_2 += Aknext[1] * Bknext[2];
|
||||||
|
C_2_2 += Aknext[2] * Bknext[2];
|
||||||
|
C_3_2 += Aknext[3] * Bknext[2];
|
||||||
|
|
||||||
|
C_0_3 += Aknext[0] * Bknext[3];
|
||||||
|
C_1_3 += Aknext[1] * Bknext[3];
|
||||||
|
C_2_3 += Aknext[2] * Bknext[3];
|
||||||
|
C_3_3 += Aknext[3] * Bknext[3];
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Wrapup remaining k's */
|
||||||
|
for (; k < bk; k++) {
|
||||||
|
vector_float Ak;
|
||||||
|
|
||||||
|
#define COMPUTE_WRAPUP_ROW(ROW) \
|
||||||
|
Ak = vec_load_hinted(A + ROW * VLEN_FLOATS + k * ROWS); \
|
||||||
|
C_##ROW##_0 += Ak * B[0 + k * COLS]; \
|
||||||
|
C_##ROW##_1 += Ak * B[1 + k * COLS]; \
|
||||||
|
C_##ROW##_2 += Ak * B[2 + k * COLS]; \
|
||||||
|
C_##ROW##_3 += Ak * B[3 + k * COLS];
|
||||||
|
|
||||||
|
COMPUTE_WRAPUP_ROW(0)
|
||||||
|
COMPUTE_WRAPUP_ROW(1)
|
||||||
|
COMPUTE_WRAPUP_ROW(2)
|
||||||
|
COMPUTE_WRAPUP_ROW(3)
|
||||||
|
#undef COMPUTE_WRAPUP_ROW
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Unpack row-block of C_aux into outer C_i, multiply by
|
||||||
|
* alpha and add up (or assign for TRMM).
|
||||||
|
*/
|
||||||
|
#define WRITE_BACK_C(ROW, COL) \
|
||||||
|
do { \
|
||||||
|
vector_float *Cij = \
|
||||||
|
(vector_float *)(C + ROW * VLEN_FLOATS + COL * ldc); \
|
||||||
|
if (trmm) { \
|
||||||
|
*Cij = alpha * C_##ROW##_##COL; \
|
||||||
|
} else { \
|
||||||
|
*Cij += alpha * C_##ROW##_##COL; \
|
||||||
|
} \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
WRITE_BACK_C(0, 0); WRITE_BACK_C(0, 1); WRITE_BACK_C(0, 2); WRITE_BACK_C(0, 3);
|
||||||
|
WRITE_BACK_C(1, 0); WRITE_BACK_C(1, 1); WRITE_BACK_C(1, 2); WRITE_BACK_C(1, 3);
|
||||||
|
WRITE_BACK_C(2, 0); WRITE_BACK_C(2, 1); WRITE_BACK_C(2, 2); WRITE_BACK_C(2, 3);
|
||||||
|
WRITE_BACK_C(3, 0); WRITE_BACK_C(3, 1); WRITE_BACK_C(3, 2); WRITE_BACK_C(3, 3);
|
||||||
|
#undef WRITE_BACK_C
|
||||||
|
|
||||||
|
#undef ROWS
|
||||||
|
#undef VEC_ROWS
|
||||||
|
#undef COLS
|
||||||
|
#undef VEC_COLS
|
||||||
|
#undef BARRIER_READ_BEFORE_COMPUTE
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Handle calculation for row blocks in C_i of any size by dispatching into
|
* Handle calculation for row blocks in C_i of any size by dispatching into
|
||||||
* macro-defined (inline) functions or by deferring to a simple generic
|
* macro-defined (inline) functions or by deferring to a simple generic
|
||||||
|
@ -315,6 +572,8 @@ static inline void GEBP_block(BLASLONG m, BLASLONG n,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Dispatch into the implementation for each block size: */
|
||||||
|
|
||||||
#define BLOCK(bm, bn) \
|
#define BLOCK(bm, bn) \
|
||||||
if (m == bm && n == bn) { \
|
if (m == bm && n == bn) { \
|
||||||
GEBP_block_##bm##_##bn(A, k, B, C, ldc, alpha); \
|
GEBP_block_##bm##_##bn(A, k, B, C, ldc, alpha); \
|
||||||
|
@ -330,35 +589,11 @@ static inline void GEBP_block(BLASLONG m, BLASLONG n,
|
||||||
BLOCK(8, 4); BLOCK(8, 2); BLOCK(8, 1);
|
BLOCK(8, 4); BLOCK(8, 2); BLOCK(8, 1);
|
||||||
BLOCK(4, 4); BLOCK(4, 2); BLOCK(4, 1);
|
BLOCK(4, 4); BLOCK(4, 2); BLOCK(4, 1);
|
||||||
|
|
||||||
#ifdef DOUBLE
|
BLOCK(2, 4); BLOCK(2, 2); BLOCK(2, 1);
|
||||||
BLOCK(2, 4);
|
|
||||||
BLOCK(2, 2);
|
BLOCK(1, 4); BLOCK(1, 2); BLOCK(1, 1);
|
||||||
#endif
|
|
||||||
|
|
||||||
#undef BLOCK
|
#undef BLOCK
|
||||||
|
|
||||||
/* simple implementation for smaller block sizes: */
|
|
||||||
FLOAT Caux[m][n] __attribute__ ((aligned (16)));
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Peel off first iteration (i.e., column of A) for initializing Caux
|
|
||||||
*/
|
|
||||||
for (BLASLONG i = 0; i < m; i++)
|
|
||||||
for (BLASLONG j = 0; j < n; j++)
|
|
||||||
Caux[i][j] = A[i] * B[j];
|
|
||||||
|
|
||||||
for (BLASLONG kk = 1; kk < k; kk++)
|
|
||||||
for (BLASLONG i = 0; i < m; i++)
|
|
||||||
for (BLASLONG j = 0; j < n; j++)
|
|
||||||
Caux[i][j] += A[i + kk * m] * B[j + kk * n];
|
|
||||||
|
|
||||||
for (BLASLONG i = 0; i < m; i++)
|
|
||||||
for (BLASLONG j = 0; j < n; j++)
|
|
||||||
if (trmm) {
|
|
||||||
C[i + j * ldc] = alpha * Caux[i][j];
|
|
||||||
} else {
|
|
||||||
C[i + j * ldc] += alpha * Caux[i][j];
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
4
param.h
4
param.h
|
@ -3092,12 +3092,12 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
|
||||||
#define ZGEMM_DEFAULT_UNROLL_M 4
|
#define ZGEMM_DEFAULT_UNROLL_M 4
|
||||||
#define ZGEMM_DEFAULT_UNROLL_N 4
|
#define ZGEMM_DEFAULT_UNROLL_N 4
|
||||||
|
|
||||||
#define SGEMM_DEFAULT_P 456
|
#define SGEMM_DEFAULT_P 480
|
||||||
#define DGEMM_DEFAULT_P 320
|
#define DGEMM_DEFAULT_P 320
|
||||||
#define CGEMM_DEFAULT_P 480
|
#define CGEMM_DEFAULT_P 480
|
||||||
#define ZGEMM_DEFAULT_P 224
|
#define ZGEMM_DEFAULT_P 224
|
||||||
|
|
||||||
#define SGEMM_DEFAULT_Q 488
|
#define SGEMM_DEFAULT_Q 512
|
||||||
#define DGEMM_DEFAULT_Q 384
|
#define DGEMM_DEFAULT_Q 384
|
||||||
#define CGEMM_DEFAULT_Q 128
|
#define CGEMM_DEFAULT_Q 128
|
||||||
#define ZGEMM_DEFAULT_Q 352
|
#define ZGEMM_DEFAULT_Q 352
|
||||||
|
|
Loading…
Reference in New Issue