The code for SGEMM 16x4 and DGEMM 8x4 blocks on z14 and z15 uses explicit unrolling and interleaving to improve performance. The code employs an empty inline asm statement with operands that constrain the compiler's instruction scheduling and thereby enforce proper overlapping of load and compute phases. Fix an ifdef to apply that for clang builds, as well. Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
711 lines
26 KiB
C
711 lines
26 KiB
C
/*
|
|
* Copyright (c) IBM Corporation 2020.
|
|
* All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are
|
|
* met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
*
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in
|
|
* the documentation and/or other materials provided with the
|
|
* distribution.
|
|
* 3. Neither the name of the OpenBLAS project nor the names of
|
|
* its contributors may be used to endorse or promote products
|
|
* derived from this software without specific prior written
|
|
* permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|
* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
#include "common.h"
|
|
#include <vecintrin.h>
|
|
|
|
#include <stdbool.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
|
|
#ifdef COMPLEX
|
|
#error "Handling for complex numbers is not supported in this kernel"
|
|
#endif
|
|
|
|
#ifdef DOUBLE
|
|
#define UNROLL_M DGEMM_DEFAULT_UNROLL_M
|
|
#define UNROLL_N DGEMM_DEFAULT_UNROLL_N
|
|
#else
|
|
#define UNROLL_M SGEMM_DEFAULT_UNROLL_M
|
|
#define UNROLL_N SGEMM_DEFAULT_UNROLL_N
|
|
#endif
|
|
|
|
static const size_t unroll_m = UNROLL_M;
|
|
static const size_t unroll_n = UNROLL_N;
|
|
|
|
/* Handling of triangular matrices */
|
|
#ifdef TRMMKERNEL
|
|
static const bool trmm = true;
|
|
static const bool left =
|
|
#ifdef LEFT
|
|
true;
|
|
#else
|
|
false;
|
|
#endif
|
|
|
|
static const bool backwards =
|
|
#if defined(LEFT) != defined(TRANSA)
|
|
true;
|
|
#else
|
|
false;
|
|
#endif
|
|
|
|
#else
|
|
static const bool trmm = false;
|
|
static const bool left = false;
|
|
static const bool backwards = false;
|
|
#endif /* TRMMKERNEL */
|
|
|
|
/*
|
|
* Background:
|
|
*
|
|
* The algorithm of GotoBLAS / OpenBLAS breaks down the matrix multiplication
|
|
* problem by splitting all matrices into partitions multiple times, so that the
|
|
* submatrices fit into the L1 or L2 caches. As a result, each multiplication of
|
|
* submatrices can stream data fast from L1 and L2 caches. Inbetween, it copies
|
|
* and rearranges the submatrices to enable contiguous memory accesses to
|
|
* improve locality in both caches and TLBs.
|
|
*
|
|
* At the heart of the algorithm is this kernel, which multiplies, a "Block
|
|
* matrix" A (small dimensions) with a "Panel matrix" B (number of rows is
|
|
* small) and adds the result into a "Panel matrix" C; GotoBLAS calls this
|
|
* operation GEBP. This kernel further partitions GEBP twice, such that (1)
|
|
* submatrices of C and B fit into the L1 caches (GEBP_column_block) and (2) a
|
|
* block of C fits into the registers, while multiplying panels from A and B
|
|
* streamed from the L2 and L1 cache, respectively (GEBP_block).
|
|
*
|
|
*
|
|
* Algorithm GEBP(A, B, C, m, n, k, alpha):
|
|
*
|
|
* The problem is calculating C += alpha * (A * B)
|
|
* C is an m x n matrix, A is an m x k matrix, B is an k x n matrix.
|
|
*
|
|
* - C is in column-major-order, with an offset of ldc to the element in the
|
|
* next column (same row).
|
|
* - A is in row-major-order yet stores SGEMM_UNROLL_M elements of each column
|
|
* contiguously while walking along rows.
|
|
* - B is in column-major-order but packs SGEMM_UNROLL_N elements of a row
|
|
* contiguously.
|
|
* If the numbers of rows and columns are not multiples of SGEMM_UNROLL_M or
|
|
* SGEMM_UNROLL_N, the remaining elements are arranged in blocks with power-of-2
|
|
* dimensions (e.g., 5 remaining columns would be in a block-of-4 and a
|
|
* block-of-1).
|
|
*
|
|
* Note that packing A and B into that form is taken care of by the caller in
|
|
* driver/level3/level3.c (actually done by "copy kernels").
|
|
*
|
|
* Steps:
|
|
* - Partition C and B into blocks of n_r (SGEMM_UNROLL_N) columns, C_j and B_j.
|
|
* Now, B_j should fit into the L1 cache.
|
|
* - For each partition, calculate C_j += alpha * (A * B_j) by
|
|
* (1) Calculate C_aux := A * B_j (see below)
|
|
* (2) unpack C_j = C_j + alpha * C_aux
|
|
*
|
|
*
|
|
* Algorithm for Calculating C_aux:
|
|
*
|
|
* - Further partition C_aux and A into groups of m_r (SGEMM_UNROLL_M) rows,
|
|
* such that the m_r x n_r-submatrix of C_aux can be held in registers. Each
|
|
* submatrix of C_aux can be calculated independently, and the registers are
|
|
* added back into C_j.
|
|
*
|
|
* - For each row-block of C_aux:
|
|
* (uses a row block of A and full B_j)
|
|
* - stream over all columns of A, multiply with elements from B and
|
|
* accumulate in registers. (use different inner-kernels to exploit
|
|
* vectorization for varying block sizes)
|
|
* - add alpha * row block of C_aux back into C_j.
|
|
*
|
|
* Note that there are additional mechanics for handling triangular matrices,
|
|
* calculating B := alpha (A * B) where either of the matrices A or B can be
|
|
* triangular. In case of A, the macro "LEFT" is defined. In addition, A can
|
|
* optionally be transposed.
|
|
* The code effectively skips an "offset" number of columns in A and rows of B
|
|
* in each block, to save unnecessary work by exploiting the triangular nature.
|
|
* To handle all cases, the code discerns (1) a "left" mode when A is triangular
|
|
* and (2) "forward" / "backwards" modes where only the first "offset"
|
|
* columns/rows of A/B are used or where the first "offset" columns/rows are
|
|
* skipped, respectively.
|
|
*
|
|
* Reference:
|
|
*
|
|
* The summary above is based on staring at various kernel implementations and:
|
|
* K. Goto and R. A. Van de Geijn, Anatomy of High-Performance Matrix
|
|
* Multiplication, in ACM Transactions of Mathematical Software, Vol. 34, No.
|
|
* 3, May 2008.
|
|
*/
|
|
|
|
#define VLEN_BYTES 16
|
|
#define VLEN_FLOATS (VLEN_BYTES / sizeof(FLOAT))
|
|
|
|
typedef FLOAT vector_float __attribute__ ((vector_size (16)));
|
|
|
|
/**
|
|
* Load a vector into register, and hint on 8-byte alignment to improve
|
|
* performance. gcc-9 and newer will create these hints by itself. For older
|
|
* compiler versions, use inline assembly to explicitly express the hint.
|
|
* Provide explicit hex encoding to cater for binutils versions that do not know
|
|
* about vector-load with alignment hints yet.
|
|
*
|
|
* Note that, for block sizes where we apply vectorization, vectors in A will
|
|
* always be 8-byte aligned.
|
|
*/
|
|
static inline vector_float vec_load_hinted(FLOAT const *restrict a) {
|
|
vector_float const *restrict addr = (vector_float const *restrict)a;
|
|
vector_float y;
|
|
|
|
#if __GNUC__ < 9 && !defined(__clang__)
|
|
// hex-encode vl %[out],%[addr],3
|
|
asm(".insn vrx,0xe70000003006,%[out],%[addr],3"
|
|
: [ out ] "=v"(y)
|
|
: [ addr ] "R"(*addr));
|
|
#else
|
|
y = *addr;
|
|
#endif
|
|
|
|
return y;
|
|
}
|
|
|
|
/**
|
|
* Calculate for a row-block in C_i of size ROWSxCOLS using vector intrinsics.
|
|
*
|
|
* @param[in] A Pointer current block of input matrix A.
|
|
* @param[in] k Number of columns in A.
|
|
* @param[in] B Pointer current block of input matrix B.
|
|
* @param[inout] C Pointer current block of output matrix C.
|
|
* @param[in] ldc Offset between elements in adjacent columns in C.
|
|
* @param[in] alpha Scalar factor.
|
|
*/
|
|
#define VECTOR_BLOCK(ROWS, COLS) \
|
|
static inline void GEBP_block_##ROWS##_##COLS( \
|
|
FLOAT const *restrict A, BLASLONG bk, FLOAT const *restrict B, \
|
|
FLOAT *restrict C, BLASLONG ldc, FLOAT alpha) { \
|
|
_Static_assert( \
|
|
ROWS % VLEN_FLOATS == 0, \
|
|
"rows in block must be multiples of vector length"); \
|
|
vector_float Caux[ROWS / VLEN_FLOATS][COLS]; \
|
|
\
|
|
for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \
|
|
vector_float A0 = \
|
|
vec_load_hinted(A + i * VLEN_FLOATS); \
|
|
for (BLASLONG j = 0; j < COLS; j++) \
|
|
Caux[i][j] = A0 * B[j]; \
|
|
} \
|
|
\
|
|
/* \
|
|
* Stream over the row-block of A, which is packed \
|
|
* column-by-column, multiply by coefficients in B and add up \
|
|
* into temporaries Caux (which the compiler will hold in \
|
|
* registers). Vectorization: Multiply column vectors from A \
|
|
* with scalars from B and add up in column vectors of Caux. \
|
|
* That equates to unrolling the loop over rows (in i) and \
|
|
* executing each unrolled iteration as a vector element. \
|
|
*/ \
|
|
for (BLASLONG k = 1; k < bk; k++) { \
|
|
for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \
|
|
vector_float Ak = vec_load_hinted( \
|
|
A + i * VLEN_FLOATS + k * ROWS); \
|
|
\
|
|
for (BLASLONG j = 0; j < COLS; j++) \
|
|
Caux[i][j] += Ak * B[j + k * COLS]; \
|
|
} \
|
|
} \
|
|
\
|
|
/* \
|
|
* Unpack row-block of C_aux into outer C_i, multiply by \
|
|
* alpha and add up. \
|
|
*/ \
|
|
for (BLASLONG j = 0; j < COLS; j++) { \
|
|
for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \
|
|
vector_float *C_ij = \
|
|
(vector_float *)(C + i * VLEN_FLOATS + \
|
|
j * ldc); \
|
|
if (trmm) { \
|
|
*C_ij = alpha * Caux[i][j]; \
|
|
} else { \
|
|
*C_ij += alpha * Caux[i][j]; \
|
|
} \
|
|
} \
|
|
} \
|
|
}
|
|
|
|
|
|
#if UNROLL_M == 16
|
|
VECTOR_BLOCK(16, 2)
|
|
VECTOR_BLOCK(16, 1)
|
|
#endif
|
|
#if UNROLL_N == 8
|
|
VECTOR_BLOCK(8, 8)
|
|
VECTOR_BLOCK(4, 8)
|
|
#endif
|
|
#ifndef DOUBLE
|
|
VECTOR_BLOCK(8, 4)
|
|
#endif
|
|
VECTOR_BLOCK(8, 2)
|
|
VECTOR_BLOCK(8, 1)
|
|
VECTOR_BLOCK(4, 4)
|
|
VECTOR_BLOCK(4, 2)
|
|
VECTOR_BLOCK(4, 1)
|
|
|
|
/**
|
|
* Calculate for a row-block in C_i of size ROWSxCOLS using scalar operations.
|
|
* Simple implementation for smaller block sizes
|
|
*
|
|
* @param[in] A Pointer current block of input matrix A.
|
|
* @param[in] k Number of columns in A.
|
|
* @param[in] B Pointer current block of input matrix B.
|
|
* @param[inout] C Pointer current block of output matrix C.
|
|
* @param[in] ldc Offset between elements in adjacent columns in C.
|
|
* @param[in] alpha Scalar factor.
|
|
*/
|
|
#define SCALAR_BLOCK(ROWS, COLS) \
|
|
static inline void GEBP_block_##ROWS##_##COLS( \
|
|
FLOAT const *restrict A, BLASLONG k, FLOAT const *restrict B, \
|
|
FLOAT *restrict C, BLASLONG ldc, FLOAT alpha) { \
|
|
FLOAT Caux[ROWS][COLS] __attribute__((aligned(16))); \
|
|
\
|
|
/* \
|
|
* Peel off first iteration (i.e., column of A) for \
|
|
* initializing Caux \
|
|
*/ \
|
|
for (BLASLONG i = 0; i < ROWS; i++) \
|
|
for (BLASLONG j = 0; j < COLS; j++) Caux[i][j] = A[i] * B[j]; \
|
|
\
|
|
for (BLASLONG kk = 1; kk < k; kk++) \
|
|
for (BLASLONG i = 0; i < ROWS; i++) \
|
|
for (BLASLONG j = 0; j < COLS; j++) \
|
|
Caux[i][j] += A[i + kk * ROWS] * B[j + kk * COLS]; \
|
|
\
|
|
for (BLASLONG i = 0; i < ROWS; i++) \
|
|
for (BLASLONG j = 0; j < COLS; j++) \
|
|
if (trmm) { \
|
|
C[i + j * ldc] = alpha * Caux[i][j]; \
|
|
} else { \
|
|
C[i + j * ldc] += alpha * Caux[i][j]; \
|
|
} \
|
|
}
|
|
|
|
#ifdef DOUBLE
|
|
VECTOR_BLOCK(2, 4)
|
|
VECTOR_BLOCK(2, 2)
|
|
VECTOR_BLOCK(2, 1)
|
|
#else
|
|
SCALAR_BLOCK(2, 4)
|
|
SCALAR_BLOCK(2, 2)
|
|
SCALAR_BLOCK(2, 1)
|
|
#endif
|
|
|
|
SCALAR_BLOCK(1, 4)
|
|
SCALAR_BLOCK(1, 2)
|
|
SCALAR_BLOCK(1, 1)
|
|
|
|
|
|
/**
|
|
* Calculate a row-block that fits 4x4 vector registers using a loop
|
|
* unrolled-by-2 with explicit interleaving to better overlap loads and
|
|
* computation.
|
|
* This function fits 16x4 blocks for SGEMM and 8x4 blocks for DGEMM.
|
|
*/
|
|
#ifdef DOUBLE
|
|
static inline void GEBP_block_8_4(
|
|
#else // float
|
|
static inline void GEBP_block_16_4(
|
|
#endif
|
|
FLOAT const *restrict A, BLASLONG bk, FLOAT const *restrict B,
|
|
FLOAT *restrict C, BLASLONG ldc, FLOAT alpha) {
|
|
#define VEC_ROWS 4
|
|
#define VEC_COLS 4
|
|
#define ROWS VEC_ROWS * VLEN_FLOATS
|
|
#define COLS (VEC_COLS)
|
|
|
|
/*
|
|
* Hold intermediate results in vector registers.
|
|
* Since we need to force the compiler's hand in places, we need to use
|
|
* individual variables in contrast to the generic implementation's
|
|
* arrays.
|
|
*/
|
|
#define INIT_ROW_OF_C(ROW) \
|
|
vector_float A##ROW = vec_load_hinted(A + ROW * VLEN_FLOATS); \
|
|
vector_float C_##ROW##_0 = A##ROW * B[0]; \
|
|
vector_float C_##ROW##_1 = A##ROW * B[1]; \
|
|
vector_float C_##ROW##_2 = A##ROW * B[2]; \
|
|
vector_float C_##ROW##_3 = A##ROW * B[3];
|
|
|
|
INIT_ROW_OF_C(0)
|
|
INIT_ROW_OF_C(1)
|
|
INIT_ROW_OF_C(2)
|
|
INIT_ROW_OF_C(3)
|
|
#undef INIT_ROW_OF_C
|
|
|
|
if (bk > 1) {
|
|
BLASLONG k = 1;
|
|
vector_float Ak[VEC_ROWS], Aknext[VEC_ROWS];
|
|
vector_float Bk[VEC_COLS], Bknext[VEC_COLS];
|
|
|
|
/*
|
|
* Note that in several places, we enforce an instruction
|
|
* sequence that we identified empirically by utilizing dummy
|
|
* asm statements.
|
|
*/
|
|
|
|
for (BLASLONG j = 0; j < VEC_COLS; j++)
|
|
Bk[j] = vec_splats(B[j + k * COLS]);
|
|
asm("");
|
|
|
|
for (BLASLONG i = 0; i < VEC_ROWS; i++)
|
|
Ak[i] = vec_load_hinted(A + i * VLEN_FLOATS + k * ROWS);
|
|
|
|
for (; k < (bk - 2); k += 2) {
|
|
/*
|
|
* Load inputs for (k+1) into registers.
|
|
* Loading from B first is advantageous.
|
|
*/
|
|
for (BLASLONG j = 0; j < VEC_COLS; j++)
|
|
Bknext[j] = vec_splats(B[j + (k + 1) * COLS]);
|
|
asm("");
|
|
for (BLASLONG i = 0; i < VEC_ROWS; i++)
|
|
Aknext[i] = vec_load_hinted(A + i * VLEN_FLOATS +
|
|
(k + 1) * ROWS);
|
|
|
|
/*
|
|
* To achieve better instruction-level parallelism,
|
|
* make sure to first load input data for (k+1) before
|
|
* initiating compute for k. We enforce that ordering
|
|
* with a pseudo asm statement.
|
|
* Note that we need to massage this particular "barrier"
|
|
* depending on the gcc version.
|
|
*/
|
|
#if __GNUC__ > 7 || defined(__clang__)
|
|
#define BARRIER_READ_BEFORE_COMPUTE(SUFFIX) \
|
|
do { \
|
|
asm("" \
|
|
: "+v"(C_0_0), "+v"(C_0_1), "+v"(C_0_2), "+v"(C_0_3), "+v"(C_1_0), \
|
|
"+v"(C_1_1), "+v"(C_1_2), "+v"(C_1_3) \
|
|
: "v"(B##SUFFIX[0]), "v"(B##SUFFIX[1]), "v"(B##SUFFIX[2]), \
|
|
"v"(B##SUFFIX[3]), "v"(A##SUFFIX[0]), "v"(A##SUFFIX[1]), \
|
|
"v"(A##SUFFIX[2]), "v"(A##SUFFIX[3])); \
|
|
asm("" \
|
|
: "+v"(C_2_0), "+v"(C_2_1), "+v"(C_2_2), "+v"(C_2_3), "+v"(C_3_0), \
|
|
"+v"(C_3_1), "+v"(C_3_2), "+v"(C_3_3) \
|
|
: "v"(B##SUFFIX[0]), "v"(B##SUFFIX[1]), "v"(B##SUFFIX[2]), \
|
|
"v"(B##SUFFIX[3]), "v"(A##SUFFIX[0]), "v"(A##SUFFIX[1]), \
|
|
"v"(A##SUFFIX[2]), "v"(A##SUFFIX[3])); \
|
|
} while (0)
|
|
#else // __GNUC__ <= 7
|
|
#define BARRIER_READ_BEFORE_COMPUTE(SUFFIX) \
|
|
do { \
|
|
asm(""); \
|
|
} while (0)
|
|
#endif
|
|
|
|
BARRIER_READ_BEFORE_COMPUTE(knext);
|
|
|
|
/* Compute for (k) */
|
|
C_0_0 += Ak[0] * Bk[0];
|
|
C_1_0 += Ak[1] * Bk[0];
|
|
C_2_0 += Ak[2] * Bk[0];
|
|
C_3_0 += Ak[3] * Bk[0];
|
|
|
|
C_0_1 += Ak[0] * Bk[1];
|
|
C_1_1 += Ak[1] * Bk[1];
|
|
C_2_1 += Ak[2] * Bk[1];
|
|
C_3_1 += Ak[3] * Bk[1];
|
|
|
|
C_0_2 += Ak[0] * Bk[2];
|
|
C_1_2 += Ak[1] * Bk[2];
|
|
C_2_2 += Ak[2] * Bk[2];
|
|
C_3_2 += Ak[3] * Bk[2];
|
|
|
|
C_0_3 += Ak[0] * Bk[3];
|
|
C_1_3 += Ak[1] * Bk[3];
|
|
C_2_3 += Ak[2] * Bk[3];
|
|
C_3_3 += Ak[3] * Bk[3];
|
|
|
|
asm("");
|
|
|
|
/*
|
|
* Load inputs for (k+2) into registers.
|
|
* First load from B.
|
|
*/
|
|
for (BLASLONG j = 0; j < VEC_COLS; j++)
|
|
Bk[j] = vec_splats(B[j + (k + 2) * COLS]);
|
|
asm("");
|
|
for (BLASLONG i = 0; i < VEC_ROWS; i++)
|
|
Ak[i] = vec_load_hinted(A + i * VLEN_FLOATS + (k + 2) * ROWS);
|
|
|
|
/*
|
|
* As above, make sure to first schedule the loads for (k+2)
|
|
* before compute for (k+1).
|
|
*/
|
|
BARRIER_READ_BEFORE_COMPUTE(k);
|
|
|
|
/* Compute on (k+1) */
|
|
C_0_0 += Aknext[0] * Bknext[0];
|
|
C_1_0 += Aknext[1] * Bknext[0];
|
|
C_2_0 += Aknext[2] * Bknext[0];
|
|
C_3_0 += Aknext[3] * Bknext[0];
|
|
|
|
C_0_1 += Aknext[0] * Bknext[1];
|
|
C_1_1 += Aknext[1] * Bknext[1];
|
|
C_2_1 += Aknext[2] * Bknext[1];
|
|
C_3_1 += Aknext[3] * Bknext[1];
|
|
|
|
C_0_2 += Aknext[0] * Bknext[2];
|
|
C_1_2 += Aknext[1] * Bknext[2];
|
|
C_2_2 += Aknext[2] * Bknext[2];
|
|
C_3_2 += Aknext[3] * Bknext[2];
|
|
|
|
C_0_3 += Aknext[0] * Bknext[3];
|
|
C_1_3 += Aknext[1] * Bknext[3];
|
|
C_2_3 += Aknext[2] * Bknext[3];
|
|
C_3_3 += Aknext[3] * Bknext[3];
|
|
}
|
|
|
|
/* Wrapup remaining k's */
|
|
for (; k < bk; k++) {
|
|
vector_float Ak;
|
|
|
|
#define COMPUTE_WRAPUP_ROW(ROW) \
|
|
Ak = vec_load_hinted(A + ROW * VLEN_FLOATS + k * ROWS); \
|
|
C_##ROW##_0 += Ak * B[0 + k * COLS]; \
|
|
C_##ROW##_1 += Ak * B[1 + k * COLS]; \
|
|
C_##ROW##_2 += Ak * B[2 + k * COLS]; \
|
|
C_##ROW##_3 += Ak * B[3 + k * COLS];
|
|
|
|
COMPUTE_WRAPUP_ROW(0)
|
|
COMPUTE_WRAPUP_ROW(1)
|
|
COMPUTE_WRAPUP_ROW(2)
|
|
COMPUTE_WRAPUP_ROW(3)
|
|
#undef COMPUTE_WRAPUP_ROW
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Unpack row-block of C_aux into outer C_i, multiply by
|
|
* alpha and add up (or assign for TRMM).
|
|
*/
|
|
#define WRITE_BACK_C(ROW, COL) \
|
|
do { \
|
|
vector_float *Cij = \
|
|
(vector_float *)(C + ROW * VLEN_FLOATS + COL * ldc); \
|
|
if (trmm) { \
|
|
*Cij = alpha * C_##ROW##_##COL; \
|
|
} else { \
|
|
*Cij += alpha * C_##ROW##_##COL; \
|
|
} \
|
|
} while (0)
|
|
|
|
WRITE_BACK_C(0, 0); WRITE_BACK_C(0, 1); WRITE_BACK_C(0, 2); WRITE_BACK_C(0, 3);
|
|
WRITE_BACK_C(1, 0); WRITE_BACK_C(1, 1); WRITE_BACK_C(1, 2); WRITE_BACK_C(1, 3);
|
|
WRITE_BACK_C(2, 0); WRITE_BACK_C(2, 1); WRITE_BACK_C(2, 2); WRITE_BACK_C(2, 3);
|
|
WRITE_BACK_C(3, 0); WRITE_BACK_C(3, 1); WRITE_BACK_C(3, 2); WRITE_BACK_C(3, 3);
|
|
#undef WRITE_BACK_C
|
|
|
|
#undef ROWS
|
|
#undef VEC_ROWS
|
|
#undef COLS
|
|
#undef VEC_COLS
|
|
#undef BARRIER_READ_BEFORE_COMPUTE
|
|
}
|
|
|
|
/**
|
|
* Handle calculation for row blocks in C_i of any size by dispatching into
|
|
* macro-defined (inline) functions or by deferring to a simple generic
|
|
* implementation. Note that the compiler can remove this awkward-looking
|
|
* dispatching code while inlineing.
|
|
*
|
|
* @param[in] m Number of rows in block C_i.
|
|
* @param[in] n Number of columns in block C_i.
|
|
* @param[in] first_row Index of first row of the block C_i (relative to C).
|
|
* @param[in] A Pointer to input matrix A (note: all of it).
|
|
* @param[in] k Number of columns in A and rows in B.
|
|
* @param[in] B Pointer to current column block (panel) of input matrix B.
|
|
* @param[inout] C Pointer to current column block (panel) of output matrix C.
|
|
* @param[in] ldc Offset between elements in adjacent columns in C.
|
|
* @param[in] alpha Scalar factor.
|
|
* @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices).
|
|
* @param[in] off Running offset for handling triangular matrices.
|
|
*/
|
|
static inline void GEBP_block(BLASLONG m, BLASLONG n,
|
|
BLASLONG first_row,
|
|
const FLOAT * restrict A, BLASLONG k,
|
|
const FLOAT * restrict B,
|
|
FLOAT *restrict C, BLASLONG ldc,
|
|
FLOAT alpha,
|
|
BLASLONG offset, BLASLONG off)
|
|
{
|
|
if (trmm && left)
|
|
off = offset + first_row;
|
|
|
|
A += first_row * k;
|
|
C += first_row;
|
|
|
|
if (trmm) {
|
|
if (backwards) {
|
|
A += off * m;
|
|
B += off * n;
|
|
k -= off;
|
|
} else {
|
|
if (left) {
|
|
k = off + m;
|
|
} else {
|
|
k = off + n;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Dispatch into the implementation for each block size: */
|
|
|
|
#define BLOCK(bm, bn) \
|
|
if (m == bm && n == bn) { \
|
|
GEBP_block_##bm##_##bn(A, k, B, C, ldc, alpha); \
|
|
return; \
|
|
}
|
|
|
|
#if UNROLL_M == 16
|
|
BLOCK(16, 4); BLOCK(16, 2); BLOCK(16, 1);
|
|
#endif
|
|
#if UNROLL_N == 8
|
|
BLOCK(8, 8); BLOCK(4, 8);
|
|
#endif
|
|
BLOCK(8, 4); BLOCK(8, 2); BLOCK(8, 1);
|
|
BLOCK(4, 4); BLOCK(4, 2); BLOCK(4, 1);
|
|
|
|
BLOCK(2, 4); BLOCK(2, 2); BLOCK(2, 1);
|
|
|
|
BLOCK(1, 4); BLOCK(1, 2); BLOCK(1, 1);
|
|
|
|
#undef BLOCK
|
|
}
|
|
|
|
/**
|
|
* Handle a column block (panel) of C and B while calculating C += alpha(A * B).
|
|
*
|
|
* @param[in] num_cols Number of columns in the block (in C and B).
|
|
* @param[in] first_col First column of the current block (in C and B).
|
|
* @param[in] A Pointer to input matrix A.
|
|
* @param[in] bk Number of columns in A and rows in B.
|
|
* @param[in] B Pointer to input matrix B (note: all of it).
|
|
* @param[in] bm Number of rows in C and A.
|
|
* @param[inout] C Pointer to output matrix C (note: all of it).
|
|
* @param[in] ldc Offset between elements in adjacent columns in C.
|
|
* @param[in] alpha Scalar factor.
|
|
* @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices).
|
|
*/
|
|
static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col,
|
|
const FLOAT *restrict A, BLASLONG bk,
|
|
const FLOAT *restrict B, BLASLONG bm,
|
|
FLOAT *restrict C, BLASLONG ldc,
|
|
FLOAT alpha,
|
|
BLASLONG const offset) {
|
|
|
|
FLOAT *restrict C_i = C + first_col * ldc;
|
|
/*
|
|
* B is in column-order with n_r packed row elements, which does
|
|
* not matter -- we always move in full such blocks of
|
|
* column*pack
|
|
*/
|
|
const FLOAT *restrict B_i = B + first_col * bk;
|
|
|
|
BLASLONG off = 0;
|
|
if (trmm) {
|
|
if (left) {
|
|
off = offset;
|
|
} else {
|
|
off = -offset + first_col;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Calculate C_aux := A * B_j
|
|
* then unpack C_i += alpha * C_aux.
|
|
*
|
|
* For that purpose, further partition C_aux and A into blocks
|
|
* of m_r (unroll_m) rows, or powers-of-2 if smaller.
|
|
*/
|
|
BLASLONG row = 0;
|
|
for (BLASLONG block_size = unroll_m; block_size > 0; block_size /= 2)
|
|
for (; bm - row >= block_size; row += block_size)
|
|
GEBP_block(block_size, num_cols, row, A, bk, B_i, C_i,
|
|
ldc, alpha, offset, off);
|
|
}
|
|
|
|
/**
|
|
* Inner kernel for matrix-matrix multiplication. C += alpha (A * B)
|
|
* where C is an m-by-n matrix, A is m-by-k and B is k-by-n. Note that A, B, and
|
|
* C are pointers to submatrices of the actual matrices.
|
|
*
|
|
* For triangular matrix multiplication, calculate B := alpha (A * B) where A
|
|
* or B can be triangular (in case of A, the macro LEFT will be defined).
|
|
*
|
|
* @param[in] bm Number of rows in C and A.
|
|
* @param[in] bn Number of columns in C and B.
|
|
* @param[in] bk Number of columns in A and rows in B.
|
|
* @param[in] alpha Scalar factor.
|
|
* @param[in] ba Pointer to input matrix A.
|
|
* @param[in] bb Pointer to input matrix B.
|
|
* @param[inout] C Pointer to output matrix C.
|
|
* @param[in] ldc Offset between elements in adjacent columns in C.
|
|
* @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices).
|
|
* @returns 0 on success.
|
|
*/
|
|
int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha,
|
|
FLOAT *restrict ba, FLOAT *restrict bb,
|
|
FLOAT *restrict C, BLASLONG ldc
|
|
#ifdef TRMMKERNEL
|
|
, BLASLONG offset
|
|
#endif
|
|
)
|
|
{
|
|
if ( (bm == 0) || (bn == 0) || (bk == 0) || (alpha == ZERO))
|
|
return 0;
|
|
|
|
/*
|
|
* interface code allocates buffers for ba and bb at page
|
|
* granularity (i.e., using mmap(MAP_ANONYMOUS), so enable the compiler
|
|
* to make use of the fact in vector load operations.
|
|
*/
|
|
ba = __builtin_assume_aligned(ba, 16);
|
|
bb = __builtin_assume_aligned(bb, 16);
|
|
|
|
/*
|
|
* Use offset and off even when compiled as SGEMMKERNEL to simplify
|
|
* function signatures and function calls.
|
|
*/
|
|
#ifndef TRMMKERNEL
|
|
BLASLONG const offset = 0;
|
|
#endif
|
|
|
|
/*
|
|
* Partition B and C into blocks of n_r (unroll_n) columns, called B_i
|
|
* and C_i. For each partition, calculate C_i += alpha * (A * B_j).
|
|
*
|
|
* For remaining columns that do not fill up a block of n_r, iteratively
|
|
* use smaller block sizes of powers of 2.
|
|
*/
|
|
BLASLONG col = 0;
|
|
for (BLASLONG block_size = unroll_n; block_size > 0; block_size /= 2)
|
|
for (; bn - col >= block_size; col += block_size)
|
|
GEBP_column_block(block_size, col, ba, bk, bb, bm, C, ldc, alpha, offset);
|
|
|
|
return 0;
|
|
}
|