OpenBLAS/kernel/x86_64/sbgemm_oncopy_16_spr.c

129 lines
3.9 KiB
C

/***************************************************************************
* Copyright (c) 2021, The OpenBLAS Project
* All rights reserved.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* 3. Neither the name of the OpenBLAS project nor the names of
* its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* *****************************************************************************/
#include <immintrin.h>
#include "common.h"
typedef struct {
char palette_id;
char start_row;
char dummy0[14]; // bytes 2-15 reserved, must be zero
short tile_colsb[8];
char dummy1[16]; // bytes 32-47 reserved, must be zero
char tile_rows[8];
char dummy2[16]; // bytes 56-63 reserved, must be zero
} tilecfg;
#define T_16x32 0
#define T_16xm 1
#define T_nx32 2
#define T_nxm 3
#define TCONF(cfg, m, n) \
memset(&cfg, 0, sizeof(tilecfg)); \
cfg.palette_id = 1; \
cfg.tile_rows[T_16x32] = 16; \
cfg.tile_colsb[T_16x32] = 64; \
if (m) { \
cfg.tile_rows[T_16xm] = 16; \
cfg.tile_colsb[T_16xm] = m * 2; \
} \
if (n) { \
cfg.tile_rows[T_nx32] = n; \
cfg.tile_colsb[T_nx32] = 64; \
} \
if (m && n) { \
cfg.tile_rows[T_nxm] = n; \
cfg.tile_colsb[T_nxm] = m * 2; \
} \
_tile_loadconfig(&cfg);
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
BLASLONG i, j;
IFLOAT *aoffset, *boffset;
IFLOAT *aoffset0;
aoffset = a;
boffset = b;
BLASLONG n16 = n & ~15;
BLASLONG m32 = m & ~31;
BLASLONG m2 = m & ~1;
BLASLONG tail_m = m2 - m32;
BLASLONG tail_n = n - n16;
tilecfg cfg;
TCONF(cfg, tail_m, tail_n);
for (j = 0; j < n16; j += 16) {
aoffset0 = aoffset;
for (i = 0; i < m32; i += 32) {
_tile_loadd(T_16x32, aoffset0, lda * 2);
_tile_stored(T_16x32, boffset, 32 * 2);
aoffset0 += 32;
boffset += 32 * 16;
}
if (i < m2) {
_tile_loadd(T_16xm, aoffset0, lda * 2);
_tile_stored(T_16xm, boffset, tail_m * 2);
aoffset0 += tail_m;
boffset += tail_m * 16;
i = m2;
}
if (i < m) {
/* the tail odd k should put alone */
for (int ii = 0; ii < 16; ii++) {
*(boffset + ii) = *(aoffset0 + lda * ii);
}
boffset += 16;
}
aoffset += 16 * lda;
}
if (j < n) {
aoffset0 = aoffset;
for (i = 0; i < m32; i += 32) {
_tile_loadd(T_nx32, aoffset0, lda * 2);
_tile_stored(T_nx32, boffset, 32 * 2);
aoffset0 += 32;
boffset += 32 * tail_n;
}
if (i < m2) {
_tile_loadd(T_nxm, aoffset0, lda * 2);
_tile_stored(T_nxm, boffset, tail_m * 2);
aoffset0 += tail_m;
boffset += tail_m * tail_n;
}
if (i < m) {
for (int ii = 0; ii < tail_n; ii++) {
*(boffset + ii) = *(aoffset0 + lda * ii);
}
}
}
return 0;
}