diff --git a/kernel/arm64/KERNEL.NEOVERSEN2 b/kernel/arm64/KERNEL.NEOVERSEN2
index ae386d6e1..467d68f4e 100644
--- a/kernel/arm64/KERNEL.NEOVERSEN2
+++ b/kernel/arm64/KERNEL.NEOVERSEN2
@@ -190,10 +190,10 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
 SBGEMM_BETA = sbgemm_beta_neoversen2.c
 SBGEMMKERNEL = sbgemm_kernel_$(SBGEMM_UNROLL_M)x$(SBGEMM_UNROLL_N)_neoversen2.c
-SBGEMMINCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_N)_neoversen2.c
+SBGEMMINCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_M)_neoversen2.c
 SBGEMMITCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_M)_neoversen2.c
 SBGEMMONCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_N)_neoversen2.c
-SBGEMMOTCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_M)_neoversen2.c
+SBGEMMOTCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_N)_neoversen2.c
 SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX)
 SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX)
 SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX)
diff --git a/kernel/arm64/sbgemm_ncopy_8_neoversen2.c b/kernel/arm64/sbgemm_ncopy_8_neoversen2.c
new file mode 100644
index 000000000..ff41e487c
--- /dev/null
+++ b/kernel/arm64/sbgemm_ncopy_8_neoversen2.c
@@ -0,0 +1,195 @@
+/***************************************************************************
+ * Copyright (c) 2022, The OpenBLAS Project
+ * All rights reserved.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * 3. Neither the name of the OpenBLAS project nor the names of
+ * its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ * *****************************************************************************/
+
+#include <arm_sve.h>
+
+#include "common.h"
+
+/*
+ * Packs the trailing `rest` (1..3) elements of each of `cols` columns.
+ *
+ * Every column contributes one group of four elements to `dst`; slots at or
+ * past `rest` are zero-filled, so the tail keeps the same
+ * four-elements-per-column layout as the vectorised groups.
+ * Returns `dst` advanced past the 4 * cols elements written.
+ */
+static inline IFLOAT *pack_tail(IFLOAT *dst, IFLOAT *const *src, BLASLONG cols,
+                                BLASLONG rest) {
+  for (BLASLONG col = 0; col < cols; col++) {
+    for (BLASLONG k = 0; k < 4; k++) {
+      dst[4 * col + k] = k < rest ? src[col][k] : 0;
+    }
+  }
+  return dst + 4 * cols;
+}
+
+/*
+ * Packs an m x n block of `a` into `b`. Column j starts at a + j * lda and
+ * its m elements are contiguous.
+ *
+ * Columns are consumed in panels of 8, then 4, 2 and 1 for what is left of n.
+ * Inside a panel the output is a sequence of groups, each holding four
+ * consecutive elements of every column in turn. When m is not a multiple of
+ * four, the last group is zero-padded up to four elements per column.
+ *
+ * Always returns 0.
+ */
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
+  IFLOAT *a_offset;
+  IFLOAT *a_offsetx[8];
+  IFLOAT *b_offset;
+  a_offset = a;
+  b_offset = b;
+
+  /*
+   * Enables the first four 16-bit lanes of each 128-bit segment, so with
+   * 128-bit vectors every load/store below moves exactly four elements.
+   * NOTE(review): wider SVE vectors would enable further lanes; presumably
+   * acceptable because this kernel targets Neoverse N2 - confirm.
+   */
+  svbool_t pg16 = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0);
+  svbfloat16_t v0, v1, v2, v3, v4, v5, v6, v7;
+  const BLASLONG rest = m & 3;
+
+  /* Panels of eight columns. */
+  for (BLASLONG j = 0; j < n / 8; j++) {
+    a_offsetx[0] = a_offset;
+    a_offsetx[1] = a_offsetx[0] + lda;
+    a_offsetx[2] = a_offsetx[1] + lda;
+    a_offsetx[3] = a_offsetx[2] + lda;
+    a_offsetx[4] = a_offsetx[3] + lda;
+    a_offsetx[5] = a_offsetx[4] + lda;
+    a_offsetx[6] = a_offsetx[5] + lda;
+    a_offsetx[7] = a_offsetx[6] + lda;
+    a_offset += 8 * lda;
+
+    for (BLASLONG i = 0; i < m / 4; i++) {
+      v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]);
+      v1 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[1]);
+      v2 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[2]);
+      v3 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[3]);
+      v4 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[4]);
+      v5 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[5]);
+      v6 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[6]);
+      v7 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[7]);
+
+      svst1_bf16(pg16, (bfloat16_t *)b_offset, v0);
+      svst1_bf16(pg16, (bfloat16_t *)b_offset + 4, v1);
+      svst1_bf16(pg16, (bfloat16_t *)b_offset + 8, v2);
+      svst1_bf16(pg16, (bfloat16_t *)b_offset + 12, v3);
+      svst1_bf16(pg16, (bfloat16_t *)b_offset + 16, v4);
+      svst1_bf16(pg16, (bfloat16_t *)b_offset + 20, v5);
+      svst1_bf16(pg16, (bfloat16_t *)b_offset + 24, v6);
+      svst1_bf16(pg16, (bfloat16_t *)b_offset + 28, v7);
+
+      b_offset += 32;
+      a_offsetx[0] += 4;
+      a_offsetx[1] += 4;
+      a_offsetx[2] += 4;
+      a_offsetx[3] += 4;
+      a_offsetx[4] += 4;
+      a_offsetx[5] += 4;
+      a_offsetx[6] += 4;
+      a_offsetx[7] += 4;
+    }
+
+    /* All eight columns need their tail copied, not just the first four. */
+    if (rest) {
+      b_offset = pack_tail(b_offset, a_offsetx, 8, rest);
+    }
+  }
+
+  /* One panel of four columns. */
+  if (n & 4) {
+    a_offsetx[0] = a_offset;
+    a_offsetx[1] = a_offsetx[0] + lda;
+    a_offsetx[2] = a_offsetx[1] + lda;
+    a_offsetx[3] = a_offsetx[2] + lda;
+    a_offset += 4 * lda;
+
+    for (BLASLONG i = 0; i < m / 4; i++) {
+      v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]);
+      v1 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[1]);
+      v2 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[2]);
+      v3 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[3]);
+
+      svst1_bf16(pg16, (bfloat16_t *)b_offset, v0);
+      svst1_bf16(pg16, (bfloat16_t *)b_offset + 4, v1);
+      svst1_bf16(pg16, (bfloat16_t *)b_offset + 8, v2);
+      svst1_bf16(pg16, (bfloat16_t *)b_offset + 12, v3);
+
+      b_offset += 16;
+      a_offsetx[0] += 4;
+      a_offsetx[1] += 4;
+      a_offsetx[2] += 4;
+      a_offsetx[3] += 4;
+    }
+
+    if (rest) {
+      b_offset = pack_tail(b_offset, a_offsetx, 4, rest);
+    }
+  }
+
+  /* One panel of two columns. */
+  if (n & 2) {
+    a_offsetx[0] = a_offset;
+    a_offsetx[1] = a_offsetx[0] + lda;
+    a_offset += 2 * lda;
+
+    for (BLASLONG i = 0; i < m / 4; i++) {
+      v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]);
+      v1 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[1]);
+      svst1_bf16(pg16, (bfloat16_t *)b_offset, v0);
+      svst1_bf16(pg16, (bfloat16_t *)b_offset + 4, v1);
+
+      b_offset += 8;
+      a_offsetx[0] += 4;
+      a_offsetx[1] += 4;
+    }
+
+    if (rest) {
+      b_offset = pack_tail(b_offset, a_offsetx, 2, rest);
+    }
+  }
+
+  /* Final single column. */
+  if (n & 1) {
+    a_offsetx[0] = a_offset;
+    for (BLASLONG i = 0; i < m / 4; i++) {
+      v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]);
+      svst1_bf16(pg16, (bfloat16_t *)b_offset, v0);
+      b_offset += 4;
+      a_offsetx[0] += 4;
+    }
+    if (rest) {
+      b_offset = pack_tail(b_offset, a_offsetx, 1, rest);
+    }
+  }
+
+  return 0;
+}
diff --git a/kernel/arm64/sbgemm_tcopy_4_neoversen2.c b/kernel/arm64/sbgemm_tcopy_4_neoversen2.c
new file mode 100644
index 000000000..a652b0b2a
--- /dev/null
+++ b/kernel/arm64/sbgemm_tcopy_4_neoversen2.c
@@ -0,0 +1,165 @@
+/***************************************************************************
+ * Copyright (c) 2022, The OpenBLAS Project
+ * All rights reserved.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * 3. Neither the name of the OpenBLAS project nor the names of
+ * its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ * *****************************************************************************/
+#include <arm_neon.h>
+
+#include "common.h"
+
+/*
+ * Packs an m x n block of `a` into `b`, reading it transposed: consecutive
+ * n-indices are adjacent in memory and consecutive m-indices are `lda`
+ * elements apart.
+ *
+ * The n direction is consumed in panels of 4, then 2 and 1. Inside a panel
+ * the m direction is walked four rows at a time; for every n-index of the
+ * panel the four row values are written next to each other. When m is not a
+ * multiple of four, the missing rows of the last group are written as zero.
+ *
+ * Always returns 0.
+ */
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
+  IFLOAT *a_offset, *a_offset0, *a_offset1, *a_offset2, *a_offset3;
+  IFLOAT *b_offset;
+  a_offset = a;
+  b_offset = b;
+
+  uint16x4_t v0_h, v1_h, v2_h, v3_h, v4_h, v5_h, v6_h, v7_h;
+
+  /* Panels of four n-indices. */
+  for (BLASLONG j = 0; j < n / 4; j++) {
+    a_offset0 = a_offset;
+    a_offset1 = a_offset0 + lda;
+    a_offset2 = a_offset1 + lda;
+    a_offset3 = a_offset2 + lda;
+    a_offset += 4;
+
+    for (BLASLONG i = 0; i < m / 4; i++) {
+      v0_h = vld1_u16(a_offset0);
+      v1_h = vld1_u16(a_offset1);
+      v2_h = vld1_u16(a_offset2);
+      v3_h = vld1_u16(a_offset3);
+
+      /* 4x4 transpose: interleave 16-bit lanes, then 32-bit lane pairs. */
+      v4_h = vtrn1_u16(v0_h, v1_h);
+      v5_h = vtrn2_u16(v0_h, v1_h);
+      v6_h = vtrn1_u16(v2_h, v3_h);
+      v7_h = vtrn2_u16(v2_h, v3_h);
+
+      v0_h = (uint16x4_t)vtrn1_u32((uint32x2_t)v4_h, (uint32x2_t)v6_h);
+      v1_h = (uint16x4_t)vtrn1_u32((uint32x2_t)v5_h, (uint32x2_t)v7_h);
+      v2_h = (uint16x4_t)vtrn2_u32((uint32x2_t)v4_h, (uint32x2_t)v6_h);
+      v3_h = (uint16x4_t)vtrn2_u32((uint32x2_t)v5_h, (uint32x2_t)v7_h);
+
+      vst1_u16(b_offset, v0_h);
+      vst1_u16(b_offset + 4, v1_h);
+      vst1_u16(b_offset + 8, v2_h);
+      vst1_u16(b_offset + 12, v3_h);
+
+      b_offset += 16;
+      a_offset0 += 4 * lda;
+      a_offset1 += 4 * lda;
+      a_offset2 += 4 * lda;
+      a_offset3 += 4 * lda;
+    }
+
+    /* Leftover rows. rest is 1..3, so the fourth slot of every group is
+     * always zero. */
+    if (m & 3) {
+      BLASLONG rest = m & 3;
+      for (BLASLONG line = 0; line < 4; line++) {
+        b_offset[line * 4] = a_offset0[line];
+        b_offset[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line];
+        b_offset[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line];
+        b_offset[line * 4 + 3] = rest <= 3 ? 0 : a_offset3[line];
+      }
+      b_offset += 16;
+    }
+  }
+
+  /* One panel of two n-indices. */
+  if (n & 2) {
+    a_offset0 = a_offset;
+    a_offset1 = a_offset0 + lda;
+    a_offset2 = a_offset1 + lda;
+    a_offset3 = a_offset2 + lda;
+    a_offset += 2;
+
+    for (BLASLONG i = 0; i < m / 4; i++) {
+      for (BLASLONG line = 0; line < 2; line++) {
+        b_offset[line * 4] = a_offset0[line];
+        b_offset[line * 4 + 1] = a_offset1[line];
+        b_offset[line * 4 + 2] = a_offset2[line];
+        b_offset[line * 4 + 3] = a_offset3[line];
+      }
+      b_offset += 8;
+      a_offset0 += 4 * lda;
+      a_offset1 += 4 * lda;
+      a_offset2 += 4 * lda;
+      a_offset3 += 4 * lda;
+    }
+
+    if (m & 3) {
+      BLASLONG rest = m & 3;
+      for (BLASLONG line = 0; line < 2; line++) {
+        b_offset[line * 4] = a_offset0[line];
+        b_offset[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line];
+        b_offset[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line];
+        b_offset[line * 4 + 3] = rest <= 3 ? 0 : a_offset3[line];
+      }
+      b_offset += 8;
+    }
+  }
+
+  /* Final single n-index. */
+  if (n & 1) {
+    a_offset0 = a_offset;
+    a_offset1 = a_offset0 + lda;
+    a_offset2 = a_offset1 + lda;
+    a_offset3 = a_offset2 + lda;
+
+    for (BLASLONG i = 0; i < m / 4; i++) {
+      b_offset[0] = *a_offset0;
+      b_offset[1] = *a_offset1;
+      b_offset[2] = *a_offset2;
+      b_offset[3] = *a_offset3;
+      b_offset += 4;
+      a_offset0 += 4 * lda;
+      a_offset1 += 4 * lda;
+      a_offset2 += 4 * lda;
+      a_offset3 += 4 * lda;
+    }
+
+    if (m & 3) {
+      BLASLONG rest = m & 3;
+      b_offset[0] = *a_offset0;
+      b_offset[1] = rest == 1 ? 0 : *a_offset1;
+      b_offset[2] = rest <= 2 ? 0 : *a_offset2;
+      b_offset[3] = rest <= 3 ? 0 : *a_offset3;
+    }
+  }
+  return 0;
+}