From 70411af888f220c205d457ab7e653a10a0e18108 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 28 Sep 2013 19:02:25 +0200 Subject: [PATCH 01/81] initial checkin of kernel/arm --- kernel/arm/KERNEL | 46 + kernel/arm/KERNEL.ARMV7 | 143 +++ kernel/arm/amax.c | 73 ++ kernel/arm/amin.c | 73 ++ kernel/arm/asum.c | 67 ++ kernel/arm/axpy.c | 64 ++ kernel/arm/copy.c | 59 ++ kernel/arm/dgemm_kernel_8x2_vfpv3.S | 1223 +++++++++++++++++++++ kernel/arm/dot.c | 64 ++ kernel/arm/dtrmm_kernel_8x2_vfpv3.S | 1521 +++++++++++++++++++++++++++ kernel/arm/gemv_n.c | 67 ++ kernel/arm/gemv_t.c | 67 ++ kernel/arm/iamax.c | 75 ++ kernel/arm/iamin.c | 75 ++ kernel/arm/imax.c | 67 ++ kernel/arm/imin.c | 65 ++ kernel/arm/izamax.c | 81 ++ kernel/arm/izamin.c | 81 ++ kernel/arm/max.c | 63 ++ kernel/arm/min.c | 63 ++ kernel/arm/nrm2.c | 88 ++ kernel/arm/rot.c | 62 ++ kernel/arm/scal.c | 58 + kernel/arm/swap.c | 62 ++ kernel/arm/zamax.c | 81 ++ kernel/arm/zamin.c | 81 ++ kernel/arm/zasum.c | 71 ++ kernel/arm/zaxpy.c | 72 ++ kernel/arm/zcopy.c | 63 ++ kernel/arm/zdot.c | 78 ++ kernel/arm/zgemv_n.c | 125 +++ kernel/arm/zgemv_t.c | 131 +++ kernel/arm/znrm2.c | 106 ++ kernel/arm/zrot.c | 68 ++ kernel/arm/zscal.c | 64 ++ kernel/arm/zswap.c | 70 ++ 36 files changed, 5317 insertions(+) create mode 100644 kernel/arm/KERNEL create mode 100644 kernel/arm/KERNEL.ARMV7 create mode 100644 kernel/arm/amax.c create mode 100644 kernel/arm/amin.c create mode 100644 kernel/arm/asum.c create mode 100644 kernel/arm/axpy.c create mode 100644 kernel/arm/copy.c create mode 100644 kernel/arm/dgemm_kernel_8x2_vfpv3.S create mode 100644 kernel/arm/dot.c create mode 100644 kernel/arm/dtrmm_kernel_8x2_vfpv3.S create mode 100644 kernel/arm/gemv_n.c create mode 100644 kernel/arm/gemv_t.c create mode 100644 kernel/arm/iamax.c create mode 100644 kernel/arm/iamin.c create mode 100644 kernel/arm/imax.c create mode 100644 kernel/arm/imin.c create mode 100644 kernel/arm/izamax.c create mode 100644 kernel/arm/izamin.c create mode 
100644 kernel/arm/max.c create mode 100644 kernel/arm/min.c create mode 100644 kernel/arm/nrm2.c create mode 100644 kernel/arm/rot.c create mode 100644 kernel/arm/scal.c create mode 100644 kernel/arm/swap.c create mode 100644 kernel/arm/zamax.c create mode 100644 kernel/arm/zamin.c create mode 100644 kernel/arm/zasum.c create mode 100644 kernel/arm/zaxpy.c create mode 100644 kernel/arm/zcopy.c create mode 100644 kernel/arm/zdot.c create mode 100644 kernel/arm/zgemv_n.c create mode 100644 kernel/arm/zgemv_t.c create mode 100644 kernel/arm/znrm2.c create mode 100644 kernel/arm/zrot.c create mode 100644 kernel/arm/zscal.c create mode 100644 kernel/arm/zswap.c diff --git a/kernel/arm/KERNEL b/kernel/arm/KERNEL new file mode 100644 index 000000000..aeccfbf4c --- /dev/null +++ b/kernel/arm/KERNEL @@ -0,0 +1,46 @@ +ifndef SNRM2KERNEL +SNRM2KERNEL = nrm2.c +endif + +ifndef DNRM2KERNEL +DNRM2KERNEL = nrm2.c +endif + +ifndef CNRM2KERNEL +CNRM2KERNEL = znrm2.c +endif + +ifndef ZNRM2KERNEL +ZNRM2KERNEL = znrm2.c +endif + +ifndef SCABS_KERNEL +SCABS_KERNEL = ../generic/cabs.c +endif + +ifndef DCABS_KERNEL +DCABS_KERNEL = ../generic/cabs.c +endif + +ifndef QCABS_KERNEL +QCABS_KERNEL = ../generic/cabs.c +endif + +ifndef LSAME_KERNEL +LSAME_KERNEL = ../generic/lsame.c +endif + +ifndef SGEMM_BETA +SGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = ../generic/zgemm_beta.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = ../generic/zgemm_beta.c +endif + + diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 new file mode 100644 index 000000000..a60701002 --- /dev/null +++ b/kernel/arm/KERNEL.ARMV7 @@ -0,0 +1,143 @@ +SAMAXKERNEL = ../arm/amax.c +DAMAXKERNEL = ../arm/amax.c +CAMAXKERNEL = ../arm/zamax.c +ZAMAXKERNEL = ../arm/zamax.c + +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = 
../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMAXKERNEL = ../arm/iamax.c +IDAMAXKERNEL = ../arm/iamax.c +ICAMAXKERNEL = ../arm/izamax.c +IZAMAXKERNEL = ../arm/izamax.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +SSWAPKERNEL = ../arm/swap.c +DSWAPKERNEL = ../arm/swap.c +CSWAPKERNEL = ../arm/zswap.c +ZSWAPKERNEL = ../arm/zswap.c + +SASUMKERNEL = ../arm/asum.c +DASUMKERNEL = ../arm/asum.c +CASUMKERNEL = ../arm/zasum.c +ZASUMKERNEL = ../arm/zasum.c + +SAXPYKERNEL = ../arm/axpy.c +DAXPYKERNEL = ../arm/axpy.c +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c + +SCOPYKERNEL = ../arm/copy.c +DCOPYKERNEL = ../arm/copy.c +CCOPYKERNEL = ../arm/zcopy.c +ZCOPYKERNEL = ../arm/zcopy.c + +SDOTKERNEL = ../arm/dot.c +DDOTKERNEL = ../arm/dot.c +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c + +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c + +SROTKERNEL = ../arm/rot.c +DROTKERNEL = ../arm/rot.c +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c + +SSCALKERNEL = ../arm/scal.c +DSCALKERNEL = ../arm/scal.c +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c + +SGEMVNKERNEL = gemv_n.c +DGEMVNKERNEL = gemv_n.c +CGEMVNKERNEL = zgemv_n.c +ZGEMVNKERNEL = zgemv_n.c + +SGEMVTKERNEL = gemv_t.c +DGEMVTKERNEL = gemv_t.c +CGEMVTKERNEL = zgemv_t.c +ZGEMVTKERNEL = zgemv_t.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = dtrmm_kernel_8x2_vfpv3.S +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +#DGEMMKERNEL = 
../generic/gemmkernel_2x2.c +#DGEMMKERNEL = dgemm_kernel_4x2_vfpv2.S +DGEMMKERNEL = dgemm_kernel_8x2_vfpv3.S +DGEMMINCOPY = ../generic/gemm_ncopy_8.c +DGEMMITCOPY = ../generic/gemm_tcopy_8.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CGEMM3MKERNEL = zgemm3m_kernel_8x4_core2.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_core2.S + + + + diff --git a/kernel/arm/amax.c b/kernel/arm/amax.c new file mode 100644 index 000000000..55107ca4f --- /dev/null +++ b/kernel/arm/amax.c @@ -0,0 +1,73 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float : OK
+* BLASTEST double : OK
+* CTEST : NoTest
+* TEST : NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+/* ABS selects the libm absolute value that matches the build precision. */
+#if defined(DOUBLE)
+#define ABS fabs
+#else
+#define ABS fabsf
+#endif
+
+/*
+ * Return the largest absolute value among the n elements x[0], x[inc_x],
+ * x[2*inc_x], ...
+ *
+ * Returns 0.0 without reading x when n <= 0 or inc_x < 1.  The guard used
+ * to test n < 0, which let n == 0 fall through and read x[0].
+ */
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i;
+	BLASLONG ix;
+	FLOAT maxf = 0.0;
+
+	if (n <= 0 || inc_x < 1) return(maxf);
+
+	/* maxf only ever holds an ABS() result, so it is compared directly. */
+	maxf = ABS(x[0]);
+	for (i = 1, ix = inc_x; i < n; i++, ix += inc_x)
+	{
+		FLOAT absx = ABS(x[ix]);
+
+		if (absx > maxf) maxf = absx;
+	}
+
+	return(maxf);
+}
+
diff --git a/kernel/arm/amin.c b/kernel/arm/amin.c
new file mode 100644
index 000000000..3f7e97be6
--- /dev/null
+++ b/kernel/arm/amin.c
@@ -0,0 +1,73 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float : OK
+* BLASTEST double : OK
+* CTEST : NoTest
+* TEST : NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+/* ABS selects the libm absolute value that matches the build precision. */
+#if defined(DOUBLE)
+#define ABS fabs
+#else
+#define ABS fabsf
+#endif
+
+/*
+ * Return the smallest absolute value among the n elements x[0], x[inc_x],
+ * x[2*inc_x], ...
+ *
+ * Returns 0.0 without reading x when n <= 0 or inc_x < 1.  The guard used
+ * to test n < 0, which let n == 0 fall through and read x[0].
+ */
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i;
+	BLASLONG ix;
+	FLOAT minf = 0.0;
+
+	if (n <= 0 || inc_x < 1) return(minf);
+
+	/* minf only ever holds an ABS() result, so it is compared directly. */
+	minf = ABS(x[0]);
+	for (i = 1, ix = inc_x; i < n; i++, ix += inc_x)
+	{
+		FLOAT absx = ABS(x[ix]);
+
+		if (absx < minf) minf = absx;
+	}
+
+	return(minf);
+}
+
diff --git a/kernel/arm/asum.c b/kernel/arm/asum.c
new file mode 100644
index 000000000..5ac6936a0
--- /dev/null
+++ b/kernel/arm/asum.c
@@ -0,0 +1,67 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float : OK
+* BLASTEST double : OK
+* CTEST : OK
+* TEST : OK
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+/* ABS selects the libm absolute value that matches the build precision. */
+#if defined(DOUBLE)
+#define ABS fabs
+#else
+#define ABS fabsf
+#endif
+
+/*
+ * Return the sum of the absolute values of the n elements x[0], x[inc_x],
+ * x[2*inc_x], ...  Returns 0.0 when n <= 0 or inc_x < 1.
+ *
+ * The element count and the index are tracked separately so that n is never
+ * rescaled; the loop used to run until the index reached n * inc_x.
+ */
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i;
+	BLASLONG ix = 0;
+	FLOAT sumf = 0.0;
+
+	if (n <= 0 || inc_x < 1) return(sumf);
+	for (i = 0; i < n; i++, ix += inc_x)
+	{
+		sumf += ABS(x[ix]);
+	}
+	return(sumf);
+}
+
diff --git a/kernel/arm/axpy.c b/kernel/arm/axpy.c
new file mode 100644
index 000000000..dceddf78a
--- /dev/null
+++ b/kernel/arm/axpy.c
@@ -0,0 +1,64 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float : OK
+* BLASTEST double : OK
+* CTEST : OK
+* TEST : OK
+*
+**************************************************************************************/
+
+#include "common.h"
+
+/*
+ * Accumulate da * x into y: y[k*inc_y] += da * x[k*inc_x] for k in [0, n).
+ * dummy0, dummy1, dummy and dummy2 are accepted but never read here.
+ * Always returns 0; nothing is written when n < 0 or da == 0.0.
+ */
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+	BLASLONG k;
+	BLASLONG xpos = 0;
+	BLASLONG ypos = 0;
+
+	if (n < 0 || da == 0.0)
+	{
+		return(0);
+	}
+
+	for (k = 0; k < n; k++)
+	{
+		y[ypos] += da * x[xpos];
+		xpos += inc_x;
+		ypos += inc_y;
+	}
+
+	return(0);
+}
+
diff --git a/kernel/arm/copy.c b/kernel/arm/copy.c
new file mode 100644
index 000000000..f742a4a33
--- /dev/null
+++ b/kernel/arm/copy.c
@@ -0,0 +1,59 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float : OK
+* BLASTEST double : OK
+* CTEST : OK
+* TEST : OK
+*
+**************************************************************************************/
+
+#include "common.h"
+
+/*
+ * Copy n elements from x (stride inc_x) into y (stride inc_y).
+ * Always returns 0; a non-positive n copies nothing.
+ */
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+	BLASLONG k;
+	BLASLONG src = 0;
+	BLASLONG dst = 0;
+
+	for (k = 0; k < n; k++)
+	{
+		y[dst] = x[src];
+		src += inc_x;
+		dst += inc_y;
+	}
+
+	return(0);
+}
+
+
diff --git a/kernel/arm/dgemm_kernel_8x2_vfpv3.S b/kernel/arm/dgemm_kernel_8x2_vfpv3.S
new file mode 100644
index 000000000..3c474a172
--- /dev/null
+++ b/kernel/arm/dgemm_kernel_8x2_vfpv3.S
@@ -0,0 +1,1223 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/28 Saar
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+*
+* 2013/09/22 Saar
+* UNROLL_N 2
+* UNROLL_M 8
+* DGEMM_P 64
+* DGEMM_Q 64
+* DGEMM_R 512
+* A_PRE 128
+* B_PRE 128
+*
+* Performance on Odroid U2:
+*
+* 1 Core: 0.92 GFLOPS ATLAS: 0.81 GFLOPS
+* 2 Cores: 1.83 GFLOPS ATLAS: 1.51 GFLOPS
+* 3 Cores: 2.67 GFLOPS ATLAS: 1.51 GFLOPS
+* 4 Cores: 3.52 GFLOPS ATLAS: 1.51 GFLOPS
+*
+* 2013/09/28 Saar
+* UNROLL_N 2
+* UNROLL_M 8
+* DGEMM_P 128
+* DGEMM_Q 128
+* DGEMM_R 2048
+* A_PRE 128
+* B_PRE 128
+* C_PRE 32
+*
+* Performance on Odroid U2:
+*
+* 1 Core: 0.99 GFLOPS ATLAS: 0.82 GFLOPS
+* 2 Cores: 1.97 GFLOPS ATLAS: 1.59 GFLOPS
+* 3 Cores: 2.86 GFLOPS ATLAS: 1.59 GFLOPS
+* 4 Cores: 3.79 GFLOPS ATLAS: 1.59 GFLOPS
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+// STACKSIZE: bytes the prologue subtracts from sp (sub sp, sp, #STACKSIZE).
+#define STACKSIZE 252
+// Registers the prologue stores into the frame slots M, N, K, A and ALPHA.
+#define OLD_M r0
+#define OLD_N r1
+#define OLD_K r2
+#define OLD_A r3
+#define OLD_ALPHA d0
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* registers
+*******************************************************/
+// Frame slots written by the prologue (str / vstr) and reloaded with ldr / vldr.
+#define C [fp, #-248 ]
+#define LDC [fp, #-252 ]
+#define M [fp, #-256 ]
+#define N [fp, #-260 ]
+#define K [fp, #-264 ]
+#define A [fp, #-268 ]
+
+#define ALPHA [fp, #-276 ]
+// Slots above fp that the prologue only reads; presumably stack-passed arguments - TODO confirm.
+#define B [fp, #4 ]
+#define OLD_C [fp, #8 ]
+#define OLD_LDC [fp, #12 ]
+// Loop counters; they reuse r0-r2 once OLD_M/OLD_N/OLD_K have been stored.
+#define I r0
+#define J r1
+#define L r2
+// Read pointers advanced by the vldm AO! / vldm BO! loads in the kernels.
+#define AO r5
+#define BO r6
+// Output pointers; the driver sets CO2 = CO1 + LDC.
+#define CO1 r8
+#define CO2 r9
+// K1 is loaded from the K slot and BC from the B slot by the prologue.
+#define K1 r7
+#define BC r12
+// Byte offsets used by the pld prefetch instructions.
+#define A_PRE 128
+#define A_PRE1 160
+#define B_PRE 128
+#define C_PRE 32
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+.macro INIT8x2
+// Clear the 16 accumulators d8-d23: d8 = d8 - d8, then d8 is copied to d9-d23.
+	vsub.f64 d8 , d8 , d8
+	vmov.f64 d9 , d8
+	vmov.f64 d10, d8
+	vmov.f64 d11, d8
+	vmov.f64 d12, d8
+	vmov.f64 d13, d8
+	vmov.f64 d14, d8
+	vmov.f64 d15, d8
+	vmov.f64 d16, d8
+	vmov.f64 d17, d8
+	vmov.f64 d18, d8
+	vmov.f64 d19, d8
+	vmov.f64 d20, d8
+	vmov.f64 d21, d8
+	vmov.f64 d22, d8
+	vmov.f64 d23, d8
+// NOTE(review): d8 - d8 is NaN, not 0, if d8 holds NaN or Inf on entry - confirm this cannot happen.
+.endm
+
+.macro KERNEL8x2_START
+// First step of an unrolled run: loads 2 B and 8 A values, accumulates, then loads the next operands.
+	vldm BO!, { d24 , d25}
+	vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 }
+
+	vmul.f64 d26 , d0 , d24
+	vmul.f64 d29 , d0 , d25
+	vmul.f64 d27 , d1 , d24
+	vmul.f64 d30 , d1 , d25
+	vmul.f64 d28 , d2 , d24
+	vmul.f64 d31 , d2 , d25
+
+	pld [AO , #A_PRE]
+
+	vadd.f64 d8 , d8 , d26
+	vadd.f64 d16 , d16, d29
+	vadd.f64 d9 , d9 , d27
+	vadd.f64 d17 , d17, d30
+	vadd.f64 d10 , d10, d28
+	vadd.f64 d18 , d18, d31
+
+	vmul.f64 d26 , d3 , d24
+	vmul.f64 d27 , d4 , d24
+	vmul.f64 d28 , d5 , d24
+	vmul.f64 d29 , d3 , d25
+	vmul.f64 d30 , d4 , d25
+	vmul.f64 d31 , d5 , d25
+
+	pld [AO , #A_PRE1]
+
+	vadd.f64 d11 , d11, d26
+	vadd.f64 d12 , d12, d27
+	vadd.f64 d13 , d13, d28
+	vadd.f64 d19 , d19, d29
+	vadd.f64 d20 , d20, d30
+	vadd.f64 d21 , d21, d31
+
+	vmul.f64 d26 , d6 , d24
+	vmul.f64 d27 , d7 , d24
+	vmul.f64 d29 , d6 , d25
+	vmul.f64 d30 , d7 , d25
+
+	vldm BO!, { d24 , d25}
+	vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 }
+
+	vadd.f64 d14 , d14, d26
+	vadd.f64 d15 , d15, d27
+	vadd.f64 d22 , d22, d29
+	vadd.f64 d23 , d23, d30
+
+.endm
+
+
+.macro KERNEL8x2_M
+// Middle step: uses the operands already in d0-d7 / d24-d25 and loads the next ones before finishing.
+	vmul.f64 d26 , d0 , d24
+	vmul.f64 d29 , d0 , d25
+	vmul.f64 d27 , d1 , d24
+	vmul.f64 d30 , d1 , d25
+	vmul.f64 d28 , d2 , d24
+	vmul.f64 d31 , d2 , d25
+
+	pld [AO , #A_PRE]
+
+	vadd.f64 d8 , d8 , d26
+	vadd.f64 d16 , d16, d29
+	vadd.f64 d9 , d9 , d27
+	vadd.f64 d17 , d17, d30
+	vadd.f64 d10 , d10, d28
+	vadd.f64 d18 , d18, d31
+
+	vmul.f64 d26 , d3 , d24
+	vmul.f64 d27 , d4 , d24
+	vmul.f64 d28 , d5 , d24
+	vmul.f64 d29 , d3 , d25
+	vmul.f64 d30 , d4 , d25
+	vmul.f64 d31 , d5 , d25
+
+	pld [AO , #A_PRE1]
+
+	vadd.f64 d11 , d11, d26
+	vadd.f64 d12 , d12, d27
+	vadd.f64 d13 , d13, d28
+	vadd.f64 d19 , d19, d29
+	vadd.f64 d20 , d20, d30
+	vadd.f64 d21 , d21, d31
+
+	vmul.f64 d26 , d6 , d24
+	vmul.f64 d27 , d7 , d24
+	vmul.f64 d29 , d6 , d25
+	vmul.f64 d30 , d7 , d25
+
+	vldm BO!, { d24 , d25}
+	vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 }
+
+	vadd.f64 d14 , d14, d26
+	vadd.f64 d15 , d15, d27
+	vadd.f64 d22 , d22, d29
+	vadd.f64 d23 , d23, d30
+
+.endm
+
+
+.macro KERNEL8x2_END
+// Last step of an unrolled run: consumes the operands already loaded and loads nothing further.
+	vmul.f64 d26 , d0 , d24
+	vmul.f64 d29 , d0 , d25
+	vmul.f64 d27 , d1 , d24
+	vmul.f64 d30 , d1 , d25
+	vmul.f64 d28 , d2 , d24
+	vmul.f64 d31 , d2 , d25
+
+	pld [AO , #A_PRE]
+
+	vadd.f64 d8 , d8 , d26
+	vadd.f64 d16 , d16, d29
+	vadd.f64 d9 , d9 , d27
+	vadd.f64 d17 , d17, d30
+	vadd.f64 d10 , d10, d28
+	vadd.f64 d18 , d18, d31
+
+	vmul.f64 d26 , d3 , d24
+	vmul.f64 d27 , d4 , d24
+	vmul.f64 d28 , d5 , d24
+	vmul.f64 d29 , d3 , d25
+	vmul.f64 d30 , d4 , d25
+	vmul.f64 d31 , d5 , d25
+
+	pld [AO , #A_PRE1]
+
+	vadd.f64 d11 , d11, d26
+	vadd.f64 d12 , d12, d27
+	vadd.f64 d13 , d13, d28
+	vadd.f64 d19 , d19, d29
+	vadd.f64 d20 , d20, d30
+	vadd.f64 d21 , d21, d31
+
+	vmul.f64 d26 , d6 , d24
+	vmul.f64 d27 , d7 , d24
+	vmul.f64 d29 , d6 , d25
+	vmul.f64 d30 , d7 , d25
+
+	vadd.f64 d14 , d14, d26
+	vadd.f64 d15 , d15, d27
+	vadd.f64 d22 , d22, d29
+	vadd.f64 d23 , d23, d30
+
+.endm
+
+
+
+
+.macro KERNEL8x2
+// Self-contained step: loads its own 2 B and 8 A values and accumulates; no load for a following step.
+	vldm BO!, { d24 , d25}
+	vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 }
+
+	vmul.f64 d26 , d0 , d24
+	vmul.f64 d29 , d0 , d25
+	vmul.f64 d27 , d1 , d24
+	vmul.f64 d30 , d1 , d25
+	vmul.f64 d28 , d2 , d24
+	vmul.f64 d31 , d2 , d25
+
+	pld [AO , #A_PRE]
+
+	vadd.f64 d8 , d8 , d26
+	vadd.f64 d16 , d16, d29
+	vadd.f64 d9 , d9 , d27
+	vadd.f64 d17 , d17, d30
+	vadd.f64 d10 , d10, d28
+	vadd.f64 d18 , d18, d31
+
+	vmul.f64 d26 , d3 , d24
+	vmul.f64 d27 , d4 , d24
+	vmul.f64 d28 , d5 , d24
+	vmul.f64 d29 , d3 , d25
+	vmul.f64 d30 , d4 , d25
+	vmul.f64 d31 , d5 , d25
+
+	pld [AO , #A_PRE1]
+
+	vadd.f64 d11 , d11, d26
+	vadd.f64 d12 , d12, d27
+	vadd.f64 d13 , d13, d28
+	vadd.f64 d19 , d19, d29
+	vadd.f64 d20 , d20, d30
+	vadd.f64 d21 , d21, d31
+
+	vmul.f64 d26 , d6 , d24
+	vmul.f64 d27 , d7 , d24
+	vmul.f64 d29 , d6 , d25
+	vmul.f64 d30 , d7 , d25
+
+	vadd.f64 d14 , d14, d26
+	vadd.f64 d15 , d15, d27
+	vadd.f64 d22 , d22, d29
+	vadd.f64 d23 , d23, d30
+
+.endm
+
+.macro SAVE8x2
+// d8-d15 = ALPHA*d8-d15 + [CO1], d16-d23 = ALPHA*d16-d23 + [CO2]; stores both and advances CO1/CO2.
+	vldr d0, ALPHA
+	vldm CO2, { d24, d25, d26 , d27 , d28 , d29 , d30 , d31 }
+
+	vmul.f64 d8 , d0 , d8
+	vmul.f64 d9 , d0 , d9
+	vmul.f64 d10, d0 , d10
+	vmul.f64 d11, d0 , d11
+	vmul.f64 d12, d0 , d12
+	vmul.f64 d13, d0 , d13
+	vmul.f64 d14, d0 , d14
+	vmul.f64 d15, d0 , d15
+
+	vmul.f64 d16, d0 , d16
+	vmul.f64 d17, d0 , d17
+	vmul.f64 d18, d0 , d18
+	vmul.f64 d19, d0 , d19
+	vmul.f64 d20, d0 , d20
+	vmul.f64 d21, d0 , d21
+	vmul.f64 d22, d0 , d22
+	vmul.f64 d23, d0 , d23
+
+	vldm CO1, { d0 , d1 , d2 , d3 , d4 , d5 , d6 , d7 }
+
+	vadd.f64 d16, d16, d24
+	vadd.f64 d17, d17, d25
+	vadd.f64 d18, d18, d26
+	vadd.f64 d19, d19, d27
+
+	vadd.f64 d20, d20, d28
+	vadd.f64 d21, d21, d29
+	vadd.f64 d22, d22, d30
+	vadd.f64 d23, d23, d31
+
+	vadd.f64 d8 , d8 , d0
+	vadd.f64 d9 , d9 , d1
+	vadd.f64 d10, d10, d2
+	vadd.f64 d11, d11, d3
+
+	vadd.f64 d12, d12, d4
+	vadd.f64 d13, d13, d5
+	vadd.f64 d14, d14, d6
+	vadd.f64 d15, d15, d7
+
+	vstm CO2!, { d16, d17, d18 , d19 , d20 , d21 , d22 , d23 }
+	vstm CO1!, { d8 , d9 , d10 , d11 , d12, d13 , d14 , d15 }
+
+.endm
+
+
+/*************************************************************************************/
+
+
+.macro INIT4x2
+// Clear accumulators d8-d15 by subtracting each register from itself.
+	vsub.f64 d8 , d8 , d8
+	vsub.f64 d9 , d9 , d9
+	vsub.f64 d10, d10, d10
+	vsub.f64 d11, d11, d11
+	vsub.f64 d12, d12, d12
+	vsub.f64 d13, d13, d13
+	vsub.f64 d14, d14, d14
+	vsub.f64 d15, d15, d15
+// NOTE(review): x - x is NaN, not 0, if x is NaN or Inf on entry - confirm this cannot happen.
+.endm
+
+.macro KERNEL4x2
+// One step: 4 A values (d0-d3) times 2 B values (d4, d5); d8-d11 take the d4 products, d12-d15 the d5 products.
+	vldm AO!, { d0, d1 , d2, d3 }
+	vldm BO!, { d4, d5 }
+
+	vmul.f64 d6 , d0 , d4
+	vmul.f64 d7 , d1 , d4
+	vadd.f64 d8 , d8 , d6
+	vadd.f64 d9 , d9 , d7
+
+	vmul.f64 d6 , d2 , d4
+	vmul.f64 d7 , d3 , d4
+	vadd.f64 d10, d10, d6
+	vadd.f64 d11, d11, d7
+
+	vmul.f64 d6 , d0 , d5
+	vmul.f64 d7 , d1 , d5
+	vadd.f64 d12, d12, d6
+	vadd.f64 d13, d13, d7
+
+	vmul.f64 d6 , d2 , d5
+	vmul.f64 d7 , d3 , d5
+	vadd.f64 d14, d14, d6
+	vadd.f64 d15, d15, d7
+
+.endm
+
+.macro SAVE4x2
+// d8-d11 = ALPHA*d8-d11 + [CO1], d12-d15 = ALPHA*d12-d15 + [CO2]; stores both and advances CO1/CO2.
+	vldr d0, ALPHA
+
+	vmul.f64 d8 , d0 , d8
+	vmul.f64 d9 , d0 , d9
+	vmul.f64 d10, d0 , d10
+	vmul.f64 d11, d0 , d11
+	vmul.f64 d12, d0 , d12
+	vmul.f64 d13, d0 , d13
+	vmul.f64 d14, d0 , d14
+	vmul.f64 d15, d0 , d15
+
+	vldm CO1, { d0, d1 , d2 , d3 }
+	vldm CO2, { d4, d5 , d6 , d7 }
+
+	vadd.f64 d8 , d8 , d0
+	vadd.f64 d9 , d9 , d1
+	vadd.f64 d10, d10, d2
+	vadd.f64 d11, d11, d3
+
+	vadd.f64 d12, d12, d4
+	vadd.f64 d13, d13, d5
+	vadd.f64 d14, d14, d6
+	vadd.f64 d15, d15, d7
+
+	vstm CO1!, { d8 , d9 , d10 , d11 }
+	vstm CO2!, { d12, d13 ,d14 , d15 }
+
+.endm
+
+
+
+/*************************************************************************************/
+
+.macro INIT2x2
+// Clear accumulators d8, d9, d12 and d13.
+	vsub.f64 d8 , d8 , d8
+	vsub.f64 d9 , d9 , d9
+	vsub.f64 d12, d12, d12
+	vsub.f64 d13, d13, d13
+
+.endm
+
+.macro KERNEL2x2
+// One step: 2 A values (d0, d1) times 2 B values (d4, d5) into d8, d9 (d4) and d12, d13 (d5).
+	vldm AO!, { d0, d1 }
+	vldm BO!, { d4, d5 }
+
+	vmul.f64 d6 , d0 , d4
+	vmul.f64 d7 , d1 , d4
+	vadd.f64 d8 , d8 , d6
+	vadd.f64 d9 , d9 , d7
+
+	vmul.f64 d6 , d0 , d5
+	vmul.f64 d7 , d1 , d5
+	vadd.f64 d12, d12, d6
+	vadd.f64 d13, d13, d7
+
+.endm
+
+.macro SAVE2x2
+// d8, d9 = ALPHA*acc + [CO1]; d12, d13 = ALPHA*acc + [CO2]; stores both and advances CO1/CO2.
+	vldr d0, ALPHA
+
+	vmul.f64 d8 , d0 , d8
+	vmul.f64 d9 , d0 , d9
+	vmul.f64 d12, d0 , d12
+	vmul.f64 d13, d0 , d13
+
+	vldm CO1, { d0, d1 }
+	vldm CO2, { d4, d5 }
+
+	vadd.f64 d8 , d8 , d0
+	vadd.f64 d9 , d9 , d1
+
+	vadd.f64 d12, d12, d4
+	vadd.f64 d13, d13, d5
+
+	vstm CO1!, { d8 , d9 }
+	vstm CO2!, { d12, d13 }
+
+.endm
+
+/*************************************************************************************/
+
+.macro INIT1x2
+// Clear accumulators d8 and d12.
+	vsub.f64 d8 , d8 , d8
+	vsub.f64 d12, d12, d12
+
+.endm
+
+.macro KERNEL1x2
+// One step: 1 A value (d0) times 2 B values (d4, d5) into d8 and d12.
+	vldm AO!, { d0 }
+	vldm BO!, { d4, d5 }
+
+	vmul.f64 d6 , d0 , d4
+	vadd.f64 d8 , d8 , d6
+
+	vmul.f64 d6 , d0 , d5
+	vadd.f64 d12, d12, d6
+
+.endm
+
+.macro SAVE1x2
+// d8 = ALPHA*d8 + [CO1]; d12 = ALPHA*d12 + [CO2]; stores both and advances CO1/CO2.
+	vldr d0, ALPHA
+
+	vmul.f64 d8 , d0 , d8
+	vmul.f64 d12, d0 , d12
+
+	vldm CO1, { d0 }
+	vldm CO2, { d4 }
+
+	vadd.f64 d8 , d8 , d0
+	vadd.f64 d12, d12, d4
+
+	vstm CO1!, { d8 }
+	vstm CO2!, { d12}
+
+.endm
+
+/*************************************************************************************/
+
+.macro INIT8x1
+// Clear accumulators d8-d15.
+	vsub.f64 d8 , d8 , d8
+	vsub.f64 d9 , d9 , d9
+	vsub.f64 d10, d10, d10
+	vsub.f64 d11, d11, d11
+	vsub.f64 d12, d12, d12
+	vsub.f64 d13, d13, d13
+	vsub.f64 d14, d14, d14
+	vsub.f64 d15, d15, d15
+
+.endm
+
+.macro KERNEL8x1
+// One step: 8 A values (d0-d7) times 1 B value (d24) into d8-d15.
+	vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 }
+	vldm BO!, { d24 }
+
+	vmul.f64 d26 , d0 , d24
+	vmul.f64 d27 , d1 , d24
+	vadd.f64 d8 , d8 , d26
+	vadd.f64 d9 , d9 , d27
+
+	vmul.f64 d28 , d2 , d24
+	vmul.f64 d29 , d3 , d24
+	vadd.f64 d10 , d10, d28
+	vadd.f64 d11 , d11, d29
+
+	vmul.f64 d26 , d4 , d24
+	vmul.f64 d27 , d5 , d24
+	vadd.f64 d12 , d12, d26
+	vadd.f64 d13 , d13, d27
+
+	vmul.f64 d28 , d6 , d24
+	vmul.f64 d29 , d7 , d24
+	vadd.f64 d14 , d14, d28
+	vadd.f64 d15 , d15, d29
+
+
+.endm
+
+.macro SAVE8x1
+// d8-d15 = ALPHA*d8-d15 + [CO1]; stores the result and advances CO1.
+	vldr d0, ALPHA
+
+	vmul.f64 d8 , d0 , d8
+	vmul.f64 d9 , d0 , d9
+	vmul.f64 d10, d0 , d10
+	vmul.f64 d11, d0 , d11
+	vmul.f64 d12, d0 , d12
+	vmul.f64 d13, d0 , d13
+	vmul.f64 d14, d0 , d14
+	vmul.f64 d15, d0 , d15
+
+	vldm CO1, { d0, d1 , d2 , d3 , d4 , d5 , d6 , d7 }
+
+	vadd.f64 d8 , d8 , d0
+	vadd.f64 d9 , d9 , d1
+	vadd.f64 d10, d10, d2
+	vadd.f64 d11, d11, d3
+
+	vadd.f64 d12, d12, d4
+	vadd.f64 d13, d13, d5
+	vadd.f64 d14, d14, d6
+	vadd.f64 d15, d15, d7
+
+	vstm CO1!, { d8 , d9 , d10 , d11 , d12, d13 ,d14 , d15 }
+
+.endm
+
+
+/*************************************************************************************/
+
+.macro INIT4x1
+// Clear accumulators d8-d11.
+	vsub.f64 d8 , d8 , d8
+	vsub.f64 d9 , d9 , d9
+	vsub.f64 d10, d10, d10
+	vsub.f64 d11, d11, d11
+
+.endm
+
+.macro KERNEL4x1
+// One step: 4 A values (d0-d3) times 1 B value (d4) into d8-d11.
+	vldm AO!, { d0, d1 , d2, d3 }
+	vldm BO!, { d4 }
+
+	vmul.f64 d6 , d0 , d4
+	vmul.f64 d7 , d1 , d4
+	vadd.f64 d8 , d8 , d6
+	vadd.f64 d9 , d9 , d7
+
+	vmul.f64 d6 , d2 , d4
+	vmul.f64 d7 , d3 , d4
+	vadd.f64 d10, d10, d6
+	vadd.f64 d11, d11, d7
+
+.endm
+
+.macro SAVE4x1
+// d8-d11 = ALPHA*d8-d11 + [CO1]; stores the result and advances CO1.
+	vldr d0, ALPHA
+
+	vmul.f64 d8 , d0 , d8
+	vmul.f64 d9 , d0 , d9
+	vmul.f64 d10, d0 , d10
+	vmul.f64 d11, d0 , d11
+
+	vldm CO1, { d0, d1 , d2 , d3 }
+
+	vadd.f64 d8 , d8 , d0
+	vadd.f64 d9 , d9 , d1
+	vadd.f64 d10, d10, d2
+	vadd.f64 d11, d11, d3
+
+	vstm CO1!, { d8 , d9 , d10 , d11 }
+
+.endm
+
+/*************************************************************************************/
+
+.macro INIT2x1
+// Clear accumulators d8 and d9.
+	vsub.f64 d8 , d8 , d8
+	vsub.f64 d9 , d9 , d9
+
+.endm
+
+.macro KERNEL2x1
+// One step: 2 A values (d0, d1) times 1 B value (d4) into d8 and d9.
+	vldm AO!, { d0, d1 }
+	vldm BO!, { d4 }
+
+	vmul.f64 d6 , d0 , d4
+	vmul.f64 d7 , d1 , d4
+	vadd.f64 d8 , d8 , d6
+	vadd.f64 d9 , d9 , d7
+
+.endm
+
+.macro SAVE2x1
+// d8, d9 = ALPHA*acc + [CO1]; stores the result and advances CO1.
+	vldr d0, ALPHA
+
+	vmul.f64 d8 , d0 , d8
+	vmul.f64 d9 , d0 , d9
+
+	vldm CO1, { d0, d1 }
+
+	vadd.f64 d8 , d8 , d0
+	vadd.f64 d9 , d9 , d1
+
+	vstm CO1!, { d8 , d9 }
+
+.endm
+
+/*************************************************************************************/
+
+.macro INIT1x1
+// Clear accumulator d8.
+	vsub.f64 d8 , d8 , d8
+
+.endm
+
+.macro KERNEL1x1
+// One step: d8 += d0 * d4 with d0 loaded from AO and d4 from BO.
+	vldm AO!, { d0 }
+	vldm BO!, { d4 }
+
+	vmul.f64 d6 , d0 , d4
+	vadd.f64 d8 , d8 , d6
+
+.endm
+
+.macro SAVE1x1
+// d8 = ALPHA*d8 + [CO1]; stores the result and advances CO1.
+	vldr d0, ALPHA
+
+	vmul.f64 d8 , d0 , d8
+
+	vldm CO1, { d0 }
+
+	vadd.f64 d8 , d8 , d0
+
+	vstm CO1!, { d8 }
+
+.endm
+
+
+
+/************************************************************************************** +* End of macro definitions +**************************************************************************************/ +/* Routine body: saves state, then walks C two columns at a time (J loop) with row tiles of 8, 4, 2 and 1; a final pass handles an odd last column. Always returns 0 in r0. */ + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { d8 - d15} // save d8-d15 (64 bytes starting at fp - 128) + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 8 (elements to bytes) + str r3, LDC + + ldr r3, OLD_C + str r3, C + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = N / 2 + ble _L1_BEGIN +/* NOTE(review): asrs, ands and tst leave the V flag unchanged, while ble also tests V (taken when Z is set or N differs from V); the ble branches in this routine therefore appear to rely on V being clear on entry. Confirm. */ +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add CO2, CO1, r4 // CO2 = C + LDC + add r3 , CO2, r4 // C = CO2 + LDC + str r3 , C // store C + + ldr AO, A // AO = A + +_L2_M8_BEGIN: + + ldr I, M + asrs I, I, #3 // I = M / 8 + ble _L2_M4_BEGIN + +_L2_M8_20: + + pld [CO1, #C_PRE] + pld [CO2, #C_PRE] + INIT8x2 + + mov BO, BC + asrs L , K1, #3 // L = K1 / 8 (K1 holds K) + ble _L2_M8_40 + .align 5 + +_L2_M8_22: + + pld [BO , #B_PRE] + KERNEL8x2_START + KERNEL8x2_M + pld [BO , #B_PRE] + KERNEL8x2_M + KERNEL8x2_M + + pld [BO , #B_PRE] + KERNEL8x2_M + KERNEL8x2_M + pld [BO , #B_PRE] + KERNEL8x2_M + KERNEL8x2_END + + subs L, L, #1 + bgt _L2_M8_22 + + +_L2_M8_40: + + ands L , K1, #7 // L = K1 % 8 + ble _L2_M8_100 + +_L2_M8_42: + + KERNEL8x2 + + subs L, L, #1 + bgt _L2_M8_42 + +_L2_M8_100: + + SAVE8x2 + +_L2_M8_END: + + subs I, I, #1 + bgt _L2_M8_20 + + +_L2_M4_BEGIN: + + ldr I, M + tst I , #7 // any of the low 3 bits of M set? + ble _L2_END + + tst I , #4 // bit 2 of M set: a 4-row tile remains + ble _L2_M2_BEGIN + +_L2_M4_20: + + INIT4x2 + + mov BO, BC + asrs L , K1, #3 // L = K1 / 8 + ble _L2_M4_40 + .align 5 + +_L2_M4_22: + + KERNEL4x2 + KERNEL4x2 + KERNEL4x2 + KERNEL4x2 + + KERNEL4x2 + KERNEL4x2 + KERNEL4x2 + KERNEL4x2 + + subs L, L, #1 + bgt _L2_M4_22 + + +_L2_M4_40: + + ands L , K1, #7 // L = K1 % 8 + ble _L2_M4_100 + +_L2_M4_42: + + KERNEL4x2 + + subs L, L, #1 + bgt _L2_M4_42 + +_L2_M4_100: + + 
 SAVE4x2 + +_L2_M4_END: + + + +_L2_M2_BEGIN: + + tst I, #2 // bit 1 of M set: a 2-row tile remains + ble _L2_M1_BEGIN + +_L2_M2_20: + + INIT2x2 + + mov BO, BC + asrs L , K1, #2 // L = K1 / 4 + ble _L2_M2_40 + +_L2_M2_22: + + KERNEL2x2 + KERNEL2x2 + KERNEL2x2 + KERNEL2x2 + + subs L, L, #1 + bgt _L2_M2_22 + + +_L2_M2_40: + + ands L , K1, #3 // L = K1 % 4 + ble _L2_M2_100 + +_L2_M2_42: + + KERNEL2x2 + + subs L, L, #1 + bgt _L2_M2_42 + +_L2_M2_100: + + SAVE2x2 + +_L2_M2_END: + + +_L2_M1_BEGIN: + + tst I, #1 // bit 0 of M set: a single row remains + ble _L2_END + +_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #2 // L = K1 / 4 + ble _L2_M1_40 + +_L2_M1_22: + + KERNEL1x2 + KERNEL1x2 + KERNEL1x2 + KERNEL1x2 + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #3 // L = K1 % 4 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2 + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + + +_L2_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #4 // k * 2 * 8 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + + subs J , #1 // j-- + bgt _L2_BEGIN + + +_L1_BEGIN: + + ldr J, N + tst J , #1 // bit 0 of N set: one column remains + ble _L999 + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , CO1, r4 // C = CO1 + LDC + str r3 , C // store C + + ldr AO, A // AO = A + + + +_L1_M8_BEGIN: + + ldr I, M + asrs I, I, #3 // I = M / 8 + ble _L1_M4_BEGIN + +_L1_M8_20: + + INIT8x1 + + mov BO, BC + asrs L , K1, #3 // L = K1 / 8 + ble _L1_M8_40 + +_L1_M8_22: + + KERNEL8x1 + KERNEL8x1 + KERNEL8x1 + KERNEL8x1 + KERNEL8x1 + KERNEL8x1 + KERNEL8x1 + KERNEL8x1 + + subs L, L, #1 + bgt _L1_M8_22 + + +_L1_M8_40: + + ands L , K1, #7 // L = K1 % 8 + ble _L1_M8_100 + +_L1_M8_42: + + KERNEL8x1 + + subs L, L, #1 + bgt _L1_M8_42 + +_L1_M8_100: + + SAVE8x1 + +_L1_M8_END: + + subs I, I, #1 + bgt _L1_M8_20 + + + + +_L1_M4_BEGIN: + + ldr I, M + tst I, #7 // any of the low 3 bits of M set? + ble _L1_END + + tst I, #4 // bit 2 of M set: a 4-row tile remains + ble _L1_M2_BEGIN + +_L1_M4_20: + + INIT4x1 + + mov BO, BC + asrs L , K1, #2 // L = K1 / 4 + ble _L1_M4_40 + +_L1_M4_22: + + KERNEL4x1 + KERNEL4x1 + KERNEL4x1 + KERNEL4x1 + + 
 subs L, L, #1 + bgt _L1_M4_22 + + +_L1_M4_40: + + ands L , K1, #3 // L = K1 % 4 + ble _L1_M4_100 + +_L1_M4_42: + + KERNEL4x1 + + subs L, L, #1 + bgt _L1_M4_42 + +_L1_M4_100: + + SAVE4x1 + +_L1_M4_END: + + + + +_L1_M2_BEGIN: + + tst I, #2 // bit 1 of M set: a 2-row tile remains + ble _L1_M1_BEGIN + +_L1_M2_20: + + INIT2x1 + + mov BO, BC + asrs L , K1, #2 // L = K1 / 4 + ble _L1_M2_40 + +_L1_M2_22: + + KERNEL2x1 + KERNEL2x1 + KERNEL2x1 + KERNEL2x1 + + subs L, L, #1 + bgt _L1_M2_22 + + +_L1_M2_40: + + ands L , K1, #3 // L = K1 % 4 + ble _L1_M2_100 + +_L1_M2_42: + + KERNEL2x1 + + subs L, L, #1 + bgt _L1_M2_42 + +_L1_M2_100: + + SAVE2x1 + +_L1_M2_END: + + + +_L1_M1_BEGIN: + + tst I, #1 // bit 0 of M set: a single row remains + ble _L1_END + +_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #2 // L = K1 / 4 + ble _L1_M1_40 + +_L1_M1_22: + + KERNEL1x1 + KERNEL1x1 + KERNEL1x1 + KERNEL1x1 + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #3 // L = K1 % 4 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1 + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +_L1_END: + + + +_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore d8-d15 saved by the prologue + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/dot.c b/kernel/arm/dot.c new file mode 100644 index 000000000..30490e291 --- /dev/null +++ b/kernel/arm/dot.c @@ -0,0 +1,64 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" +/* Strided dot product: returns the sum over i in [0, n) of x[i*inc_x] * y[i*inc_y], or 0 when n <= 0. The sum is always accumulated in double. When built with DSDOT both operands are widened to double before the multiply, as the reference DSDOT does, so the product is no longer rounded to FLOAT precision first; other builds keep the original FLOAT multiply. */ +#if defined(DSDOT) +double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#else +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#endif +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + double dot = 0.0 ; +/* a negative n means no elements; n == 0 skips the loop below and also returns 0 */ + if ( n < 0 ) return(dot); + + while(i < n) + { +#if defined(DSDOT) + dot += (double) y[iy] * (double) x[ix] ; +#else + dot += y[iy] * x[ix] ; +#endif + ix += inc_x ; + iy += inc_y ; + i++ ; + } + return(dot); +} + + diff --git a/kernel/arm/dtrmm_kernel_8x2_vfpv3.S b/kernel/arm/dtrmm_kernel_8x2_vfpv3.S new file mode 100644 index 000000000..930616635 --- /dev/null +++ b/kernel/arm/dtrmm_kernel_8x2_vfpv3.S @@ -0,0 +1,1521 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" +/* Bytes reserved below the pushed registers; the prologue leaves sp at fp - 24 - STACKSIZE = fp - 276, which is the address used for ALPHA below. */ +#define STACKSIZE 252 +/* Incoming arguments as the prologue reads them, before r0-r2 are reused as I, J and L. */ +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA d0 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +/* Frame slots addressed relative to fp. */ +#define KK [fp, #-240 ] +#define KKK [fp, #-244] +#define C [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-276 ] +/* Words above the registers pushed by the prologue (fp is set to sp - after - push plus 24, so fp plus 4 is the stack pointer at entry). */ +#define B [fp, #4 ] +#define OLD_C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] +/* Register aliases used by the loops and macros. */ +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 +/* Byte offsets passed to the pld prefetch hints. */ +#define A_PRE 128 +#define A_PRE1 160 +#define B_PRE
128 +#define C_PRE 32 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT8x2 +/* INIT8x2: clear the sixteen accumulators d8-d23 of an 8x2 tile: d8 is set to d8 - d8 and copied to the rest. NOTE(review): x - x is NaN, not 0, when d8 still holds NaN or Inf from earlier work; confirm that cannot happen here. */ + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + vmov.f64 d14, d8 + vmov.f64 d15, d8 + vmov.f64 d16, d8 + vmov.f64 d17, d8 + vmov.f64 d18, d8 + vmov.f64 d19, d8 + vmov.f64 d20, d8 + vmov.f64 d21, d8 + vmov.f64 d22, d8 + vmov.f64 d23, d8 + +.endm + +.macro KERNEL8x2_START +/* KERNEL8x2_START: first step of an unrolled run. Loads 2 doubles from BO (d24,d25) and 8 from AO (d0-d7), accumulates d0-d7 times d24 into d8-d15 and d0-d7 times d25 into d16-d23, and reloads d24,d25 and d0-d7 with the next step's operands before the last four adds. */ + vldm BO!, { d24 , d25} + vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } + + vmul.f64 d26 , d0 , d24 + vmul.f64 d29 , d0 , d25 + vmul.f64 d27 , d1 , d24 + vmul.f64 d30 , d1 , d25 + vmul.f64 d28 , d2 , d24 + vmul.f64 d31 , d2 , d25 + + pld [AO , #A_PRE] + + vadd.f64 d8 , d8 , d26 + vadd.f64 d16 , d16, d29 + vadd.f64 d9 , d9 , d27 + vadd.f64 d17 , d17, d30 + vadd.f64 d10 , d10, d28 + vadd.f64 d18 , d18, d31 + + vmul.f64 d26 , d3 , d24 + vmul.f64 d27 , d4 , d24 + vmul.f64 d28 , d5 , d24 + vmul.f64 d29 , d3 , d25 + vmul.f64 d30 , d4 , d25 + vmul.f64 d31 , d5 , d25 + + pld [AO , #A_PRE1] + + vadd.f64 d11 , d11, d26 + vadd.f64 d12 , d12, d27 + vadd.f64 d13 , d13, d28 + vadd.f64 d19 , d19, d29 + vadd.f64 d20 , d20, d30 + vadd.f64 d21 , d21, d31 + + vmul.f64 d26 , d6 , d24 + vmul.f64 d27 , d7 , d24 + vmul.f64 d29 , d6 , d25 + vmul.f64 d30 , d7 , d25 + + vldm BO!, { d24 , d25} + vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } + + vadd.f64 d14 , d14, d26 + vadd.f64 d15 , d15, d27 + vadd.f64 d22 , d22, d29 + vadd.f64 d23 , d23, d30 + +.endm + + +.macro KERNEL8x2_M +/* KERNEL8x2_M: middle step. Uses the operands already sitting in d24,d25 and d0-d7, performs the same 16 multiply-accumulates, and loads the next step's operands before the last four adds. */ + vmul.f64 d26 , d0 , d24 + vmul.f64 d29 , d0 , d25 + vmul.f64 d27 , d1 , d24 + vmul.f64 d30 , d1 , d25 + vmul.f64 d28 , d2 , d24 + vmul.f64 d31 , d2 , d25 + + pld [AO , #A_PRE] + + vadd.f64 d8 , d8 , d26 + vadd.f64 d16 , d16, d29 + vadd.f64 d9 , d9 , d27 + vadd.f64 d17 , d17, d30 + vadd.f64 d10 , d10, d28 + vadd.f64 d18 , d18, d31 
+ + vmul.f64 d26 , d3 , d24 + vmul.f64 d27 , d4 , d24 + vmul.f64 d28 , d5 , d24 + vmul.f64 d29 , d3 , d25 + vmul.f64 d30 , d4 , d25 + vmul.f64 d31 , d5 , d25 + + pld [AO , #A_PRE1] + + vadd.f64 d11 , d11, d26 + vadd.f64 d12 , d12, d27 + vadd.f64 d13 , d13, d28 + vadd.f64 d19 , d19, d29 + vadd.f64 d20 , d20, d30 + vadd.f64 d21 , d21, d31 + + vmul.f64 d26 , d6 , d24 + vmul.f64 d27 , d7 , d24 + vmul.f64 d29 , d6 , d25 + vmul.f64 d30 , d7 , d25 + + vldm BO!, { d24 , d25} + vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } + + vadd.f64 d14 , d14, d26 + vadd.f64 d15 , d15, d27 + vadd.f64 d22 , d22, d29 + vadd.f64 d23 , d23, d30 + +.endm + + +.macro KERNEL8x2_END +/* KERNEL8x2_END: last step of an unrolled run. Uses the operands already in d24,d25 and d0-d7 and performs the 16 multiply-accumulates without loading anything further from AO or BO. */ + vmul.f64 d26 , d0 , d24 + vmul.f64 d29 , d0 , d25 + vmul.f64 d27 , d1 , d24 + vmul.f64 d30 , d1 , d25 + vmul.f64 d28 , d2 , d24 + vmul.f64 d31 , d2 , d25 + + pld [AO , #A_PRE] + + vadd.f64 d8 , d8 , d26 + vadd.f64 d16 , d16, d29 + vadd.f64 d9 , d9 , d27 + vadd.f64 d17 , d17, d30 + vadd.f64 d10 , d10, d28 + vadd.f64 d18 , d18, d31 + + vmul.f64 d26 , d3 , d24 + vmul.f64 d27 , d4 , d24 + vmul.f64 d28 , d5 , d24 + vmul.f64 d29 , d3 , d25 + vmul.f64 d30 , d4 , d25 + vmul.f64 d31 , d5 , d25 + + pld [AO , #A_PRE1] + + vadd.f64 d11 , d11, d26 + vadd.f64 d12 , d12, d27 + vadd.f64 d13 , d13, d28 + vadd.f64 d19 , d19, d29 + vadd.f64 d20 , d20, d30 + vadd.f64 d21 , d21, d31 + + vmul.f64 d26 , d6 , d24 + vmul.f64 d27 , d7 , d24 + vmul.f64 d29 , d6 , d25 + vmul.f64 d30 , d7 , d25 + + vadd.f64 d14 , d14, d26 + vadd.f64 d15 , d15, d27 + vadd.f64 d22 , d22, d29 + vadd.f64 d23 , d23, d30 + +.endm + + + + +.macro KERNEL8x2 +/* KERNEL8x2: self-contained single k-step. Loads 2 doubles from BO and 8 from AO (post-incremented) and performs the 16 multiply-accumulates into d8-d23. */ + vldm BO!, { d24 , d25} + vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } + + vmul.f64 d26 , d0 , d24 + vmul.f64 d29 , d0 , d25 + vmul.f64 d27 , d1 , d24 + vmul.f64 d30 , d1 , d25 + vmul.f64 d28 , d2 , d24 + vmul.f64 d31 , d2 , d25 + + pld [AO , #A_PRE] + + vadd.f64 d8 , d8 , d26 + vadd.f64 d16 , d16, d29 + vadd.f64 d9 , d9 , d27 + vadd.f64 d17 , d17, d30 + vadd.f64 d10 , d10, d28 + vadd.f64 d18 , d18, d31 
+ + vmul.f64 d26 , d3 , d24 + vmul.f64 d27 , d4 , d24 + vmul.f64 d28 , d5 , d24 + vmul.f64 d29 , d3 , d25 + vmul.f64 d30 , d4 , d25 + vmul.f64 d31 , d5 , d25 + + pld [AO , #A_PRE1] + + vadd.f64 d11 , d11, d26 + vadd.f64 d12 , d12, d27 + vadd.f64 d13 , d13, d28 + vadd.f64 d19 , d19, d29 + vadd.f64 d20 , d20, d30 + vadd.f64 d21 , d21, d31 + + vmul.f64 d26 , d6 , d24 + vmul.f64 d27 , d7 , d24 + vmul.f64 d29 , d6 , d25 + vmul.f64 d30 , d7 , d25 + + vadd.f64 d14 , d14, d26 + vadd.f64 d15 , d15, d27 + vadd.f64 d22 , d22, d29 + vadd.f64 d23 , d23, d30 + +.endm + +.macro SAVE8x2 +/* SAVE8x2: multiply d8-d23 by ALPHA and store them, d16-d23 to CO2 and d8-d15 to CO1, advancing both pointers. The previous contents of C are not read in this file's SAVE macros. */ + vldr d0, ALPHA + + vmul.f64 d8 , d0 , d8 + vmul.f64 d9 , d0 , d9 + vmul.f64 d10, d0 , d10 + vmul.f64 d11, d0 , d11 + vmul.f64 d12, d0 , d12 + vmul.f64 d13, d0 , d13 + vmul.f64 d14, d0 , d14 + vmul.f64 d15, d0 , d15 + + vmul.f64 d16, d0 , d16 + vmul.f64 d17, d0 , d17 + vmul.f64 d18, d0 , d18 + vmul.f64 d19, d0 , d19 + vmul.f64 d20, d0 , d20 + vmul.f64 d21, d0 , d21 + vmul.f64 d22, d0 , d22 + vmul.f64 d23, d0 , d23 + + vstm CO2!, { d16, d17, d18 , d19 , d20 , d21 , d22 , d23 } + vstm CO1!, { d8 , d9 , d10 , d11 , d12, d13 , d14 , d15 } + +.endm + + +/*************************************************************************************/ + + +.macro INIT4x2 +/* INIT4x2: clear d8-d15 by subtracting each register from itself (same NaN/Inf caveat as INIT8x2). */ + vsub.f64 d8 , d8 , d8 + vsub.f64 d9 , d9 , d9 + vsub.f64 d10, d10, d10 + vsub.f64 d11, d11, d11 + vsub.f64 d12, d12, d12 + vsub.f64 d13, d13, d13 + vsub.f64 d14, d14, d14 + vsub.f64 d15, d15, d15 + +.endm + +.macro KERNEL4x2 +/* KERNEL4x2: one k-step. Loads 4 doubles from AO and 2 from BO (post-incremented); d8-d11 accumulate d0-d3 times d4 and d12-d15 accumulate d0-d3 times d5. */ + vldm AO!, { d0, d1 , d2, d3 } + vldm BO!, { d4, d5 } + + vmul.f64 d6 , d0 , d4 + vmul.f64 d7 , d1 , d4 + vadd.f64 d8 , d8 , d6 + vadd.f64 d9 , d9 , d7 + + vmul.f64 d6 , d2 , d4 + vmul.f64 d7 , d3 , d4 + vadd.f64 d10, d10, d6 + vadd.f64 d11, d11, d7 + + vmul.f64 d6 , d0 , d5 + vmul.f64 d7 , d1 , d5 + vadd.f64 d12, d12, d6 + vadd.f64 d13, d13, d7 + + vmul.f64 d6 , d2 , d5 + vmul.f64 d7 , d3 , d5 + vadd.f64 d14, d14, d6 + vadd.f64 d15, d15, d7 + +.endm + +.macro SAVE4x2 +/* SAVE4x2: multiply d8-d15 by ALPHA, store d8-d11 to CO1 and d12-d15 to CO2, advancing both pointers. */ + vldr d0, ALPHA + + vmul.f64 d8 , d0 , d8 + 
 vmul.f64 d9 , d0 , d9 + vmul.f64 d10, d0 , d10 + vmul.f64 d11, d0 , d11 + vmul.f64 d12, d0 , d12 + vmul.f64 d13, d0 , d13 + vmul.f64 d14, d0 , d14 + vmul.f64 d15, d0 , d15 + + vstm CO1!, { d8 , d9 , d10 , d11 } + vstm CO2!, { d12, d13 ,d14 , d15 } + +.endm + + + +/*************************************************************************************/ + +.macro INIT2x2 +/* INIT2x2: clear d8, d9, d12, d13 (x - x idiom, same NaN/Inf caveat as INIT8x2). */ + vsub.f64 d8 , d8 , d8 + vsub.f64 d9 , d9 , d9 + vsub.f64 d12, d12, d12 + vsub.f64 d13, d13, d13 + +.endm + +.macro KERNEL2x2 +/* KERNEL2x2: one k-step. Loads 2 doubles from AO and 2 from BO (post-incremented); d8,d9 accumulate d0,d1 times d4 and d12,d13 accumulate d0,d1 times d5. */ + vldm AO!, { d0, d1 } + vldm BO!, { d4, d5 } + + vmul.f64 d6 , d0 , d4 + vmul.f64 d7 , d1 , d4 + vadd.f64 d8 , d8 , d6 + vadd.f64 d9 , d9 , d7 + + vmul.f64 d6 , d0 , d5 + vmul.f64 d7 , d1 , d5 + vadd.f64 d12, d12, d6 + vadd.f64 d13, d13, d7 + +.endm + +.macro SAVE2x2 +/* SAVE2x2: multiply d8,d9,d12,d13 by ALPHA, store d8,d9 to CO1 and d12,d13 to CO2, advancing both pointers. */ + vldr d0, ALPHA + + vmul.f64 d8 , d0 , d8 + vmul.f64 d9 , d0 , d9 + vmul.f64 d12, d0 , d12 + vmul.f64 d13, d0 , d13 + + vstm CO1!, { d8 , d9 } + vstm CO2!, { d12, d13 } + +.endm + +/*************************************************************************************/ + +.macro INIT1x2 +/* INIT1x2: clear d8 and d12 (x - x idiom, same NaN/Inf caveat as INIT8x2). */ + vsub.f64 d8 , d8 , d8 + vsub.f64 d12, d12, d12 + +.endm + +.macro KERNEL1x2 +/* KERNEL1x2: one k-step. Loads 1 double from AO and 2 from BO (post-incremented); d8 accumulates d0 times d4, d12 accumulates d0 times d5. */ + vldm AO!, { d0 } + vldm BO!, { d4, d5 } + + vmul.f64 d6 , d0 , d4 + vadd.f64 d8 , d8 , d6 + + vmul.f64 d6 , d0 , d5 + vadd.f64 d12, d12, d6 + +.endm + +.macro SAVE1x2 +/* SAVE1x2: multiply d8 and d12 by ALPHA, store d8 to CO1 and d12 to CO2, advancing both pointers. */ + vldr d0, ALPHA + + vmul.f64 d8 , d0 , d8 + vmul.f64 d12, d0 , d12 + + vstm CO1!, { d8 } + vstm CO2!, { d12} + +.endm + +/*************************************************************************************/ + +.macro INIT8x1 +/* INIT8x1: clear d8-d15 (x - x idiom, same NaN/Inf caveat as INIT8x2). */ + vsub.f64 d8 , d8 , d8 + vsub.f64 d9 , d9 , d9 + vsub.f64 d10, d10, d10 + vsub.f64 d11, d11, d11 + vsub.f64 d12, d12, d12 + vsub.f64 d13, d13, d13 + vsub.f64 d14, d14, d14 + vsub.f64 d15, d15, d15 + +.endm + +.macro KERNEL8x1 +/* KERNEL8x1: one k-step. Loads 8 doubles from AO into d0-d7 and 1 from BO into d24 (post-incremented); d8-d15 accumulate d0-d7 times d24, with d26-d29 as scratch. */ + vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } + vldm BO!, { d24 } + + vmul.f64 d26 , d0 , d24 + vmul.f64 d27 , d1 , d24 + vadd.f64 d8 , d8 , d26 + vadd.f64 d9 , d9 , d27 + + vmul.f64 d28 , d2 , d24 + vmul.f64 d29 , d3 , 
 d24 + vadd.f64 d10 , d10, d28 + vadd.f64 d11 , d11, d29 + + vmul.f64 d26 , d4 , d24 + vmul.f64 d27 , d5 , d24 + vadd.f64 d12 , d12, d26 + vadd.f64 d13 , d13, d27 + + vmul.f64 d28 , d6 , d24 + vmul.f64 d29 , d7 , d24 + vadd.f64 d14 , d14, d28 + vadd.f64 d15 , d15, d29 + + +.endm + +.macro SAVE8x1 +/* SAVE8x1: multiply d8-d15 by ALPHA and store them to CO1, advancing CO1. */ + vldr d0, ALPHA + + vmul.f64 d8 , d0 , d8 + vmul.f64 d9 , d0 , d9 + vmul.f64 d10, d0 , d10 + vmul.f64 d11, d0 , d11 + vmul.f64 d12, d0 , d12 + vmul.f64 d13, d0 , d13 + vmul.f64 d14, d0 , d14 + vmul.f64 d15, d0 , d15 + + vstm CO1!, { d8 , d9 , d10 , d11 , d12, d13 ,d14 , d15 } + +.endm + + +/*************************************************************************************/ + +.macro INIT4x1 +/* INIT4x1: clear d8-d11 (x - x idiom, same NaN/Inf caveat as INIT8x2). */ + vsub.f64 d8 , d8 , d8 + vsub.f64 d9 , d9 , d9 + vsub.f64 d10, d10, d10 + vsub.f64 d11, d11, d11 + +.endm + +.macro KERNEL4x1 +/* KERNEL4x1: one k-step. Loads 4 doubles from AO and 1 from BO (post-incremented); d8-d11 accumulate d0-d3 times d4. */ + vldm AO!, { d0, d1 , d2, d3 } + vldm BO!, { d4 } + + vmul.f64 d6 , d0 , d4 + vmul.f64 d7 , d1 , d4 + vadd.f64 d8 , d8 , d6 + vadd.f64 d9 , d9 , d7 + + vmul.f64 d6 , d2 , d4 + vmul.f64 d7 , d3 , d4 + vadd.f64 d10, d10, d6 + vadd.f64 d11, d11, d7 + +.endm + +.macro SAVE4x1 +/* SAVE4x1: multiply d8-d11 by ALPHA and store them to CO1, advancing CO1. */ + vldr d0, ALPHA + + vmul.f64 d8 , d0 , d8 + vmul.f64 d9 , d0 , d9 + vmul.f64 d10, d0 , d10 + vmul.f64 d11, d0 , d11 + + vstm CO1!, { d8 , d9 , d10 , d11 } + +.endm + +/*************************************************************************************/ + +.macro INIT2x1 +/* INIT2x1: clear d8 and d9 (x - x idiom, same NaN/Inf caveat as INIT8x2). */ + vsub.f64 d8 , d8 , d8 + vsub.f64 d9 , d9 , d9 + +.endm + +.macro KERNEL2x1 +/* KERNEL2x1: one k-step. Loads 2 doubles from AO and 1 from BO (post-incremented); d8,d9 accumulate d0,d1 times d4. */ + vldm AO!, { d0, d1 } + vldm BO!, { d4 } + + vmul.f64 d6 , d0 , d4 + vmul.f64 d7 , d1 , d4 + vadd.f64 d8 , d8 , d6 + vadd.f64 d9 , d9 , d7 + +.endm + +.macro SAVE2x1 +/* SAVE2x1: multiply d8,d9 by ALPHA and store them to CO1, advancing CO1. */ + vldr d0, ALPHA + + vmul.f64 d8 , d0 , d8 + vmul.f64 d9 , d0 , d9 + + vstm CO1!, { d8 , d9 } + +.endm + +/*************************************************************************************/ + +.macro INIT1x1 +/* INIT1x1: clear d8 (x - x idiom, same NaN/Inf caveat as INIT8x2). */ + vsub.f64 d8 , d8 , d8 + +.endm + +.macro KERNEL1x1 +/* KERNEL1x1: one k-step. Loads 1 double from AO and 1 from BO (post-incremented); d8 accumulates d0 times d4. */ + vldm AO!, { d0 } + vldm BO!, { d4 } + + vmul.f64 d6 , d0 , d4 + vadd.f64 d8 , d8 , d6 + +.endm + 
+.macro SAVE1x1 +/* SAVE1x1: multiply d8 by ALPHA and store it to CO1, advancing CO1. */ + vldr d0, ALPHA + + vmul.f64 d8 , d0 , d8 + + vstm CO1!, { d8 } + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ +/* Routine body: saves state, then walks C two columns at a time (J loop) with row tiles of 8, 4, 2 and 1; a final pass handles an odd last column. KK carries the running offset and KKK the trip count used for the current tile. Always returns 0 in r0. */ + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { d8 - d15} // save d8-d15 (64 bytes starting at fp - 128) + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 8 (elements to bytes) + str r3, LDC + + ldr r3, OLD_C + str r3, C + + ldr BC, B + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif + str r3 , KK + + ldr J, N + asrs J, J, #1 // J = N / 2 + ble _L1_BEGIN +/* NOTE(review): asrs, ands and tst leave the V flag unchanged, while ble also tests V (taken when Z is set or N differs from V); the ble branches in this routine therefore appear to rely on V being clear on entry. Confirm. */ +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add CO2, CO1, r4 // CO2 = C + LDC + add r3 , CO2, r4 // C = CO2 + LDC + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + +_L2_M8_BEGIN: + + ldr I, M + asrs I, I, #3 // I = M / 8 + ble _L2_M4_BEGIN + +_L2_M8_20: + + pld [CO1, #C_PRE] + pld [CO2, #C_PRE] + + INIT8x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #6 // 8 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #8 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + asrs K1 , L, #3 // K1 = L / 8 + ble _L2_M8_40 + .align 5 + +_L2_M8_22: + + pld [BO , #B_PRE] + KERNEL8x2_START + KERNEL8x2_M + pld [BO , #B_PRE] + KERNEL8x2_M + KERNEL8x2_M + + pld [BO , #B_PRE] + KERNEL8x2_M + 
 KERNEL8x2_M + pld [BO , #B_PRE] + KERNEL8x2_M + KERNEL8x2_END + + subs K1, K1, #1 + bgt _L2_M8_22 + + +_L2_M8_40: + + ands K1 , L, #7 // K1 = L % 8 + ble _L2_M8_100 + +_L2_M8_42: + + KERNEL8x2 + + subs K1, K1, #1 + bgt _L2_M8_42 + +_L2_M8_100: + + SAVE8x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #6 // 8 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #8 // number of values in AO + str r3 , KK +#endif + + +_L2_M8_END: + + subs I, I, #1 + bgt _L2_M8_20 + + +_L2_M4_BEGIN: + + ldr I, M + tst I , #7 // any of the low 3 bits of M set? + ble _L2_END + + tst I , #4 // bit 2 of M set: a 4-row tile remains + ble _L2_M2_BEGIN + +_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #4 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + asrs K1, L, #3 // K1 = L / 8 + ble _L2_M4_40 + .align 5 + +_L2_M4_22: + + KERNEL4x2 + KERNEL4x2 + KERNEL4x2 + KERNEL4x2 + + KERNEL4x2 + KERNEL4x2 + KERNEL4x2 + KERNEL4x2 + + subs K1, K1, #1 + bgt _L2_M4_22 + + +_L2_M4_40: + + ands K1, L, #7 // K1 = L % 8 + ble _L2_M4_100 + +_L2_M4_42: + + KERNEL4x2 + + subs K1, K1, #1 + bgt _L2_M4_42 + +_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 +#endif + 
+#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + +_L2_M4_END: + + + +_L2_M2_BEGIN: + + tst I, #2 // bit 1 of M set: a 2-row tile remains + ble _L2_M1_BEGIN + +_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #2 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + asrs K1, L, #2 // K1 = L / 4 + ble _L2_M2_40 + +_L2_M2_22: + + KERNEL2x2 + KERNEL2x2 + KERNEL2x2 + KERNEL2x2 + + subs K1, K1, #1 + bgt _L2_M2_22 + + +_L2_M2_40: + + ands K1, L, #3 // K1 = L % 4 + ble _L2_M2_100 + +_L2_M2_42: + + KERNEL2x2 + + subs K1, K1, #1 + bgt _L2_M2_42 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + +_L2_M2_END: + + +_L2_M1_BEGIN: + + tst I, #1 // bit 0 of M set: a single row remains + ble _L2_END + +_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else 
+ ldr L , KK +#ifdef LEFT + add L , L , #1 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + asrs K1, L, #2 // K1 = L / 4 + ble _L2_M1_40 + +_L2_M1_22: + + KERNEL1x2 + KERNEL1x2 + KERNEL1x2 + KERNEL1x2 + + subs K1, K1, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands K1, L, #3 // K1 = L % 4 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2 + + subs K1, K1, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #4 // k * 2 * 8 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in BO + str r3 , KK +#endif + + subs J , #1 // j-- + bgt _L2_BEGIN + + +_L1_BEGIN: + + ldr J, N + tst J , #1 // bit 0 of N set: one column remains + ble _L999 + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , CO1, r4 // C = CO1 + LDC + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + + + +_L1_M8_BEGIN: + + ldr I, M + asrs I, I, #3 // I = M / 8 + ble _L1_M4_BEGIN + +_L1_M8_20: + + INIT8x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #6 // 8 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #8 // number of values in AO +#else + add L , L , #1 // number of values in BO 
+#endif + str L , KKK +#endif + + asrs K1, L, #3 // K1 = L / 8 + ble _L1_M8_40 + +_L1_M8_22: + + KERNEL8x1 + KERNEL8x1 + KERNEL8x1 + KERNEL8x1 + KERNEL8x1 + KERNEL8x1 + KERNEL8x1 + KERNEL8x1 + + subs K1, K1, #1 + bgt _L1_M8_22 + + +_L1_M8_40: + + ands K1, L, #7 // K1 = L % 8 + ble _L1_M8_100 + +_L1_M8_42: + + KERNEL8x1 + + subs K1, K1, #1 + bgt _L1_M8_42 + +_L1_M8_100: + + SAVE8x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #6 // 8 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #8 // number of values in AO + str r3 , KK +#endif + + +_L1_M8_END: + + subs I, I, #1 + bgt _L1_M8_20 + + + + +_L1_M4_BEGIN: + + ldr I, M + tst I, #7 // any of the low 3 bits of M set? + ble _L1_END + + tst I, #4 // bit 2 of M set: a 4-row tile remains + ble _L1_M2_BEGIN + +_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #4 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + asrs K1, L, #2 // K1 = L / 4 + ble _L1_M4_40 + +_L1_M4_22: + + KERNEL4x1 + KERNEL4x1 + KERNEL4x1 + KERNEL4x1 + + subs K1, K1, #1 + bgt _L1_M4_22 + + +_L1_M4_40: + + ands K1, L, #3 // K1 = L % 4 + ble _L1_M4_100 + +_L1_M4_42: + + KERNEL4x1 + + subs K1, K1, #1 + bgt _L1_M4_42 + +_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 1 double value 
+ add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + +_L1_M4_END: + + + + +_L1_M2_BEGIN: + + tst I, #2 // bit 1 of M set: a 2-row tile remains + ble _L1_M1_BEGIN + +_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #2 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + asrs K1, L, #2 // K1 = L / 4 + ble _L1_M2_40 + +_L1_M2_22: + + KERNEL2x1 + KERNEL2x1 + KERNEL2x1 + KERNEL2x1 + + subs K1, K1, #1 + bgt _L1_M2_22 + + +_L1_M2_40: + + ands K1 , L, #3 // K1 = L % 4 + ble _L1_M2_100 + +_L1_M2_42: + + KERNEL2x1 + + subs K1, K1, #1 + bgt _L1_M2_42 + +_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + +_L1_M2_END: + + + +_L1_M1_BEGIN: + + tst I, #1 // bit 0 of M set: a single row remains + ble _L1_END + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || 
 (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #1 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + asrs K1, L, #2 // K1 = L / 4 + ble _L1_M1_40 + +_L1_M1_22: + + KERNEL1x1 + KERNEL1x1 + KERNEL1x1 + KERNEL1x1 + + subs K1, K1, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands K1 , L, #3 // K1 = L % 4 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1 + + subs K1, K1, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L1_END: + + + +_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore d8-d15 saved by the prologue + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/gemv_n.c b/kernel/arm/gemv_n.c new file mode 100644 index 000000000..aedcca965 --- /dev/null +++ b/kernel/arm/gemv_n.c @@ -0,0 +1,67 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +/************************************************************************************** + * * 2013/09/14 Saar + * * BLASTEST float : OK + * * BLASTEST double : OK + * CTEST : OK + * TEST : OK + * * + * **************************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp; + + ix = 0; + a_ptr = a; + + for (j=0; j + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + BLASLONG max=0; + + if (n < 0 || inc_x < 1 ) return(max); + + maxf=ABS(x[0]); + + while(i < n) + { + if( ABS(x[ix]) > ABS(maxf) ) + { + max = i; + maxf = ABS(x[ix]); + 
} + ix += inc_x; + i++; + } + return(max+1); +} + + diff --git a/kernel/arm/iamin.c b/kernel/arm/iamin.c new file mode 100644 index 000000000..fdb5d7a10 --- /dev/null +++ b/kernel/arm/iamin.c @@ -0,0 +1,75 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + BLASLONG min=0; + + if (n < 0 || inc_x < 1 ) return(min); + + minf=ABS(x[0]); + + while(i < n) + { + if( ABS(x[ix]) < ABS(minf) ) + { + min = i; + minf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(min+1); +} + + diff --git a/kernel/arm/imax.c b/kernel/arm/imax.c new file mode 100644 index 000000000..e3e4b9a6c --- /dev/null +++ b/kernel/arm/imax.c @@ -0,0 +1,67 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + BLASLONG max=0; + + if (n < 0 || inc_x < 1 ) return(max); + + maxf=x[0]; + + while(i < n) + { + if( x[ix] > maxf ) + { + max = i; + maxf = x[ix]; + } + ix += inc_x; + i++; + } + return(max+1); +} + + diff --git a/kernel/arm/imin.c b/kernel/arm/imin.c new file mode 100644 index 000000000..fbcadc2fd --- /dev/null +++ b/kernel/arm/imin.c @@ -0,0 +1,65 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +/************************************************************************************** +* 2013/08/19 Saar +* BLASTEST float +* BLASTEST double +* +**************************************************************************************/ + +#include "common.h" +#include + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + BLASLONG min=0; + + if (n < 0 || inc_x < 1 ) return(min); + + minf=x[0]; + + while(i < n) + { + if( x[ix] > minf ) + { + min = i; + minf = x[ix]; + } + ix += inc_x; + i++; + } + return(min+1); +} + + diff --git a/kernel/arm/izamax.c b/kernel/arm/izamax.c new file mode 100644 index 000000000..a6ba86388 --- /dev/null +++ b/kernel/arm/izamax.c @@ -0,0 +1,81 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf[2]; + BLASLONG max=0; + BLASLONG inc_x2; + + if (n < 0 || inc_x < 1 ) return(max); + + inc_x2 = 2 * inc_x; + + maxf[0] = ABS(x[ix]); + maxf[1] = ABS(x[ix+1]); + + while(i < n) + { + if( CABS1(x,ix) > CABS1(maxf,0) ) + { + max = i; + maxf[0] = ABS(x[ix]); + maxf[1] = ABS(x[ix+1]); + } + ix += inc_x2; + i++; + } + return(max+1); +} + + diff --git a/kernel/arm/izamin.c b/kernel/arm/izamin.c new file mode 100644 index 000000000..45c2a7c9c --- /dev/null +++ b/kernel/arm/izamin.c @@ -0,0 +1,81 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf[2]; + BLASLONG min=0; + BLASLONG inc_x2; + + if (n < 0 || inc_x < 1 ) return(min); + + inc_x2 = 2 * inc_x; + + minf[0] = ABS(x[ix]); + minf[1] = ABS(x[ix+1]); + + while(i < n) + { + if( CABS1(x,ix) < CABS1(minf,0) ) + { + min = i; + minf[0] = ABS(x[ix]); + minf[1] = ABS(x[ix+1]); + } + ix += inc_x2; + i++; + } + return(min+1); +} + + diff --git a/kernel/arm/max.c b/kernel/arm/max.c new file mode 100644 index 000000000..3239e3408 --- /dev/null +++ b/kernel/arm/max.c @@ -0,0 +1,63 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + + if (n < 0 || inc_x < 1 ) return(maxf); + + maxf=x[0]; + + while(i < n) + { + if( x[ix] > maxf ) + { + maxf = x[ix]; + } + ix += inc_x; + i++; + } + return(maxf); +} + + diff --git a/kernel/arm/min.c b/kernel/arm/min.c new file mode 100644 index 000000000..de4c4719a --- /dev/null +++ b/kernel/arm/min.c @@ -0,0 +1,63 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + + if (n < 0 || inc_x < 1 ) return(minf); + + minf=x[0]; + + while(i < n) + { + if( x[ix] < minf ) + { + minf = x[ix]; + } + ix += inc_x; + i++; + } + return(minf); +} + + diff --git a/kernel/arm/nrm2.c b/kernel/arm/nrm2.c new file mode 100644 index 000000000..d65c5a410 --- /dev/null +++ b/kernel/arm/nrm2.c @@ -0,0 +1,88 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/13 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT scale = 0.0; + FLOAT ssq = 1.0; + FLOAT absxi = 0.0; + + + if (n < 0 || inc_x < 1 ) return(0.0); + if ( n == 1 ) return( ABS(x[0]) ); + + n *= inc_x; + while(i < n) + { + + if ( x[i] != 0.0 ) + { + absxi = ABS( x[i] ); + if ( scale < absxi ) + { + ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi ); + scale = absxi ; + } + else + { + ssq += ( absxi/scale ) * ( absxi/scale ); + } + + } + i += inc_x; + } + scale = scale * sqrt( ssq ); + return(scale); + +} + + diff --git a/kernel/arm/rot.c b/kernel/arm/rot.c new file mode 100644 index 000000000..aa60b4471 --- /dev/null +++ b/kernel/arm/rot.c @@ -0,0 +1,62 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + + if ( n <= 0 ) return(0); + + while(i < n) + { + temp = c*x[ix] + s*y[iy] ; + y[iy] = c*y[iy] - s*x[ix] ; + x[ix] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/arm/scal.c b/kernel/arm/scal.c new file mode 100644 index 000000000..d385c46bc --- /dev/null +++ b/kernel/arm/scal.c @@ -0,0 +1,58 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + + if ( n < 0 || inc_x < 1 ) return(0); + if ( da == 1.0 ) return(0); + + n *= inc_x; + while(i < n) + { + + x[i] = da * x[i] ; + i += inc_x ; + + } + return(0); + +} + + diff --git a/kernel/arm/swap.c b/kernel/arm/swap.c new file mode 100644 index 000000000..1ca9e7607 --- /dev/null +++ b/kernel/arm/swap.c @@ -0,0 +1,62 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/08/20 Saar
+*    BLASTEST float     OK
+*    BLASTEST double    OK
+*
+**************************************************************************************/
+
+#include "common.h"
+
+/*
+ * Exchange n elements between two strided real vectors:
+ *     x[k*inc_x] <-> y[k*inc_y]    for k = 0 .. n-1
+ *
+ * Always returns 0; a negative n leaves both vectors untouched.
+ * dummy0, dummy1, dummy3, dummy and dummy2 are not referenced.
+ */
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+    BLASLONG k;
+    BLASLONG ix = 0, iy = 0;
+
+    if ( n < 0 )
+        return(0);
+
+    for ( k = 0; k < n; k++ )
+    {
+        FLOAT held = x[ix];
+
+        x[ix] = y[iy];
+        y[iy] = held;
+
+        ix += inc_x;
+        iy += inc_y;
+    }
+
+    return(0);
+}
+
+
diff --git a/kernel/arm/zamax.c b/kernel/arm/zamax.c
new file mode 100644
index 000000000..8c2a5c346
--- /dev/null
+++ b/kernel/arm/zamax.c
@@ -0,0 +1,81 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*    BLASTEST float     : OK
+*    BLASTEST double    : OK
+*    CTEST              : NoTest
+*    TEST               : NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+/* |re| + |im| of the complex value stored at x[i], x[i+1] */
+#define CABS1(x,i)    (ABS((x)[i])+ABS((x)[(i)+1]))
+
+/*
+ * Return the largest |re| + |im| found among n complex elements of x.
+ * Element k starts at x[2*k*inc_x]; the real part is followed directly
+ * by the imaginary part.
+ *
+ * Returns 0.0 when n <= 0 or inc_x < 1.  The n == 0 case is rejected
+ * up front because the running maximum is seeded from element 0, which
+ * an empty vector does not have.
+ */
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG k;
+    BLASLONG ix = 0;
+    BLASLONG inc_x2;
+    FLOAT best;
+    FLOAT cand;
+
+    if ( n <= 0 || inc_x < 1 )
+        return(0.0);
+
+    inc_x2 = 2 * inc_x;
+
+    /* seed with element 0, then scan the remaining n-1 elements */
+    best = CABS1(x,0);
+
+    for ( k = 1; k < n; k++ )
+    {
+        ix += inc_x2;
+        cand = CABS1(x,ix);
+
+        /* strict comparison: ties keep the earlier value */
+        if ( cand > best )
+            best = cand;
+    }
+
+    return(best);
+}
+
+
diff --git a/kernel/arm/zamin.c b/kernel/arm/zamin.c
new file mode 100644
index 000000000..6956ced0e
--- /dev/null
+++ b/kernel/arm/zamin.c
@@ -0,0 +1,81 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1.
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*    BLASTEST float     : OK
+*    BLASTEST double    : OK
+*    CTEST              : NoTest
+*    TEST               : NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+/* |re| + |im| of the complex value stored at x[i], x[i+1] */
+#define CABS1(x,i)    (ABS((x)[i])+ABS((x)[(i)+1]))
+
+/*
+ * Return the smallest |re| + |im| found among n complex elements of x.
+ * Element k starts at x[2*k*inc_x]; the real part is followed directly
+ * by the imaginary part.
+ *
+ * Returns 0.0 when n <= 0 or inc_x < 1.  The n == 0 case is rejected
+ * up front because the running minimum is seeded from element 0, which
+ * an empty vector does not have.
+ */
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG k;
+    BLASLONG ix = 0;
+    BLASLONG inc_x2;
+    FLOAT best;
+    FLOAT cand;
+
+    if ( n <= 0 || inc_x < 1 )
+        return(0.0);
+
+    inc_x2 = 2 * inc_x;
+
+    /* seed with element 0, then scan the remaining n-1 elements */
+    best = CABS1(x,0);
+
+    for ( k = 1; k < n; k++ )
+    {
+        ix += inc_x2;
+        cand = CABS1(x,ix);
+
+        /* strict comparison: ties keep the earlier value */
+        if ( cand < best )
+            best = cand;
+    }
+
+    return(best);
+}
+
+
diff --git a/kernel/arm/zasum.c b/kernel/arm/zasum.c
new file mode 100644
index 000000000..13acfc0f0
--- /dev/null
+++ b/kernel/arm/zasum.c
@@ -0,0 +1,71 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*    BLASTEST float     : OK
+*    BLASTEST double    : OK
+*    CTEST              : OK
+*    TEST               : OK
+*
+**************************************************************************************/
+
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+/* |re| + |im| of the complex value stored at x[i], x[i+1] */
+#define CABS1(x,i)    (ABS((x)[i])+ABS((x)[(i)+1]))
+
+/*
+ * Return the sum of |re| + |im| over n complex elements of x.
+ * Element k starts at x[2*k*inc_x]; the real part is followed directly
+ * by the imaginary part.
+ *
+ * Returns 0.0 when n < 0 or inc_x < 1; n == 0 also yields 0.0 because
+ * the loop body never runs.
+ */
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG pos;
+    BLASLONG limit;
+    BLASLONG inc_x2;
+    FLOAT sumf = 0.0;
+
+    if ( n < 0 || inc_x < 1 )
+        return(sumf);
+
+    inc_x2 = 2 * inc_x;
+
+    /* first index past the last element that is visited */
+    limit = n * inc_x2;
+
+    for ( pos = 0; pos < limit; pos += inc_x2 )
+        sumf += CABS1(x,pos);
+
+    return(sumf);
+}
+
+
diff --git a/kernel/arm/zaxpy.c b/kernel/arm/zaxpy.c
new file mode 100644
index 000000000..28a4380fb
--- /dev/null
+++ b/kernel/arm/zaxpy.c
@@ -0,0 +1,72 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/15 Saar
+*    BLASTEST float     : OK
+*    BLASTEST double    : OK
+*    CTEST              : OK
+*    TEST               : OK
+*
+**************************************************************************************/
+
+
+#include "common.h"
+
+/*
+ * Complex axpy on interleaved (re, im) storage.  With
+ * alpha = da_r + i*da_i, each of the n elements is updated as
+ *     y[k] += alpha * x[k]          (CONJ not defined)
+ *     y[k] += alpha * conj(x[k])    (CONJ defined)
+ * Element k of x starts at x[2*k*inc_x], element k of y at
+ * y[2*k*inc_y].
+ *
+ * Always returns 0.  y is left untouched when n < 0 or when both
+ * da_r and da_i compare equal to 0.0.
+ * dummy0, dummy1, dummy and dummy2 are not referenced.
+ */
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+    BLASLONG k;
+    BLASLONG ix = 0, iy = 0;
+    BLASLONG inc_x2, inc_y2;
+
+    if ( n < 0 )
+        return(0);
+
+    /* nothing is added when the multiplier is exactly zero */
+    if ( da_r == 0.0 && da_i == 0.0 )
+        return(0);
+
+    inc_x2 = 2 * inc_x;
+    inc_y2 = 2 * inc_y;
+
+    for ( k = 0; k < n; k++ )
+    {
+#if !defined(CONJ)
+        y[iy]   += ( da_r * x[ix]   - da_i * x[ix+1] ) ;
+        y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ;
+#else
+        y[iy]   += ( da_r * x[ix]   + da_i * x[ix+1] ) ;
+        y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ;
+#endif
+        ix += inc_x2;
+        iy += inc_y2;
+    }
+
+    return(0);
+}
+
+
diff --git a/kernel/arm/zcopy.c b/kernel/arm/zcopy.c
new file mode 100644
index 000000000..654711240
--- /dev/null
+++ b/kernel/arm/zcopy.c
@@ -0,0 +1,63 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3.
Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*    BLASTEST float     : OK
+*    BLASTEST double    : OK
+*    CTEST              : OK
+*    TEST               : OK
+*
+**************************************************************************************/
+
+#include "common.h"
+
+/*
+ * Copy n complex elements from x to y.  Element k of x starts at
+ * x[2*k*inc_x] and element k of y at y[2*k*inc_y]; the real part is
+ * followed directly by the imaginary part.
+ *
+ * Always returns 0; a negative n copies nothing.
+ */
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+    BLASLONG k;
+    BLASLONG src = 0, dst = 0;
+    BLASLONG inc_x2, inc_y2;
+
+    if ( n < 0 )
+        return(0);
+
+    inc_x2 = 2 * inc_x;
+    inc_y2 = 2 * inc_y;
+
+    for ( k = 0; k < n; k++ )
+    {
+        y[dst]   = x[src] ;
+        y[dst+1] = x[src+1] ;
+
+        src += inc_x2;
+        dst += inc_y2;
+    }
+
+    return(0);
+}
+
+
diff --git a/kernel/arm/zdot.c b/kernel/arm/zdot.c
new file mode 100644
index 000000000..096ced9db
--- /dev/null
+++ b/kernel/arm/zdot.c
@@ -0,0 +1,78 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*    BLASTEST float     : FAIL
+*    BLASTEST double    : FAIL
+*    CTEST              : OK
+*    TEST               : OK
+*
+**************************************************************************************/
+
+#include "common.h"
+/* NOTE(review): the header name was missing from this directive; */
+/* <complex.h> is presumed for a _Complex-returning kernel - confirm. */
+#include <complex.h>
+
+/*
+ * Dot product of two complex vectors held as interleaved (re, im)
+ * pairs.  Accumulates over k = 0 .. n-1
+ *     x[k] * y[k]          (CONJ not defined)
+ *     conj(x[k]) * y[k]    (CONJ defined)
+ * Element k of x starts at x[2*k*inc_x], element k of y at
+ * y[2*k*inc_y].
+ *
+ * Returns the accumulated value, or 0 + 0i when n < 1.
+ */
+FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+    BLASLONG k;
+    BLASLONG ix = 0, iy = 0;
+    BLASLONG inc_x2, inc_y2;
+    FLOAT acc_re = 0.0;
+    FLOAT acc_im = 0.0;
+    FLOAT _Complex result;
+
+    __real__ result = 0.0 ;
+    __imag__ result = 0.0 ;
+
+    if ( n < 1 )
+        return(result);
+
+    inc_x2 = 2 * inc_x ;
+    inc_y2 = 2 * inc_y ;
+
+    for ( k = 0; k < n; k++ )
+    {
+#if !defined(CONJ)
+        acc_re += ( x[ix]   * y[iy] - x[ix+1] * y[iy+1] ) ;
+        acc_im += ( x[ix+1] * y[iy] + x[ix]   * y[iy+1] ) ;
+#else
+        acc_re += ( x[ix]   * y[iy] + x[ix+1] * y[iy+1] ) ;
+        acc_im -= ( x[ix+1] * y[iy] - x[ix]   * y[iy+1] ) ;
+#endif
+        ix += inc_x2 ;
+        iy += inc_y2 ;
+    }
+
+    __real__ result = acc_re;
+    __imag__ result = acc_im;
+    return(result);
+}
+
+
diff --git a/kernel/arm/zgemv_n.c b/kernel/arm/zgemv_n.c
new file mode 100644
index 000000000..5f00c34f6
--- /dev/null
+++ b/kernel/arm/zgemv_n.c
@@ -0,0 +1,125 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3.
Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+ * * 2013/09/15 Saar
+ * *    BLASTEST float     : OK
+ * *    BLASTEST double    : OK
+ *      CTEST              : OK
+ *      TEST               : OK
+ * *
+ * **************************************************************************************/
+
+
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+    BLASLONG i;
+    BLASLONG ix,iy;
+    BLASLONG j;
+    FLOAT *a_ptr;
+    FLOAT temp_r,temp_i;
+    BLASLONG inc_x2,inc_y2;
+    BLASLONG lda2;
+    BLASLONG i2;
+
+    if( alpha_r == 0.0 && alpha_i == 0.0 ) return(0);
+
+    lda2 = 2*lda;
+
+    inc_x2 = 2 * inc_x;
+    inc_y2 = 2 * inc_y;
+
+    ix = 0;
+    a_ptr = a;
+
+#if !defined(CONJ)
+    for (j=0; j
[... text missing from this copy of the patch: the remainder of kernel/arm/zgemv_n.c, all of kernel/arm/zgemv_t.c and the beginning of kernel/arm/znrm2.c ...]
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+
+
+/*
+ * Euclidean norm of n complex elements of x, computed as
+ * scale * sqrt(ssq).  scale tracks the largest absolute component met
+ * so far and ssq the sum of squares of all components divided by
+ * scale, so no component is squared at full magnitude.
+ * Element k starts at x[2*k*inc_x]; the real part is followed directly
+ * by the imaginary part.  Components equal to zero are skipped.
+ *
+ * Returns 0.0 when n < 0 or inc_x < 1.
+ */
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG pos;
+    BLASLONG limit;
+    BLASLONG part;
+    BLASLONG inc_x2;
+    FLOAT scale = 0.0;
+    FLOAT ssq   = 1.0;
+    FLOAT mag;
+
+    if (n < 0 || inc_x < 1 )
+        return(0.0);
+
+    inc_x2 = 2 * inc_x;
+
+    /* first index past the last element that is visited */
+    limit = n * inc_x2;
+
+    for ( pos = 0; pos < limit; pos += inc_x2 )
+    {
+        /* part 0 is the real component, part 1 the imaginary one */
+        for ( part = 0; part < 2; part++ )
+        {
+            if ( x[pos+part] != 0.0 )
+            {
+                mag = ABS( x[pos+part] );
+                if ( scale < mag )
+                {
+                    /* new largest component: re-express ssq relative to it */
+                    ssq = 1 + ssq * ( scale / mag ) * ( scale / mag );
+                    scale = mag ;
+                }
+                else
+                {
+                    ssq += ( mag / scale ) * ( mag / scale );
+                }
+            }
+        }
+    }
+
+    scale = scale * sqrt( ssq );
+    return(scale);
+}
+
+
diff --git a/kernel/arm/zrot.c b/kernel/arm/zrot.c
new file mode 100644
index 000000000..4a2f37f64
--- /dev/null
+++ b/kernel/arm/zrot.c
@@ -0,0 +1,68 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*    BLASTEST float     : OK
+*    BLASTEST double    : OK
+*    CTEST              : OK
+*    TEST               : OK
+*
+**************************************************************************************/
+
+#include "common.h"
+
+/*
+ * Apply a plane rotation with real coefficients c and s to n pairs of
+ * complex elements, acting on the real and imaginary parts alike:
+ *     x[k] <- c*x[k] + s*y[k]
+ *     y[k] <- c*y[k] - s*x[k]    (using the x[k] from before the update)
+ * Element k of x starts at x[2*k*inc_x], element k of y at
+ * y[2*k*inc_y].
+ *
+ * Always returns 0; n <= 0 leaves both vectors untouched.
+ */
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
+{
+    BLASLONG k;
+    BLASLONG ix = 0, iy = 0;
+    BLASLONG inc_x2, inc_y2;
+    FLOAT rot_re, rot_im;
+
+    if ( n <= 0 )
+        return(0);
+
+    inc_x2 = 2 * inc_x ;
+    inc_y2 = 2 * inc_y ;
+
+    for ( k = 0; k < n; k++ )
+    {
+        /* the new x is computed first but stored last: y still needs the old x */
+        rot_re = c*x[ix]   + s*y[iy] ;
+        rot_im = c*x[ix+1] + s*y[iy+1] ;
+
+        y[iy]   = c*y[iy]   - s*x[ix] ;
+        y[iy+1] = c*y[iy+1] - s*x[ix+1] ;
+
+        x[ix]   = rot_re ;
+        x[ix+1] = rot_im ;
+
+        ix += inc_x2 ;
+        iy += inc_y2 ;
+    }
+
+    return(0);
+}
+
+
diff --git a/kernel/arm/zscal.c b/kernel/arm/zscal.c
new file mode 100644
index 000000000..833dc8c03
--- /dev/null
+++ b/kernel/arm/zscal.c
@@ -0,0 +1,64 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2.
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+*    BLASTEST float     : OK
+*    BLASTEST double    : OK
+*    CTEST              : OK
+*    TEST               : OK
+*
+**************************************************************************************/
+
+#include "common.h"
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+    BLASLONG i=0;
+    BLASLONG inc_x2;
+    BLASLONG ip = 0;
+    FLOAT temp;
+
+    if ( n < 0 || inc_x < 1 ) return(0);
+
+    inc_x2 = 2 * inc_x;
+    for ( i=0; i
[... text missing from this copy of the patch: the remainder of kernel/arm/zscal.c and the beginning of kernel/arm/zswap.c ...]
+
+/*
+ * Exchange n complex elements between x and y.  Element k of x starts
+ * at x[2*k*inc_x] and element k of y at y[2*k*inc_y]; the real part is
+ * followed directly by the imaginary part.
+ *
+ * Always returns 0; a negative n leaves both vectors untouched.
+ * dummy0, dummy1, dummy3, dummy4, dummy and dummy2 are not referenced.
+ */
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+    BLASLONG k;
+    BLASLONG ix = 0, iy = 0;
+    BLASLONG inc_x2, inc_y2;
+    FLOAT held_re, held_im;
+
+    if ( n < 0 )
+        return(0);
+
+    inc_x2 = 2 * inc_x;
+    inc_y2 = 2 * inc_y;
+
+    for ( k = 0; k < n; k++ )
+    {
+        held_re = x[ix] ;
+        held_im = x[ix+1] ;
+
+        x[ix]   = y[iy] ;
+        x[ix+1] = y[iy+1] ;
+
+        y[iy]   = held_re ;
+        y[iy+1] = held_im ;
+
+        ix += inc_x2 ;
+        iy += inc_y2 ;
+    }
+
+    return(0);
+}
+
+
From d13788d1b4027d9e545694b7cad71a877bcab3dd Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Sat, 28 Sep 2013 19:10:32 +0200
Subject: [PATCH 02/81] common files modified for ARM

---
 Makefile.rule   |  30 ++++-----
 Makefile.system |  37 +++--------
 common.h        |   9 ++-
 common_arm.h    | 163 ++++++++++++++++++++++++++++++++++++++++++++++++
 ctest.c         |   6 ++
 getarch.c       |  15 +++++
 param.h         |  43 +++++++++++++
 7 files changed, 255 insertions(+), 48 deletions(-)
 create mode 100644 common_arm.h

diff --git a/Makefile.rule b/Makefile.rule
index e357d5ccc..a7aa0873d 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -12,7 +12,7 @@ VERSION = 0.2.8
 
 # You can specify the target architecture, otherwise it's
 # automatically detected.
-# TARGET = PENRYN +TARGET = ARMV7 # If you want to support multiple architecture in one binary # DYNAMIC_ARCH = 1 @@ -25,20 +25,20 @@ VERSION = 0.2.8 # FC = gfortran # Even you can specify cross compiler. Meanwhile, please set HOSTCC. -# CC = x86_64-w64-mingw32-gcc -# FC = x86_64-w64-mingw32-gfortran +CC = arm-linux-gnueabihf-gcc +FC = arm-linux-gnueabihf-gfortran # If you use the cross compiler, please set this host compiler. -# HOSTCC = gcc +HOSTCC = gcc # If you need 32bit binary, define BINARY=32, otherwise define BINARY=64 -# BINARY=64 +#BINARY=32 # About threaded BLAS. It will be automatically detected if you don't # specify it. # For force setting for single threaded, specify USE_THREAD = 0 # For force setting for multi threaded, specify USE_THREAD = 1 -# USE_THREAD = 0 +USE_THREAD = 0 # If you're going to use this library with OpenMP, please comment it in. # USE_OPENMP = 1 @@ -46,7 +46,7 @@ VERSION = 0.2.8 # You can define maximum number of threads. Basically it should be # less than actual number of cores. If you don't specify one, it's # automatically detected by the the script. -# NUM_THREADS = 24 +NUM_THREADS = 4 # if you don't need generate the shared library, please comment it in. # NO_SHARED = 1 @@ -54,16 +54,12 @@ VERSION = 0.2.8 # If you don't need CBLAS interface, please comment it in. # NO_CBLAS = 1 -# If you only want CBLAS interface without installing Fortran compiler, -# please comment it in. -# ONLY_CBLAS = 1 - # If you don't need LAPACK, please comment it in. # If you set NO_LAPACK=1, the library automatically sets NO_LAPACKE=1. -# NO_LAPACK = 1 +#NO_LAPACK = 1 # If you don't need LAPACKE (C Interface to LAPACK), please comment it in. -# NO_LAPACKE = 1 +#NO_LAPACKE = 1 # If you want to use legacy threaded Level 3 implementation. # USE_SIMPLE_THREADED_LEVEL3 = 1 @@ -76,10 +72,10 @@ VERSION = 0.2.8 # Unfortunately most of kernel won't give us high quality buffer. 
# BLAS tries to find the best region before entering main function, # but it will consume time. If you don't like it, you can disable one. -# NO_WARMUP = 1 +NO_WARMUP = 1 # If you want to disable CPU/Memory affinity on Linux. -# NO_AFFINITY = 1 +NO_AFFINITY = 1 # Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers # and OS. However, the performance is low. @@ -127,13 +123,13 @@ VERSION = 0.2.8 # Common Optimization Flag; # The default -O2 is enough. -# COMMON_OPT = -O2 +COMMON_OPT = -O0 -marm -mfpu=vfpv3 -fno-omit-frame-pointer # Profiling flags COMMON_PROF = -pg # Build Debug version -# DEBUG = 1 +DEBUG = 1 # # End of user configuration diff --git a/Makefile.system b/Makefile.system index 858160fc4..e5358f65b 100644 --- a/Makefile.system +++ b/Makefile.system @@ -82,19 +82,12 @@ ifeq ($(HOSTCC), loongcc) GETARCH_FLAGS += -static endif -#if don't use Fortran, it will only compile CBLAS. -ifeq ($(ONLY_CBLAS), 1) -NO_LAPACK = 1 -else -ONLY_CBLAS = 0 -endif - # This operation is expensive, so execution should be once. 
ifndef GOTOBLAS_MAKEFILE export GOTOBLAS_MAKEFILE = 1 # Generating Makefile.conf and config.h -DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) all) +DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all) ifndef TARGET_CORE include $(TOPDIR)/Makefile.conf @@ -331,14 +324,16 @@ ifeq ($(ARCH), x86) DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO ifneq ($(NO_AVX), 1) -DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER +DYNAMIC_CORE += SANDYBRIDGE +#BULLDOZER PILEDRIVER endif endif ifeq ($(ARCH), x86_64) DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO ifneq ($(NO_AVX), 1) -DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER +DYNAMIC_CORE += SANDYBRIDGE +#BULLDOZER PILEDRIVER endif endif @@ -368,6 +363,10 @@ NO_BINARY_MODE = 1 BINARY_DEFINED = 1 endif +ifeq ($(ARCH), arm) +NO_BINARY_MODE = 1 +BINARY_DEFINED = 1 +endif # # C Compiler dependent settings # @@ -892,23 +891,6 @@ LIBZIPNAME = $(LIBNAME:.$(LIBSUFFIX)=.zip) LIBS = $(TOPDIR)/$(LIBNAME) LIBS_P = $(TOPDIR)/$(LIBNAME_P) - -LIB_COMPONENTS = BLAS -ifneq ($(NO_CBLAS), 1) -LIB_COMPONENTS += CBLAS -endif - -ifneq ($(NO_LAPACK), 1) -LIB_COMPONENTS += LAPACK -ifneq ($(NO_LAPACKE), 1) -LIB_COMPONENTS += LAPACKE -endif -endif - -ifeq ($(ONLY_CBLAS), 1) -LIB_COMPONENTS = CBLAS -endif - export OSNAME export ARCH export CORE @@ -935,7 +917,6 @@ export USE_OPENMP export CROSS export CROSS_SUFFIX export NOFORTRAN -export NO_FBLAS export EXTRALIB export CEXTRALIB export FEXTRALIB diff --git a/common.h b/common.h index 309f246e2..418ed25f5 100644 --- a/common.h +++ 
b/common.h @@ -363,6 +363,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_mips64.h" #endif +#ifdef ARCH_ARM +#include "common_arm.h" +#endif + #ifdef OS_LINUX #include "common_linux.h" #endif @@ -574,10 +578,9 @@ typedef struct { #include "common_level2.h" #include "common_level3.h" #include "common_lapack.h" - #ifdef CBLAS -# define OPENBLAS_CONST /* see comment in cblas.h */ -# include "cblas.h" +/* This header file is generated from "cblas.h" (see Makefile.prebuild). */ +#include "cblas_noconst.h" #endif #ifndef ASSEMBLER diff --git a/common_arm.h b/common_arm.h new file mode 100644 index 000000000..b61efd7c1 --- /dev/null +++ b/common_arm.h @@ -0,0 +1,163 @@ +/***************************************************************************** +Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_ARM +#define COMMON_ARM + +#define MB +#define WMB + +#define INLINE inline + +#define RETURN_BY_COMPLEX + +#ifndef ASSEMBLER + +static void INLINE blas_lock(volatile unsigned long *address){ + +// long int ret, val = 1; +/* + do { + while (*address) {YIELDING;}; + + __asm__ __volatile__( + "1: ll %0, %3\n" + " ori %2, %0, 1\n" + " sc %2, %1\n" + " beqz %2, 1b\n" + " andi %2, %0, 1\n" + " sync\n" + : "=&r" (val), "=m" (address), "=&r" (ret) + : "m" (address) + : "memory"); + + } while (ret); +*/ +} + +static inline unsigned int rpcc(void){ + unsigned long ret=0; + + return ret; +} + +static inline int blas_quickdivide(blasint x, blasint y){ + return x / y; +} + +#if defined(DOUBLE) +#define GET_IMAGE(res) __asm__ __volatile__("vstr.f64 d1, %0" : "=m"(res) : : "memory") +#else +#define GET_IMAGE(res) __asm__ __volatile__("vstr.f32 s1, %0" : "=m"(res) : : "memory") +#endif + +#define GET_IMAGE_CANCEL + +#endif + + +#ifndef F_INTERFACE +#define REALNAME ASMNAME +#else +#define REALNAME ASMFNAME +#endif + +#if defined(ASSEMBLER) && 
!defined(NEEDPARAM) + +#define PROLOGUE \ + .arm ;\ + .global REALNAME ;\ + .func REALNAME ;\ +REALNAME: + +#define EPILOGUE + +#define PROFCODE + +#endif + + +#define SEEK_ADDRESS + +#ifndef PAGESIZE +#define PAGESIZE ( 4 << 10) +#endif +#define HUGE_PAGESIZE ( 4 << 20) + +#define BUFFER_SIZE (16 << 20) + + +#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) + +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif + +#endif diff --git a/ctest.c b/ctest.c index 413519274..184416339 100644 --- a/ctest.c +++ b/ctest.c @@ -124,3 +124,9 @@ ARCH_IA64 #if defined(__LP64) || defined(__LP64__) || defined(__ptr64) || defined(__x86_64__) || defined(__amd64__) || defined(__64BIT__) BINARY_64 #endif + +#if defined(__ARM_ARCH) || defined(__ARM_ARCH_7A__) +ARCH_ARM +#endif + + diff --git a/getarch.c b/getarch.c index 3ffda6244..3264a76f6 100644 --- a/getarch.c +++ b/getarch.c @@ -679,6 +679,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "generic" #endif +#ifdef FORCE_ARMV7 +#define FORCE +#define ARCHITECTURE "ARM" +#define SUBARCHITECTURE "ARMV7" +#define SUBDIRNAME "arm" +#define ARCHCONFIG "-DARMV7 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "armv7" +#define CORENAME "ARMV7" +#else +#endif + + #ifndef FORCE #if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \ diff --git a/param.h b/param.h index 0c3df6951..79c18f7e2 100644 --- a/param.h +++ b/param.h @@ -1793,6 +1793,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SYMV_P 16 #endif + +#ifdef ARMV7 +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 2 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 2 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 64 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 24 +#define ZGEMM_DEFAULT_P 20 + +#define SGEMM_DEFAULT_Q 192 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 128 +#define ZGEMM_DEFAULT_Q 64 + +#define SGEMM_DEFAULT_R 512 +#define DGEMM_DEFAULT_R 2048 +#define CGEMM_DEFAULT_R 512 +#define ZGEMM_DEFAULT_R 512 + + + +#define SYMV_P 16 +#endif + + + #ifdef GENERIC #define SNUMOPT 2 From 69ce737cc5942831ab877ca88c410039ab997e8e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 28 Sep 2013 19:13:47 +0200 Subject: [PATCH 03/81] modified Makefile.L3 for ARM --- kernel/Makefile.L3 | 198 +++------------------------------------------ 1 file changed, 12 insertions(+), 186 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index f8152ac50..f543cd08d 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -14,6 +14,16 @@ ifeq ($(ARCH), MIPS) USE_GEMM3M = 1 endif +ifeq ($(ARCH), arm) +USE_TRMM = 1 +endif + +ifeq ($(TARGET), LOONGSON3B) +USE_TRMM = 1 +endif + + + SKERNELOBJS += \ sgemm_kernel$(TSUFFIX).$(SUFFIX) \ $(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \ @@ -498,7 +508,8 @@ $(KDIR)xgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@ -ifeq ($(TARGET), LOONGSON3B) + +ifdef USE_TRMM $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT 
-UTRANSA $< -o $@ @@ -582,24 +593,6 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ - -else - -ifdef STRMMKERNEL - -$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ - -$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ - -$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ - -$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ - - else $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ @@ -613,79 +606,17 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ -endif - -ifdef DTRMMKERNEL - -ifdef DTRMMKERNEL_LN -$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_LN) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ -else -$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ -endif - -ifdef DTRMMKERNEL_LT -$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_LT) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ -else -$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE 
-UCOMPLEX -DLEFT -DTRANSA $< -o $@ -endif - -ifdef DTRMMKERNEL_RN -$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_RN) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ -else -$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ -endif - -ifdef DTRMMKERNEL_RT -$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_RT) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ -else -$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ -endif - -else - -ifdef DTRMMKERNEL_LN -$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_LN) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ -else $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ -endif -ifdef DTRMMKERNEL_LT -$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_LT) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ -else $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ -endif -ifdef DTRMMKERNEL_RN -$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_RN) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ -else $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ -endif -ifdef DTRMMKERNEL_RT -$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_RT) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ -else $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : 
$(KERNELDIR)/$(DGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ -endif - -endif - -ifdef QTRMMKERNEL $(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ @@ -699,50 +630,6 @@ $(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ -else - -$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ - -$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ - -$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ - -$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ - -endif - -ifdef CTRMMKERNEL - -$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA 
-UCONJ -DNN $< -o $@ - -$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ - -$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ - -else - $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ @@ -767,37 +654,6 @@ $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ -endif - -ifdef ZTRMMKERNEL - -$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ - 
-$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ - -$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ - - -else - $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ @@ -821,37 +677,10 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ - -endif endif -ifdef XTRMMKERNEL -$(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ -$(KDIR)xtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)xtrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)xtrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)xtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)xtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ - 
-$(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ - -else $(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ @@ -877,9 +706,6 @@ $(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ -endif - - $(KDIR)cgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM3MKERNEL) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ From 4a474ea7dc21f68267442b89e1fa88c04d9187a0 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 29 Sep 2013 17:46:23 +0200 Subject: [PATCH 04/81] changed dgemm_kernel to use fused multiply add --- kernel/arm/dgemm_kernel_8x2_vfpv3.S | 380 +++++++++------------------- 1 file changed, 115 insertions(+), 265 deletions(-) diff --git a/kernel/arm/dgemm_kernel_8x2_vfpv3.S b/kernel/arm/dgemm_kernel_8x2_vfpv3.S index 3c474a172..e4b256832 100644 --- a/kernel/arm/dgemm_kernel_8x2_vfpv3.S +++ b/kernel/arm/dgemm_kernel_8x2_vfpv3.S @@ -26,43 +26,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** -* 2013/09/28 Saar +* 2013/09/29 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * -* 2013/09/22 Saar -* UNROLL_N 2 -* UNROLL_M 8 -* DGEMM_P 64 -* DGEMM_Q 64 -* DGEMM_R 512 -* A_PRE 128 -* B_PRE 128 * -* Performance on Odroid U2: -* -* 1 Core: 0.92 GFLOPS ATLAS: 0.81 GFLOPS -* 2 Cores: 1.83 GFLOPS ATLAS: 1.51 GFLOPS -* 3 Cores: 2.67 GFLOPS ATLAS: 1.51 GFLOPS -* 4 Cores: 3.52 GFLOPS ATLAS: 1.51 GFLOPS -* -* 2013/09/28 Saar +* 2013/09/29 Saar * UNROLL_N 2 * UNROLL_M 8 * DGEMM_P 128 * DGEMM_Q 128 * DGEMM_R 2048 -* A_PRE 128 -* B_PRE 128 -* C_PRE 32 +* A_PRE 192 +* B_PRE 32 +* C_PRE 64 * * Performance on Odroid U2: * -* 1 Core: 0.99 GFLOPS ATLAS: 0.82 GFLOPS -* 2 Cores: 1.97 GFLOPS ATLAS: 1.59 GFLOPS -* 3 Cores: 2.86 GFLOPS ATLAS: 1.59 GFLOPS -* 4 Cores: 3.79 GFLOPS ATLAS: 1.59 GFLOPS +* 1 Core: 1.48 GFLOPS ATLAS: 1.52 GFLOPS +* 2 Cores: 2.92 GFLOPS ATLAS: - GFLOPS +* 3 Cores: 4.08 GFLOPS ATLAS: - GFLOPS +* 4 Cores: 4.80 GFLOPS ATLAS: 3.80 GFLOPS **************************************************************************************/ #define ASSEMBLER @@ -108,10 +93,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define K1 r7 #define BC r12 -#define A_PRE 128 -#define A_PRE1 160 -#define B_PRE 128 -#define C_PRE 32 +#define A_PRE 192 +#define A_PRE1 224 +#define B_PRE 32 +#define C_PRE 64 /************************************************************************************** * Macro definitions @@ -138,257 +123,122 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm -.macro KERNEL8x2_START - - vldm BO!, { d24 , d25} - vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } - - vmul.f64 d26 , d0 , d24 - vmul.f64 d29 , d0 , d25 - vmul.f64 d27 , d1 , d24 - vmul.f64 d30 , d1 , d25 - vmul.f64 d28 , d2 , d24 - vmul.f64 d31 , d2 , d25 - - pld [AO , #A_PRE] - - vadd.f64 d8 , d8 , d26 - vadd.f64 d16 , d16, d29 - vadd.f64 d9 , d9 , d27 - vadd.f64 d17 , d17, d30 - vadd.f64 d10 , d10, d28 - vadd.f64 d18 , d18, d31 - - vmul.f64 d26 , d3 , d24 - vmul.f64 d27 , d4 , d24 - vmul.f64 d28 , d5 , d24 - vmul.f64 d29 , d3 , d25 - vmul.f64 d30 , d4 , d25 - vmul.f64 d31 , d5 , d25 - - pld [AO , #A_PRE1] - - vadd.f64 d11 , d11, d26 - vadd.f64 d12 , d12, d27 - vadd.f64 d13 , d13, d28 - vadd.f64 d19 , d19, d29 - vadd.f64 d20 , d20, d30 - vadd.f64 d21 , d21, d31 - - vmul.f64 d26 , d6 , d24 - vmul.f64 d27 , d7 , d24 - vmul.f64 d29 , d6 , d25 - vmul.f64 d30 , d7 , d25 - - vldm BO!, { d24 , d25} - vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } - - vadd.f64 d14 , d14, d26 - vadd.f64 d15 , d15, d27 - vadd.f64 d22 , d22, d29 - vadd.f64 d23 , d23, d30 - -.endm - - -.macro KERNEL8x2_M - - vmul.f64 d26 , d0 , d24 - vmul.f64 d29 , d0 , d25 - vmul.f64 d27 , d1 , d24 - vmul.f64 d30 , d1 , d25 - vmul.f64 d28 , d2 , d24 - vmul.f64 d31 , d2 , d25 - - pld [AO , #A_PRE] - - vadd.f64 d8 , d8 , d26 - vadd.f64 d16 , d16, d29 - vadd.f64 d9 , d9 , d27 - vadd.f64 d17 , d17, d30 - vadd.f64 d10 , d10, d28 - vadd.f64 d18 , d18, d31 - - vmul.f64 d26 , d3 , d24 - vmul.f64 d27 , d4 , d24 - vmul.f64 d28 , d5 , d24 - vmul.f64 d29 , d3 , d25 - vmul.f64 d30 , d4 , d25 - vmul.f64 d31 , d5 , d25 - - pld [AO , #A_PRE1] - - vadd.f64 d11 , d11, d26 - vadd.f64 d12 , d12, d27 - vadd.f64 d13 , d13, d28 - vadd.f64 d19 , d19, d29 - vadd.f64 d20 , d20, d30 - vadd.f64 d21 , d21, d31 - - vmul.f64 d26 , d6 , d24 - vmul.f64 d27 , d7 , d24 - vmul.f64 d29 , d6 , d25 - vmul.f64 d30 , d7 , d25 - - vldm BO!, { d24 , d25} - vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } - - vadd.f64 d14 , d14, d26 - 
vadd.f64 d15 , d15, d27 - vadd.f64 d22 , d22, d29 - vadd.f64 d23 , d23, d30 - -.endm - - -.macro KERNEL8x2_END - - vmul.f64 d26 , d0 , d24 - vmul.f64 d29 , d0 , d25 - vmul.f64 d27 , d1 , d24 - vmul.f64 d30 , d1 , d25 - vmul.f64 d28 , d2 , d24 - vmul.f64 d31 , d2 , d25 - - pld [AO , #A_PRE] - - vadd.f64 d8 , d8 , d26 - vadd.f64 d16 , d16, d29 - vadd.f64 d9 , d9 , d27 - vadd.f64 d17 , d17, d30 - vadd.f64 d10 , d10, d28 - vadd.f64 d18 , d18, d31 - - vmul.f64 d26 , d3 , d24 - vmul.f64 d27 , d4 , d24 - vmul.f64 d28 , d5 , d24 - vmul.f64 d29 , d3 , d25 - vmul.f64 d30 , d4 , d25 - vmul.f64 d31 , d5 , d25 - - pld [AO , #A_PRE1] - - vadd.f64 d11 , d11, d26 - vadd.f64 d12 , d12, d27 - vadd.f64 d13 , d13, d28 - vadd.f64 d19 , d19, d29 - vadd.f64 d20 , d20, d30 - vadd.f64 d21 , d21, d31 - - vmul.f64 d26 , d6 , d24 - vmul.f64 d27 , d7 , d24 - vmul.f64 d29 , d6 , d25 - vmul.f64 d30 , d7 , d25 - - vadd.f64 d14 , d14, d26 - vadd.f64 d15 , d15, d27 - vadd.f64 d22 , d22, d29 - vadd.f64 d23 , d23, d30 - -.endm - - .macro KERNEL8x2 - vldm BO!, { d24 , d25} - vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } - - vmul.f64 d26 , d0 , d24 - vmul.f64 d29 , d0 , d25 - vmul.f64 d27 , d1 , d24 - vmul.f64 d30 , d1 , d25 - vmul.f64 d28 , d2 , d24 - vmul.f64 d31 , d2 , d25 - + fldmiad BO!, { d24 , d25} pld [AO , #A_PRE] + fldmiad AO!, { d0, d1 } - vadd.f64 d8 , d8 , d26 - vadd.f64 d16 , d16, d29 - vadd.f64 d9 , d9 , d27 - vadd.f64 d17 , d17, d30 - vadd.f64 d10 , d10, d28 - vadd.f64 d18 , d18, d31 + fmacd d8 , d0, d24 + fldmiad AO!, { d2, d3 } + fmacd d9 , d1, d24 + fldmiad AO!, { d4, d5 } + fmacd d16 , d0, d25 + fldmiad AO!, { d6, d7 } + fmacd d17 , d1, d25 - vmul.f64 d26 , d3 , d24 - vmul.f64 d27 , d4 , d24 - vmul.f64 d28 , d5 , d24 - vmul.f64 d29 , d3 , d25 - vmul.f64 d30 , d4 , d25 - vmul.f64 d31 , d5 , d25 + fmacd d10 , d2, d24 + fmacd d11 , d3, d24 + fmacd d18 , d2, d25 + fmacd d19 , d3, d25 - pld [AO , #A_PRE1] + pld [AO , #A_PRE-32] - vadd.f64 d11 , d11, d26 - vadd.f64 d12 , d12, d27 - 
vadd.f64 d13 , d13, d28 - vadd.f64 d19 , d19, d29 - vadd.f64 d20 , d20, d30 - vadd.f64 d21 , d21, d31 + fmacd d12 , d4, d24 + fmacd d13 , d5, d24 + fmacd d20 , d4, d25 + fmacd d21 , d5, d25 - vmul.f64 d26 , d6 , d24 - vmul.f64 d27 , d7 , d24 - vmul.f64 d29 , d6 , d25 - vmul.f64 d30 , d7 , d25 - - vadd.f64 d14 , d14, d26 - vadd.f64 d15 , d15, d27 - vadd.f64 d22 , d22, d29 - vadd.f64 d23 , d23, d30 + fmacd d14 , d6, d24 + fmacd d15 , d7, d24 + fmacd d22 , d6, d25 + fmacd d23 , d7, d25 .endm .macro SAVE8x2 vldr d0, ALPHA - vldm CO2, { d24, d25, d26 , d27 , d28 , d29 , d30 , d31 } + vldm CO1, { d24, d25, d26 , d27 , d28 , d29 , d30 , d31 } - vmul.f64 d8 , d0 , d8 - vmul.f64 d9 , d0 , d9 - vmul.f64 d10, d0 , d10 - vmul.f64 d11, d0 , d11 - vmul.f64 d12, d0 , d12 - vmul.f64 d13, d0 , d13 - vmul.f64 d14, d0 , d14 - vmul.f64 d15, d0 , d15 + vmla.f64 d24, d0 , d8 + vmla.f64 d25, d0 , d9 + vmla.f64 d26, d0 , d10 + vmla.f64 d27, d0 , d11 + vmla.f64 d28, d0 , d12 + vmla.f64 d29, d0 , d13 + vmla.f64 d30, d0 , d14 + vmla.f64 d31, d0 , d15 + + vstm CO1!, { d24, d25, d26 , d27 , d28 , d29 , d30 , d31 } + vldm CO2, { d8 , d9 , d10 , d11 , d12, d13 , d14 , d15 } - vmul.f64 d16, d0 , d16 - vmul.f64 d17, d0 , d17 - vmul.f64 d18, d0 , d18 - vmul.f64 d19, d0 , d19 - vmul.f64 d20, d0 , d20 - vmul.f64 d21, d0 , d21 - vmul.f64 d22, d0 , d22 - vmul.f64 d23, d0 , d23 + vmla.f64 d8 , d0 , d16 + vmla.f64 d9 , d0 , d17 + vmla.f64 d10, d0 , d18 + vmla.f64 d11, d0 , d19 + vmla.f64 d12, d0 , d20 + vmla.f64 d13, d0 , d21 + vmla.f64 d14, d0 , d22 + vmla.f64 d15, d0 , d23 - vldm CO1, { d0 , d1 , d2 , d3 , d4 , d5 , d6 , d7 } - - vadd.f64 d16, d16, d24 - vadd.f64 d17, d17, d25 - vadd.f64 d18, d18, d26 - vadd.f64 d19, d19, d27 - - vadd.f64 d20, d20, d28 - vadd.f64 d21, d21, d29 - vadd.f64 d22, d22, d30 - vadd.f64 d23, d23, d31 - - vadd.f64 d8 , d8 , d0 - vadd.f64 d9 , d9 , d1 - vadd.f64 d10, d10, d2 - vadd.f64 d11, d11, d3 - - vadd.f64 d12, d12, d4 - vadd.f64 d13, d13, d5 - vadd.f64 d14, d14, d6 - 
vadd.f64 d15, d15, d7 - - vstm CO2!, { d16, d17, d18 , d19 , d20 , d21 , d22 , d23 } - vstm CO1!, { d8 , d9 , d10 , d11 , d12, d13 , d14 , d15 } + vstm CO2!, { d8 , d9 , d10 , d11 , d12, d13 , d14 , d15 } .endm +.macro SAVE8x2_BAD + + vldr d0, ALPHA + vldm CO2, { d24, d25, d26 , d27 , d28 , d29 , d30 , d31 } + + vmul.f64 d8 , d0 , d8 + vmul.f64 d9 , d0 , d9 + vmul.f64 d10, d0 , d10 + vmul.f64 d11, d0 , d11 + vmul.f64 d12, d0 , d12 + vmul.f64 d13, d0 , d13 + vmul.f64 d14, d0 , d14 + vmul.f64 d15, d0 , d15 + + vmul.f64 d16, d0 , d16 + vmul.f64 d17, d0 , d17 + vmul.f64 d18, d0 , d18 + vmul.f64 d19, d0 , d19 + vmul.f64 d20, d0 , d20 + vmul.f64 d21, d0 , d21 + vmul.f64 d22, d0 , d22 + vmul.f64 d23, d0 , d23 + + vldm CO1, { d0 , d1 , d2 , d3 , d4 , d5 , d6 , d7 } + + vadd.f64 d16, d16, d24 + vadd.f64 d17, d17, d25 + vadd.f64 d18, d18, d26 + vadd.f64 d19, d19, d27 + + vadd.f64 d20, d20, d28 + vadd.f64 d21, d21, d29 + vadd.f64 d22, d22, d30 + vadd.f64 d23, d23, d31 + + vadd.f64 d8 , d8 , d0 + vadd.f64 d9 , d9 , d1 + vadd.f64 d10, d10, d2 + vadd.f64 d11, d11, d3 + + vadd.f64 d12, d12, d4 + vadd.f64 d13, d13, d5 + vadd.f64 d14, d14, d6 + vadd.f64 d15, d15, d7 + + vstm CO2!, { d16, d17, d18 , d19 , d20 , d21 , d22 , d23 } + vstm CO1!, { d8 , d9 , d10 , d11 , d12, d13 , d14 , d15 } + +.endm + + /*************************************************************************************/ @@ -814,18 +664,18 @@ _L2_M8_20: _L2_M8_22: pld [BO , #B_PRE] - KERNEL8x2_START - KERNEL8x2_M + KERNEL8x2 + KERNEL8x2 pld [BO , #B_PRE] - KERNEL8x2_M - KERNEL8x2_M + KERNEL8x2 + KERNEL8x2 pld [BO , #B_PRE] - KERNEL8x2_M - KERNEL8x2_M + KERNEL8x2 + KERNEL8x2 pld [BO , #B_PRE] - KERNEL8x2_M - KERNEL8x2_END + KERNEL8x2 + KERNEL8x2 subs L, L, #1 bgt _L2_M8_22 From 9965d4800585467b512e2a972f326a0ff133835e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 29 Sep 2013 18:55:21 +0200 Subject: [PATCH 05/81] added Makefile.arm --- Makefile.arm | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 
Makefile.arm diff --git a/Makefile.arm b/Makefile.arm new file mode 100644 index 000000000..05ea9c679 --- /dev/null +++ b/Makefile.arm @@ -0,0 +1,3 @@ +ifdef BINARY64 +else +endif From 22a8fcc4b7d4c6feb433229f75316449b8e97fdb Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 29 Sep 2013 19:42:33 +0200 Subject: [PATCH 06/81] add modified c_check perl program --- c_check | 2 ++ 1 file changed, 2 insertions(+) diff --git a/c_check b/c_check index d5fe59f75..c1cdd59c4 100644 --- a/c_check +++ b/c_check @@ -63,6 +63,7 @@ $architecture = mips64 if ($data =~ /ARCH_MIPS64/); $architecture = alpha if ($data =~ /ARCH_ALPHA/); $architecture = sparc if ($data =~ /ARCH_SPARC/); $architecture = ia64 if ($data =~ /ARCH_IA64/); +$architecture = arm if ($data =~ /ARCH_ARM/); $defined = 0; @@ -149,6 +150,7 @@ $architecture = mips64 if ($data =~ /ARCH_MIPS64/); $architecture = alpha if ($data =~ /ARCH_ALPHA/); $architecture = sparc if ($data =~ /ARCH_SPARC/); $architecture = ia64 if ($data =~ /ARCH_IA64/); +$architecture = arm if ($data =~ /ARCH_ARM/); $binformat = bin32; $binformat = bin64 if ($data =~ /BINARY_64/); From 1c63180bb6820fce44ab81b62cc056dd9d35ca9d Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 30 Sep 2013 17:31:23 +0200 Subject: [PATCH 07/81] updated dgemm_kernel_8x2_vfpv3.S --- kernel/arm/dgemm_kernel_8x2_vfpv3.S | 123 ++++++++++++++++++---------- 1 file changed, 81 insertions(+), 42 deletions(-) diff --git a/kernel/arm/dgemm_kernel_8x2_vfpv3.S b/kernel/arm/dgemm_kernel_8x2_vfpv3.S index e4b256832..6c1b0f5fd 100644 --- a/kernel/arm/dgemm_kernel_8x2_vfpv3.S +++ b/kernel/arm/dgemm_kernel_8x2_vfpv3.S @@ -26,28 +26,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** -* 2013/09/29 Saar +* 2013/09/30 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * * -* 2013/09/29 Saar +* 2013/09/30 Saar * UNROLL_N 2 * UNROLL_M 8 -* DGEMM_P 128 -* DGEMM_Q 128 -* DGEMM_R 2048 +* DGEMM_P 64 +* DGEMM_Q 64 +* DGEMM_R 512 * A_PRE 192 * B_PRE 32 * C_PRE 64 * * Performance on Odroid U2: * -* 1 Core: 1.48 GFLOPS ATLAS: 1.52 GFLOPS -* 2 Cores: 2.92 GFLOPS ATLAS: - GFLOPS -* 3 Cores: 4.08 GFLOPS ATLAS: - GFLOPS -* 4 Cores: 4.80 GFLOPS ATLAS: 3.80 GFLOPS +* 1 Core: 1.42 GFLOPS ATLAS: 1.58 GFLOPS +* 2 Cores: 2.81 GFLOPS ATLAS: - GFLOPS +* 3 Cores: 4.05 GFLOPS ATLAS: - GFLOPS +* 4 Cores: 5.40 GFLOPS ATLAS: 3.88 GFLOPS **************************************************************************************/ #define ASSEMBLER @@ -128,32 +128,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL8x2 fldmiad BO!, { d24 , d25} - pld [AO , #A_PRE] - fldmiad AO!, { d0, d1 } - + fldd d0, [ AO ] fmacd d8 , d0, d24 - fldmiad AO!, { d2, d3 } - fmacd d9 , d1, d24 - fldmiad AO!, { d4, d5 } + fldd d1, [ AO , #8 ] fmacd d16 , d0, d25 - fldmiad AO!, { d6, d7 } + fldd d2, [ AO , #16 ] + fmacd d9 , d1, d24 fmacd d17 , d1, d25 - + fldd d3, [ AO , #24 ] fmacd d10 , d2, d24 - fmacd d11 , d3, d24 fmacd d18 , d2, d25 + fldd d4, [ AO , #32 ] + fmacd d11 , d3, d24 + pld [AO , #A_PRE] fmacd d19 , d3, d25 - - pld [AO , #A_PRE-32] + fldd d5, [ AO , #40 ] fmacd d12 , d4, d24 - fmacd d13 , d5, d24 fmacd d20 , d4, d25 + fldd d6, [ AO , #48 ] + fmacd d13 , d5, d24 fmacd d21 , d5, d25 + fldd d7, [ AO , #56 ] fmacd d14 , d6, d24 - fmacd d15 , d7, d24 fmacd d22 , d6, d25 + pld [AO , #A_PRE+32] + fmacd d15 , d7, d24 + add AO, AO, #64 fmacd d23 , d7, d25 .endm @@ -161,30 +163,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro SAVE8x2 vldr d0, ALPHA - vldm CO1, { d24, d25, d26 , d27 , d28 , d29 , d30 , d31 } - vmla.f64 d24, d0 , d8 - vmla.f64 d25, d0 , d9 - vmla.f64 d26, d0 , d10 - vmla.f64 d27, d0 , d11 - vmla.f64 d28, d0 , d12 - vmla.f64 d29, d0 , d13 - vmla.f64 d30, d0 , d14 - vmla.f64 d31, d0 , d15 + fldd d24, [CO1] + fldd d25, [CO1, #8 ] + + fmacd d24, d0 , d8 + fldd d8 , [CO2] + fldd d26, [CO1, #16] + fmacd d25, d0 , d9 + fldd d9 , [CO2, #8 ] + fldd d27, [CO1, #24] + fmacd d26, d0 , d10 + fldd d10 , [CO2, #16 ] + fldd d28, [CO1, #32] + fmacd d27, d0 , d11 + fldd d11 , [CO2, #24 ] + fldd d29, [CO1, #40] + fmacd d28, d0 , d12 + fldd d12 , [CO2, #32 ] + fldd d30, [CO1, #48] + fmacd d29, d0 , d13 + fldd d13 , [CO2, #40 ] + fldd d31, [CO1, #56] + fmacd d30, d0 , d14 + fldd d14 , [CO2, #48 ] + fmacd d31, d0 , d15 + fldd d15 , [CO2, #56 ] - vstm CO1!, { d24, d25, d26 , d27 , d28 , d29 , d30 , d31 } - vldm CO2, { d8 , d9 , d10 , d11 , d12, d13 , d14 , d15 } - vmla.f64 d8 , d0 , d16 - vmla.f64 d9 , d0 , d17 - vmla.f64 d10, d0 , d18 - vmla.f64 d11, d0 , d19 - vmla.f64 d12, d0 , d20 - vmla.f64 d13, d0 , d21 - vmla.f64 d14, d0 , d22 - vmla.f64 d15, d0 , d23 + fmacd d8 , d0 , d16 + fstd d24, [CO1] + fmacd d9 , d0 , d17 + fstd d25, [CO1, #8 ] + fstd d8 , [CO2] + fmacd d10, d0 , d18 + fstd d26, [CO1, #16 ] + fstd d9 , [CO2, #8 ] + fmacd d11, d0 , d19 + fstd d27, [CO1, #24 ] + fstd d10, [CO2, #16 ] + fmacd d12, d0 , d20 + fstd d28, [CO1, #32 ] + fstd d11, [CO2, #24 ] + fmacd d13, d0 , d21 + fstd d29, [CO1, #40 ] + fstd d12, [CO2, #32 ] + fmacd d14, d0 , d22 + fstd d30, [CO1, #48 ] + fstd d13, [CO2, #40 ] + fmacd d15, d0 , d23 + fstd d31, [CO1, #56 ] + fstd d14, [CO2, #48 ] + + add CO1, CO1, #64 + fstd d15, [CO2, #56 ] + add CO2, CO2, #64 - vstm CO2!, { d8 , d9 , d10 , d11 , d12, d13 , d14 , d15 } .endm @@ -643,6 +677,9 @@ _L2_BEGIN: str r3 , C // store C ldr AO, A // AO = A + pld [AO , #A_PRE-96] + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] _L2_M8_BEGIN: @@ -653,7 +690,9 @@ _L2_M8_BEGIN: 
_L2_M8_20: pld [CO1, #C_PRE] + pld [CO1, #C_PRE+32] pld [CO2, #C_PRE] + pld [CO2, #C_PRE+32] INIT8x2 mov BO, BC From 93f1074dd4100835962d10621a7fcd2fe63e5b0e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 30 Sep 2013 18:03:56 +0200 Subject: [PATCH 08/81] changed some values for arm --- param.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/param.h b/param.h index 79c18f7e2..a2ed09d6f 100644 --- a/param.h +++ b/param.h @@ -1815,17 +1815,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 64 -#define DGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 64 #define CGEMM_DEFAULT_P 24 #define ZGEMM_DEFAULT_P 20 #define SGEMM_DEFAULT_Q 192 -#define DGEMM_DEFAULT_Q 128 +#define DGEMM_DEFAULT_Q 64 #define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_Q 64 #define SGEMM_DEFAULT_R 512 -#define DGEMM_DEFAULT_R 2048 +#define DGEMM_DEFAULT_R 512 #define CGEMM_DEFAULT_R 512 #define ZGEMM_DEFAULT_R 512 From e0b968c3a73ab797211789def5873d03b08ee9a4 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 5 Oct 2013 12:59:44 +0200 Subject: [PATCH 09/81] Changed kernels for dgemm and dtrmm --- kernel/arm/KERNEL.ARMV7 | 16 +- kernel/arm/dgemm_kernel_4x4_vfpv3.S | 1589 ++++++++++++++++++++++ kernel/arm/dtrmm_kernel_4x4_vfpv3.S | 1953 +++++++++++++++++++++++++++ param.h | 8 +- 4 files changed, 3554 insertions(+), 12 deletions(-) create mode 100644 kernel/arm/dgemm_kernel_4x4_vfpv3.S create mode 100644 kernel/arm/dtrmm_kernel_4x4_vfpv3.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index a60701002..8c69ad5cf 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -81,7 +81,7 @@ CGEMVTKERNEL = zgemv_t.c ZGEMVTKERNEL = zgemv_t.c STRMMKERNEL = ../generic/trmmkernel_2x2.c -DTRMMKERNEL = dtrmm_kernel_8x2_vfpv3.S +DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c @@ -93,13 +93,13 @@ 
SGEMMOTCOPYOBJ = sgemm_otcopy.o #DGEMMKERNEL = ../generic/gemmkernel_2x2.c #DGEMMKERNEL = dgemm_kernel_4x2_vfpv2.S -DGEMMKERNEL = dgemm_kernel_8x2_vfpv3.S -DGEMMINCOPY = ../generic/gemm_ncopy_8.c -DGEMMITCOPY = ../generic/gemm_tcopy_8.c -DGEMMONCOPY = ../generic/gemm_ncopy_2.c -DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMINCOPYOBJ = dgemm_incopy.o -DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o diff --git a/kernel/arm/dgemm_kernel_4x4_vfpv3.S b/kernel/arm/dgemm_kernel_4x4_vfpv3.S new file mode 100644 index 000000000..4c30e108c --- /dev/null +++ b/kernel/arm/dgemm_kernel_4x4_vfpv3.S @@ -0,0 +1,1589 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/10/05 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/03 Saar +* UNROLL_N 4 +* UNROLL_M 4 +* DGEMM_P 128 +* DGEMM_Q 96 +* DGEMM_R 512 +* A_PRE 64 +* B_PRE 64 +* C_PRE 64 +* +* Performance on Odroid U2: +* +* 1 Core: 1.55 GFLOPS ATLAS: 1.59 GFLOPS +* 2 Cores: 3.10 GFLOPS ATLAS: - GFLOPS +* 3 Cores: 4.54 GFLOPS ATLAS: - GFLOPS +* 4 Cores: 5.67 GFLOPS ATLAS: 3.88 GFLOPS +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 252 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA d0 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define C [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-276 ] + +#define B [fp, #4 ] +#define OLD_C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 64 +#define B_PRE 64 +#define 
C_PRE 64 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x4 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d22, d16 + vmov.f64 d23, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d26, d16 + vmov.f64 d27, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + vmov.f64 d30, d16 + vmov.f64 d31, d16 + +.endm + +.macro KERNEL4x4_I + pld [ BO , #B_PRE ] + + fldd d8 , [ BO ] + + pld [ AO , #A_PRE ] + fldmiad AO!, { d0 - d1} + + fmuld d16 , d0, d8 + fldmiad AO!, { d2 - d3} + fmuld d17 , d1, d8 + fldd d9 , [ BO, #8 ] + fmuld d18 , d2, d8 + fldd d10, [ BO, #16 ] + fmuld d19 , d3, d8 + + fldd d11, [ BO, #24 ] + fmuld d20 , d0, d9 + fmuld d21 , d1, d9 + add BO , BO, #32 + fmuld d22 , d2, d9 + + fldd d12, [ BO ] + fmuld d23 , d3, d9 + + fmuld d24 , d0, d10 + fldmiad AO!, { d4 - d5 } + fmuld d25 , d1, d10 + fmuld d26 , d2, d10 + fldmiad AO!, { d6 - d7 } + fmuld d27 , d3, d10 + + fldd d13, [ BO, #8 ] + fmuld d28 , d0, d11 + fldd d14, [ BO, #16 ] + fmuld d29 , d1, d11 + fldd d15, [ BO, #24 ] + fmuld d30 , d2, d11 + fmuld d31 , d3, d11 + add BO , BO, #32 + +.endm + + + +.macro KERNEL4x4_S + pld [ BO , #B_PRE ] + + fldd d8 , [ BO ] + + pld [ AO , #A_PRE ] + fldmiad AO!, { d0 - d1} + + fmacd d16 , d0, d8 + fldmiad AO!, { d2 - d3} + fmacd d17 , d1, d8 + fldd d9 , [ BO, #8 ] + fmacd d18 , d2, d8 + fldd d10, [ BO, #16 ] + fmacd d19 , d3, d8 + + fldd d11, [ BO, #24 ] + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + add BO , BO, #32 + fmacd d22 , d2, d9 + + fldd d12, [ BO ] + fmacd d23 , d3, d9 + + fmacd d24 , d0, d10 + fldmiad AO!, { d4 - d5 } + fmacd d25 , d1, d10 + fmacd d26 , d2, d10 + fldmiad AO!, { d6 - d7 } + fmacd d27 , d3, d10 + + fldd d13, [ BO, #8 ] + fmacd d28 , d0, d11 + fldd d14, [ BO, #16 ] + fmacd d29 , d1, d11 + fldd d15, 
[ BO, #24 ] + fmacd d30 , d2, d11 + fmacd d31 , d3, d11 + add BO , BO, #32 + +.endm + + + +.macro KERNEL4x4_M1 + + fmacd d16 , d4, d12 + pld [ AO , #A_PRE ] + fmacd d17 , d5, d12 + fmacd d18 , d6, d12 + pld [ BO , #B_PRE ] + fmacd d19 , d7, d12 + + fmacd d20 , d4, d13 + fldd d8 , [ BO ] + fmacd d21 , d5, d13 + fmacd d22 , d6, d13 + fldmiad AO!, { d0 - d1 } + fmacd d23 , d7, d13 + + fmacd d24 , d4, d14 + fldmiad AO!, { d2 - d3 } + fmacd d25 , d5, d14 + fldd d9 , [ BO, #8 ] + fmacd d26 , d6, d14 + fldd d10, [ BO, #16 ] + fmacd d27 , d7, d14 + + fldd d11, [ BO, #24 ] + fmacd d28 , d4, d15 + fmacd d29 , d5, d15 + fmacd d30 , d6, d15 + add BO , BO, #32 + fmacd d31 , d7, d15 + +.endm + +.macro KERNEL4x4_M2 + + + fmacd d16 , d0, d8 + pld [ AO , #A_PRE ] + fmacd d17 , d1, d8 + pld [ BO , #B_PRE ] + fmacd d18 , d2, d8 + fldd d12, [ BO ] + fmacd d19 , d3, d8 + + fmacd d20 , d0, d9 + fldmiad AO!, { d4 - d5 } + fmacd d21 , d1, d9 + fmacd d22 , d2, d9 + fldmiad AO!, { d6 - d7 } + fmacd d23 , d3, d9 + + fmacd d24 , d0, d10 + fmacd d25 , d1, d10 + fmacd d26 , d2, d10 + fmacd d27 , d3, d10 + + fldd d13, [ BO, #8 ] + fmacd d28 , d0, d11 + fldd d14, [ BO, #16 ] + fmacd d29 , d1, d11 + fldd d15, [ BO, #24 ] + fmacd d30 , d2, d11 + fmacd d31 , d3, d11 + add BO , BO, #32 + +.endm + + +.macro KERNEL4x4_E + + fmacd d16 , d4, d12 + pld [ AO , #A_PRE ] + fmacd d17 , d5, d12 + fmacd d18 , d6, d12 + pld [ BO , #B_PRE ] + fmacd d19 , d7, d12 + + fmacd d20 , d4, d13 + fmacd d21 , d5, d13 + fmacd d22 , d6, d13 + fmacd d23 , d7, d13 + + fmacd d24 , d4, d14 + fmacd d25 , d5, d14 + fmacd d26 , d6, d14 + fmacd d27 , d7, d14 + + fmacd d28 , d4, d15 + fmacd d29 , d5, d15 + fmacd d30 , d6, d15 + fmacd d31 , d7, d15 + +.endm + + + + +.macro KERNEL4x4_SUB + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + fldd d8 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fmacd d16 , d0, d8 + fldd d9 , [ BO, #8 ] + fmacd d17 , d1, d8 + fldd d10, [ BO, #16 ] + 
fmacd d18 , d2, d8 + fldd d11, [ BO, #24 ] + fmacd d19 , d3, d8 + + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + fmacd d22 , d2, d9 + fmacd d23 , d3, d9 + + fmacd d24 , d0, d10 + fmacd d25 , d1, d10 + fmacd d26 , d2, d10 + fmacd d27 , d3, d10 + + fmacd d28 , d0, d11 + fmacd d29 , d1, d11 + add AO , AO, #32 + fmacd d30 , d2, d11 + add BO , BO, #32 + fmacd d31 , d3, d11 + +.endm + +.macro SAVE4x4 + pld [ CO1 , #C_PRE ] + + ldr r3 , LDC + add CO2 , CO1, r3 + fldd d0, ALPHA + add r4 , CO2, r3 + pld [ CO2 , #C_PRE ] + + fldmiad CO1, { d8 - d11 } + pld [ r4 , #C_PRE ] + + fmacd d8 , d0 , d16 + fldd d12, [CO2] + fmacd d9 , d0 , d17 + fldd d13, [CO2, #8 ] + fmacd d10, d0 , d18 + fldd d14, [CO2, #16 ] + fmacd d11, d0 , d19 + fldd d15, [CO2, #24 ] + + fmacd d12, d0 , d20 + fstd d8 , [CO1] + fmacd d13, d0 , d21 + fstd d9 , [CO1, #8 ] + fmacd d14, d0 , d22 + fstd d10, [CO1, #16 ] + fmacd d15, d0 , d23 + fstd d11, [CO1, #24 ] + + fldmiad r4, { d8 - d11 } + + fmacd d8 , d0 , d24 + fstd d12, [CO2] + fmacd d9 , d0 , d25 + fstd d13, [CO2, #8 ] + fmacd d10, d0 , d26 + fstd d14, [CO2, #16 ] + fmacd d11, d0 , d27 + fstd d15, [CO2, #24 ] + + add CO2, r4 , r3 + + pld [ CO2 , #C_PRE ] + + fldmiad CO2, { d12 - d15 } + + fstd d8 , [r4 ] + fmacd d12, d0 , d28 + fstd d9 , [r4 , #8 ] + fmacd d13, d0 , d29 + fstd d10, [r4 , #16 ] + fmacd d14, d0 , d30 + fstd d11, [r4 , #24 ] + fmacd d15, d0 , d31 + + fstmiad CO2, { d12 - d15 } + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT2x4 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + +.endm + + + +.macro KERNEL2x4_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + + fmacd d24 , 
d0, d10 + fmacd d25 , d1, d10 + + fmacd d28 , d0, d11 + fmacd d29 , d1, d11 + add AO , AO, #16 + add BO , BO, #32 + +.endm + +.macro SAVE2x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + add r4 , CO2, r3 + + fldd d0, ALPHA + + fldd d8 , [CO1] + fldd d9 , [CO1, #8 ] + + fmacd d8 , d0 , d16 + fmacd d9 , d0 , d17 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + + fldd d12, [CO2] + fldd d13, [CO2, #8 ] + + fmacd d12, d0 , d20 + fmacd d13, d0 , d21 + + fstd d12, [CO2] + fstd d13, [CO2, #8 ] + + fldd d8 , [r4 ] + fldd d9 , [r4 , #8 ] + + fmacd d8 , d0 , d24 + fmacd d9 , d0 , d25 + + fstd d8 , [r4 ] + fstd d9 , [r4 , #8 ] + + add CO2, r4 , r3 + + fldd d12, [CO2] + fldd d13, [CO2, #8 ] + + fmacd d12, d0 , d28 + fmacd d13, d0 , d29 + + fstd d12, [CO2] + fstd d13, [CO2, #8 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT1x4 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d20, d16 + vmov.f64 d24, d16 + vmov.f64 d28, d16 + +.endm + + + +.macro KERNEL1x4_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + fldd d0 , [ AO ] + + fmacd d16 , d0, d8 + fmacd d20 , d0, d9 + fmacd d24 , d0, d10 + fmacd d28 , d0, d11 + + add AO , AO, #8 + add BO , BO, #32 + +.endm + +.macro SAVE1x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + add r4 , CO2, r3 + + fldd d0, ALPHA + + fldd d8 , [CO1] + fmacd d8 , d0 , d16 + fstd d8 , [CO1] + + fldd d12, [CO2] + fmacd d12, d0 , d20 + fstd d12, [CO2] + + fldd d8 , [r4 ] + fmacd d8 , d0 , d24 + fstd d8 , [r4 ] + + add CO2, r4 , r3 + + fldd d12, [CO2] + fmacd d12, d0 , d28 + fstd d12, [CO2] + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d22, d16 + 
vmov.f64 d23, d16 + +.endm + + + +.macro KERNEL4x2_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + fmacd d18 , d2, d8 + fmacd d19 , d3, d8 + + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + fmacd d22 , d2, d9 + fmacd d23 , d3, d9 + + add AO , AO, #32 + add BO , BO, #16 + +.endm + +.macro SAVE4x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fldd d8 , [CO1] + fldd d9 , [CO1, #8 ] + fldd d10, [CO1, #16 ] + fldd d11, [CO1, #24 ] + + fmacd d8 , d0 , d16 + fmacd d9 , d0 , d17 + fmacd d10, d0 , d18 + fmacd d11, d0 , d19 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + fstd d10, [CO1, #16 ] + fstd d11, [CO1, #24 ] + + fldd d12, [CO2] + fldd d13, [CO2, #8 ] + fldd d14, [CO2, #16 ] + fldd d15, [CO2, #24 ] + + fmacd d12, d0 , d20 + fmacd d13, d0 , d21 + fmacd d14, d0 , d22 + fmacd d15, d0 , d23 + + fstd d12, [CO2] + fstd d13, [CO2, #8 ] + fstd d14, [CO2, #16 ] + fstd d15, [CO2, #24 ] + + add CO1, CO1, #32 + +.endm + + +/******************************************************************************/ + +.macro INIT2x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + +.endm + + + +.macro KERNEL2x2_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + + add AO , AO, #16 + add BO , BO, #16 + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fldd d8 , [CO1] + fldd d9 , [CO1, #8 ] + + fmacd d8 , d0 , d16 + fmacd d9 , d0 , d17 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + + fldd d12, [CO2] + fldd d13, [CO2, #8 ] + + fmacd d12, d0 , d20 + fmacd d13, d0 , d21 + + fstd d12, [CO2] + fstd d13, [CO2, #8 ] + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f64 
d16 , d16 , d16 + vmov.f64 d20, d16 + +.endm + + + +.macro KERNEL1x2_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fldd d0 , [ AO ] + fmacd d16 , d0, d8 + fmacd d20 , d0, d9 + + add AO , AO, #8 + add BO , BO, #16 + +.endm + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fldd d8 , [CO1] + fmacd d8 , d0 , d16 + fstd d8 , [CO1] + + fldd d12, [CO2] + fmacd d12, d0 , d20 + fstd d12, [CO2] + + add CO1, CO1, #8 // one double stored per column: advance C by 1 row + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + +.endm + + + +.macro KERNEL4x1_SUB + + fldd d8 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + fmacd d18 , d2, d8 + fmacd d19 , d3, d8 + + add AO , AO, #32 + add BO , BO, #8 + +.endm + +.macro SAVE4x1 + + + fldd d0, ALPHA + + fldd d8 , [CO1] + fldd d9 , [CO1, #8 ] + fldd d10, [CO1, #16 ] + fldd d11, [CO1, #24 ] + + fmacd d8 , d0 , d16 + fmacd d9 , d0 , d17 + fmacd d10, d0 , d18 + fmacd d11, d0 , d19 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + fstd d10, [CO1, #16 ] + fstd d11, [CO1, #24 ] + + add CO1, CO1, #32 + +.endm + + + + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + +.endm + + + +.macro KERNEL2x1_SUB + + fldd d8 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + + add AO , AO, #16 + add BO , BO, #8 + +.endm + +.macro SAVE2x1 + + + fldd d0, ALPHA + + fldd d8 , [CO1] + fldd d9 , [CO1, #8 ] + + fmacd d8 , d0 , d16 + fmacd d9 , d0 , d17 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro
INIT1x1 + + vsub.f64 d16 , d16 , d16 + +.endm + + + +.macro KERNEL1x1_SUB + + fldd d8 , [ BO ] + + fldd d0 , [ AO ] + + fmacd d16 , d0, d8 + + add AO , AO, #8 + add BO , BO, #8 + +.endm + +.macro SAVE1x1 + + + fldd d0, ALPHA + + fldd d8 , [CO1] + fmacd d8 , d0 , d16 + fstd d8 , [CO1] + + add CO1, CO1, #8 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 8 + str r3, LDC + + ldr r3, OLD_C + str r3, C + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #2 // J = J / 4 + ble _L2_BEGIN + +_L4_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #2 // LDC * 4 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L4_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L4_M2_BEGIN + +_L4_M4_20: + + + mov BO, BC + asrs L , K1, #5 // L = L / 32 + ble _L4_M4_40 + .align 5 + + + + KERNEL4x4_I + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_E + + subs L, L, #1 + ble _L4_M4_41 + +_L4_M4_22: + + KERNEL4x4_S + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + 
KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_E + + subs L, L, #1 + ble _L4_M4_41 + + b _L4_M4_22 + + +_L4_M4_40: + + INIT4x4 + +_L4_M4_41: + + tst K1, #31 + ble _L4_M4_100 + + tst K1, #16 + ble _L4_M4_44 + + KERNEL4x4_S + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_E + +_L4_M4_44: + + ands L , K1, #15 // L = K % 16 + ble _L4_M4_100 + +_L4_M4_46: + + KERNEL4x4_SUB + + subs L, L, #1 + bgt _L4_M4_46 + +_L4_M4_100: + + SAVE4x4 + +_L4_M4_END: + + subs I, I, #1 + bgt _L4_M4_20 + + +_L4_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L4_END + + tst I, #2 // M & 2: one 2-row block left + ble _L4_M1_BEGIN + +_L4_M2_20: + + INIT2x4 + + mov BO, BC + asrs L , K1, #3 // L = K / 8 + ble _L4_M2_40 + +_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs L, L, #1 + bgt _L4_M2_22 + + +_L4_M2_40: + + ands L , K1, #7 // L = K % 8 + ble _L4_M2_100 + +_L4_M2_42: + + KERNEL2x4_SUB + + subs L, L, #1 + bgt _L4_M2_42 + +_L4_M2_100: + + SAVE2x4 + +_L4_M2_END: + + +_L4_M1_BEGIN: + + tst I, #1 // M & 1: one last row left + ble _L4_END + +_L4_M1_20: + + INIT1x4 + + mov BO, BC + asrs L , K1, #3 // L = K / 8 + ble _L4_M1_40 + +_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs L, L, #1 + bgt _L4_M1_22 + + +_L4_M1_40: + + ands L , K1, #7 // L = K % 8 + ble _L4_M1_100 + +_L4_M1_42: + + KERNEL1x4_SUB + + subs L, L,
#1 + bgt _L4_M1_42 + +_L4_M1_100: + + SAVE1x4 + + +_L4_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #5 // k * 4 * 8 + add r3, r3, r4 // B = B + K * 4 * 8 + mov BC, r3 + + subs J , #1 // j-- + bgt _L4_BEGIN + + + +/*********************************************************************************************/ + +_L2_BEGIN: + + ldr J , N + tst J , #3 + ble _L999 + + tst J , #2 + ble _L1_BEGIN + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +_L2_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L2_M2_BEGIN + +_L2_M4_20: + + INIT4x2 + + mov BO, BC + asrs L , K1, #3 // L = K / 8 + ble _L2_M4_40 + .align 5 + +_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_22 + + +_L2_M4_40: + + ands L , K1, #7 // L = K % 8 + ble _L2_M4_100 + +_L2_M4_42: + + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_42 + +_L2_M4_100: + + SAVE4x2 + +_L2_M4_END: + + subs I, I, #1 + bgt _L2_M4_20 + + +_L2_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L2_END + + tst I, #2 // M & 2: one 2-row block left + ble _L2_M1_BEGIN + +_L2_M2_20: + + INIT2x2 + + mov BO, BC + asrs L , K1, #3 // L = K / 8 + ble _L2_M2_40 + +_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_22 + + +_L2_M2_40: + + ands L , K1, #7 // L = K % 8 + ble _L2_M2_100 + +_L2_M2_42: + + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_42 + +_L2_M2_100: + + SAVE2x2 + +_L2_M2_END: + + +_L2_M1_BEGIN: + + tst I, #1 // M & 1: one last row left + ble _L2_END + +_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #3 // L = K / 8 + ble _L2_M1_40 + +_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB +
KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = K % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + + +_L2_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #4 // k * 2 * 8 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +_L1_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L1_M2_BEGIN + +_L1_M4_20: + + INIT4x1 + + mov BO, BC + asrs L , K1, #3 // L = K / 8 + ble _L1_M4_40 + .align 5 + +_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_22 + + +_L1_M4_40: + + ands L , K1, #7 // L = K % 8 + ble _L1_M4_100 + +_L1_M4_42: + + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_42 + +_L1_M4_100: + + SAVE4x1 + +_L1_M4_END: + + subs I, I, #1 + bgt _L1_M4_20 + + +_L1_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L1_END + + tst I, #2 // M & 2: one 2-row block left + ble _L1_M1_BEGIN + +_L1_M2_20: + + INIT2x1 + + mov BO, BC + asrs L , K1, #3 // L = K / 8 + ble _L1_M2_40 + +_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_22 + + +_L1_M2_40: + + ands L , K1, #7 // L = K % 8 + ble _L1_M2_100 + +_L1_M2_42: + + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_42 + +_L1_M2_100: + + SAVE2x1 + +_L1_M2_END: + + +_L1_M1_BEGIN: + + tst I, #1 // M & 1: one last row left + ble _L1_END + +_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #3 // L = K / 8 + ble _L1_M1_40 + +_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB +
KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +_L1_END: + + +_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S new file mode 100644 index 000000000..a80177f8b --- /dev/null +++ b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S @@ -0,0 +1,1953 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/10/05 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 252 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA d0 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define KK [fp, #-240 ] +#define KKK [fp, #-244] +#define C [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-276 ] + +#define B [fp, #4 ] +#define OLD_C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 64 +#define B_PRE 64 +#define C_PRE 64 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x4 + + vsub.f64 d16 , d16 , 
d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d22, d16 + vmov.f64 d23, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d26, d16 + vmov.f64 d27, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + vmov.f64 d30, d16 + vmov.f64 d31, d16 + +.endm + +.macro KERNEL4x4_I + pld [ BO , #B_PRE ] + + fldd d8 , [ BO ] + + pld [ AO , #A_PRE ] + fldmiad AO!, { d0 - d1} + + fmuld d16 , d0, d8 + fldmiad AO!, { d2 - d3} + fmuld d17 , d1, d8 + fldd d9 , [ BO, #8 ] + fmuld d18 , d2, d8 + fldd d10, [ BO, #16 ] + fmuld d19 , d3, d8 + + fldd d11, [ BO, #24 ] + fmuld d20 , d0, d9 + fmuld d21 , d1, d9 + add BO , BO, #32 + fmuld d22 , d2, d9 + + fldd d12, [ BO ] + fmuld d23 , d3, d9 + + fmuld d24 , d0, d10 + fldmiad AO!, { d4 - d5 } + fmuld d25 , d1, d10 + fmuld d26 , d2, d10 + fldmiad AO!, { d6 - d7 } + fmuld d27 , d3, d10 + + fldd d13, [ BO, #8 ] + fmuld d28 , d0, d11 + fldd d14, [ BO, #16 ] + fmuld d29 , d1, d11 + fldd d15, [ BO, #24 ] + fmuld d30 , d2, d11 + fmuld d31 , d3, d11 + add BO , BO, #32 + +.endm + + + +.macro KERNEL4x4_S + pld [ BO , #B_PRE ] + + fldd d8 , [ BO ] + + pld [ AO , #A_PRE ] + fldmiad AO!, { d0 - d1} + + fmacd d16 , d0, d8 + fldmiad AO!, { d2 - d3} + fmacd d17 , d1, d8 + fldd d9 , [ BO, #8 ] + fmacd d18 , d2, d8 + fldd d10, [ BO, #16 ] + fmacd d19 , d3, d8 + + fldd d11, [ BO, #24 ] + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + add BO , BO, #32 + fmacd d22 , d2, d9 + + fldd d12, [ BO ] + fmacd d23 , d3, d9 + + fmacd d24 , d0, d10 + fldmiad AO!, { d4 - d5 } + fmacd d25 , d1, d10 + fmacd d26 , d2, d10 + fldmiad AO!, { d6 - d7 } + fmacd d27 , d3, d10 + + fldd d13, [ BO, #8 ] + fmacd d28 , d0, d11 + fldd d14, [ BO, #16 ] + fmacd d29 , d1, d11 + fldd d15, [ BO, #24 ] + fmacd d30 , d2, d11 + fmacd d31 , d3, d11 + add BO , BO, #32 + +.endm + + + +.macro KERNEL4x4_M1 + + fmacd d16 , d4, d12 + pld [ AO , #A_PRE ] + fmacd d17 , d5, d12 + fmacd d18 , d6, d12 + pld [ BO , #B_PRE ] + fmacd d19 , d7, d12 + + 
fmacd d20 , d4, d13 + fldd d8 , [ BO ] + fmacd d21 , d5, d13 + fmacd d22 , d6, d13 + fldmiad AO!, { d0 - d1 } + fmacd d23 , d7, d13 + + fmacd d24 , d4, d14 + fldmiad AO!, { d2 - d3 } + fmacd d25 , d5, d14 + fldd d9 , [ BO, #8 ] + fmacd d26 , d6, d14 + fldd d10, [ BO, #16 ] + fmacd d27 , d7, d14 + + fldd d11, [ BO, #24 ] + fmacd d28 , d4, d15 + fmacd d29 , d5, d15 + fmacd d30 , d6, d15 + add BO , BO, #32 + fmacd d31 , d7, d15 + +.endm + +.macro KERNEL4x4_M2 + + + fmacd d16 , d0, d8 + pld [ AO , #A_PRE ] + fmacd d17 , d1, d8 + pld [ BO , #B_PRE ] + fmacd d18 , d2, d8 + fldd d12, [ BO ] + fmacd d19 , d3, d8 + + fmacd d20 , d0, d9 + fldmiad AO!, { d4 - d5 } + fmacd d21 , d1, d9 + fmacd d22 , d2, d9 + fldmiad AO!, { d6 - d7 } + fmacd d23 , d3, d9 + + fmacd d24 , d0, d10 + fmacd d25 , d1, d10 + fmacd d26 , d2, d10 + fmacd d27 , d3, d10 + + fldd d13, [ BO, #8 ] + fmacd d28 , d0, d11 + fldd d14, [ BO, #16 ] + fmacd d29 , d1, d11 + fldd d15, [ BO, #24 ] + fmacd d30 , d2, d11 + fmacd d31 , d3, d11 + add BO , BO, #32 + +.endm + + +.macro KERNEL4x4_E + + fmacd d16 , d4, d12 + pld [ AO , #A_PRE ] + fmacd d17 , d5, d12 + fmacd d18 , d6, d12 + pld [ BO , #B_PRE ] + fmacd d19 , d7, d12 + + fmacd d20 , d4, d13 + fmacd d21 , d5, d13 + fmacd d22 , d6, d13 + fmacd d23 , d7, d13 + + fmacd d24 , d4, d14 + fmacd d25 , d5, d14 + fmacd d26 , d6, d14 + fmacd d27 , d7, d14 + + fmacd d28 , d4, d15 + fmacd d29 , d5, d15 + fmacd d30 , d6, d15 + fmacd d31 , d7, d15 + +.endm + + + + +.macro KERNEL4x4_SUB + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + fldd d8 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fmacd d16 , d0, d8 + fldd d9 , [ BO, #8 ] + fmacd d17 , d1, d8 + fldd d10, [ BO, #16 ] + fmacd d18 , d2, d8 + fldd d11, [ BO, #24 ] + fmacd d19 , d3, d8 + + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + fmacd d22 , d2, d9 + fmacd d23 , d3, d9 + + fmacd d24 , d0, d10 + fmacd d25 , d1, d10 + fmacd d26 , d2, d10 + fmacd d27 , d3, d10 + + fmacd 
d28 , d0, d11 + fmacd d29 , d1, d11 + add AO , AO, #32 + fmacd d30 , d2, d11 + add BO , BO, #32 + fmacd d31 , d3, d11 + +.endm + +.macro SAVE4x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + fldd d0, ALPHA + add r4 , CO2, r3 + + + fmuld d8 , d0 , d16 + fmuld d9 , d0 , d17 + fmuld d10, d0 , d18 + fmuld d11, d0 , d19 + + fmuld d12, d0 , d20 + fstd d8 , [CO1] + fmuld d13, d0 , d21 + fstd d9 , [CO1, #8 ] + fmuld d14, d0 , d22 + fstd d10, [CO1, #16 ] + fmuld d15, d0 , d23 + fstd d11, [CO1, #24 ] + + + fmuld d8 , d0 , d24 + fstd d12, [CO2] + fmuld d9 , d0 , d25 + fstd d13, [CO2, #8 ] + fmuld d10, d0 , d26 + fstd d14, [CO2, #16 ] + fmuld d11, d0 , d27 + fstd d15, [CO2, #24 ] + + add CO2, r4 , r3 + + fstd d8 , [r4 ] + fmuld d12, d0 , d28 + fstd d9 , [r4 , #8 ] + fmuld d13, d0 , d29 + fstd d10, [r4 , #16 ] + fmuld d14, d0 , d30 + fstd d11, [r4 , #24 ] + fmuld d15, d0 , d31 + + fstmiad CO2, { d12 - d15 } + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT2x4 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + +.endm + + + +.macro KERNEL2x4_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + + fmacd d24 , d0, d10 + fmacd d25 , d1, d10 + + fmacd d28 , d0, d11 + fmacd d29 , d1, d11 + add AO , AO, #16 + add BO , BO, #32 + +.endm + +.macro SAVE2x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + add r4 , CO2, r3 + + fldd d0, ALPHA + + fmuld d8 , d0 , d16 + fmuld d9 , d0 , d17 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + + fmuld d12, d0 , d20 + fmuld d13, d0 , d21 + + fstd d12, [CO2] + fstd d13, [CO2, #8 ] + + fmuld d8 , d0 , d24 + fmuld d9 , d0 , d25 + + fstd d8 , [r4 ] + fstd d9 , [r4 , #8 ] + + add CO2, r4 , r3 + + fmuld d12, 
d0 , d28 + fmuld d13, d0 , d29 + + fstd d12, [CO2] + fstd d13, [CO2, #8 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT1x4 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d20, d16 + vmov.f64 d24, d16 + vmov.f64 d28, d16 + +.endm + + + +.macro KERNEL1x4_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + fldd d0 , [ AO ] + + fmacd d16 , d0, d8 + fmacd d20 , d0, d9 + fmacd d24 , d0, d10 + fmacd d28 , d0, d11 + + add AO , AO, #8 + add BO , BO, #32 + +.endm + +.macro SAVE1x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + add r4 , CO2, r3 + + fldd d0, ALPHA + + fmuld d8 , d0 , d16 + fstd d8 , [CO1] + + fmuld d12, d0 , d20 + fstd d12, [CO2] + + fmuld d8 , d0 , d24 + fstd d8 , [r4 ] + + add CO2, r4 , r3 + + fmuld d12, d0 , d28 + fstd d12, [CO2] + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d22, d16 + vmov.f64 d23, d16 + +.endm + + + +.macro KERNEL4x2_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + fmacd d18 , d2, d8 + fmacd d19 , d3, d8 + + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + fmacd d22 , d2, d9 + fmacd d23 , d3, d9 + + add AO , AO, #32 + add BO , BO, #16 + +.endm + +.macro SAVE4x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fmuld d8 , d0 , d16 + fmuld d9 , d0 , d17 + fmuld d10, d0 , d18 + fmuld d11, d0 , d19 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + fstd d10, [CO1, #16 ] + fstd d11, [CO1, #24 ] + + fmuld d12, d0 , d20 + fmuld d13, d0 , d21 + fmuld d14, d0 , d22 + fmuld d15, d0 , d23 + + fstd 
d12, [CO2] + fstd d13, [CO2, #8 ] + fstd d14, [CO2, #16 ] + fstd d15, [CO2, #24 ] + + add CO1, CO1, #32 + +.endm + + +/******************************************************************************/ + +.macro INIT2x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + +.endm + + + +.macro KERNEL2x2_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + + add AO , AO, #16 + add BO , BO, #16 + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fmuld d8 , d0 , d16 + fmuld d9 , d0 , d17 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + + fmuld d12, d0 , d20 + fmuld d13, d0 , d21 + + fstd d12, [CO2] + fstd d13, [CO2, #8 ] + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d20, d16 + +.endm + + + +.macro KERNEL1x2_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fldd d0 , [ AO ] + fmacd d16 , d0, d8 + fmacd d20 , d0, d9 + + add AO , AO, #8 + add BO , BO, #16 + +.endm + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fmuld d8 , d0 , d16 + fstd d8 , [CO1] + + fmuld d12, d0 , d20 + fstd d12, [CO2] + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + +.endm + + + +.macro KERNEL4x1_SUB + + fldd d8 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + fmacd d18 , d2, d8 + fmacd d19 , d3, d8 + + add AO , AO, #32 + add BO , BO, #8 + +.endm + +.macro SAVE4x1 + + + fldd d0, ALPHA + + fmuld d8 , d0 , 
d16 + fmuld d9 , d0 , d17 + fmuld d10, d0 , d18 + fmuld d11, d0 , d19 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + fstd d10, [CO1, #16 ] + fstd d11, [CO1, #24 ] + + add CO1, CO1, #32 + +.endm + + + + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + +.endm + + + +.macro KERNEL2x1_SUB + + fldd d8 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + + add AO , AO, #16 + add BO , BO, #8 + +.endm + +.macro SAVE2x1 + + + fldd d0, ALPHA + + fmuld d8 , d0 , d16 + fmuld d9 , d0 , d17 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d16 , d16 , d16 + +.endm + + + +.macro KERNEL1x1_SUB + + fldd d8 , [ BO ] + + fldd d0 , [ AO ] + + fmacd d16 , d0, d8 + + add AO , AO, #8 + add BO , BO, #8 + +.endm + +.macro SAVE1x1 + + + fldd d0, ALPHA + + fmuld d8 , d0 , d16 + fstd d8 , [CO1] + + add CO1, CO1, #8 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 8 + str r3, LDC + + ldr r3, OLD_C + str r3, C + + ldr BC, B + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif + str r3 , KK + + ldr J, N + asrs J, J, #2 // J = J / 4 + ble _L2_BEGIN + +_L4_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #2 // LDC * 4 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , 
KK +#endif + + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L4_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L4_M2_BEGIN + +_L4_M4_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #5 // 4 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #4 // number of values in AO +#else + add L , L , #4 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #5 // L = L / 8 + ble _L4_M4_40 + .align 5 + + + + KERNEL4x4_I + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_E + + subs L, L, #1 + ble _L4_M4_41 + +_L4_M4_22: + + KERNEL4x4_S + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_E + + subs L, L, #1 + ble _L4_M4_41 + + b _L4_M4_22 + + +_L4_M4_40: + + INIT4x4 + +_L4_M4_41: + + ands L , K1, #31 // 
L = L % 8 + ble _L4_M4_100 + +_L4_M4_42: + + KERNEL4x4_SUB + + subs L, L, #1 + bgt _L4_M4_42 + +_L4_M4_100: + + SAVE4x4 + + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #5 // 4 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + +_L4_M4_END: + + subs I, I, #1 + bgt _L4_M4_20 + + +_L4_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L4_END + + tst I, #2 // I = I / 2 + ble _L4_M1_BEGIN + +_L4_M2_20: + + INIT2x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #5 // 4 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #2 // number of values in AO +#else + add L , L , #4 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L4_M2_40 + +_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs L, L, #1 + bgt _L4_M2_22 + + +_L4_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L4_M2_100 + +_L4_M2_42: + + KERNEL2x4_SUB + + subs L, L, #1 + bgt _L4_M2_42 + +_L4_M2_100: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #5 // 4 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , 
KK +#endif + + +_L4_M2_END: + + +_L4_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L4_END + +_L4_M1_20: + + INIT1x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #5 // 4 double values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #1 // number of values in AO +#else + add L , L , #4 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L4_M1_40 + +_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs L, L, #1 + bgt _L4_M1_22 + + +_L4_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L4_M1_100 + +_L4_M1_42: + + KERNEL1x4_SUB + + subs L, L, #1 + bgt _L4_M1_42 + +_L4_M1_100: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #5 // 4 double values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L4_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #5 // k * 4 * 8 + add r3, r3, r4 // B = B + K * 4 * 8 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in BO + str r3 , KK +#endif + + subs J , #1 // j-- + bgt _L4_BEGIN + + + +/*********************************************************************************************/ + +_L2_BEGIN: + + ldr J , N + tst J , #3 + ble _L999 + + tst J , #2 + ble _L1_BEGIN + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 
+ str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +_L2_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L2_M2_BEGIN + +_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #4 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L2_M4_40 + .align 5 + +_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_22 + + +_L2_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M4_100 + +_L2_M4_42: + + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_42 + +_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + +_L2_M4_END: + + subs I, I, #1 + bgt _L2_M4_20 + + +_L2_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L2_END + + tst I, #2 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 double 
values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #2 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L2_M2_40 + +_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_22 + + +_L2_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_42: + + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_42 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L2_M2_END: + + +_L2_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #1 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + 
KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #4 // k * 2 * 8 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in BO + str r3 , KK +#endif + + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +_L1_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L1_M2_BEGIN + +_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #4 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 
+ ble _L1_M4_40 + .align 5 + +_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_22 + + +_L1_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M4_100 + +_L1_M4_42: + + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_42 + +_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + +_L1_M4_END: + + subs I, I, #1 + bgt _L1_M4_20 + + +_L1_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L1_END + + tst I, #2 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #2 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L1_M2_40 + +_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_22 + + +_L1_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_42: + + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_42 + +_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr 
r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L1_M2_END: + + +_L1_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #1 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +_L1_END: + + +_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/param.h b/param.h index a2ed09d6f..cf1665438 100644 --- a/param.h +++ b/param.h @@ -1805,8 +1805,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 2 -#define DGEMM_DEFAULT_UNROLL_M 8 -#define DGEMM_DEFAULT_UNROLL_N 2 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 2 @@ -1815,12 +1815,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 64 -#define DGEMM_DEFAULT_P 64 +#define DGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 24 #define ZGEMM_DEFAULT_P 20 #define SGEMM_DEFAULT_Q 192 -#define DGEMM_DEFAULT_Q 64 +#define DGEMM_DEFAULT_Q 96 #define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_Q 64 From 31f51e78bc4a0c69ab5a1f968eaa43f403288edd Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 12 Oct 2013 09:42:18 +0200 Subject: [PATCH 10/81] minor optimizations on dgemm_kernel --- kernel/arm/dgemm_kernel_4x4_vfpv3.S | 333 +++++++++++----------------- 1 file changed, 131 insertions(+), 202 deletions(-) diff --git a/kernel/arm/dgemm_kernel_4x4_vfpv3.S b/kernel/arm/dgemm_kernel_4x4_vfpv3.S index 4c30e108c..dfe3e3634 100644 --- a/kernel/arm/dgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/dgemm_kernel_4x4_vfpv3.S @@ -26,34 +26,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** -* 2013/10/05 Saar +* 2013/10/11 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * * -* 2013/10/03 Saar +* 2013/10/11 Saar * UNROLL_N 4 * UNROLL_M 4 * DGEMM_P 128 * DGEMM_Q 96 * DGEMM_R 512 -* A_PRE 64 -* B_PRE 64 +* A_PRE 96 +* B_PRE 96 * C_PRE 64 * * Performance on Odroid U2: * -* 1 Core: 1.55 GFLOPS ATLAS: 1.59 GFLOPS -* 2 Cores: 3.10 GFLOPS ATLAS: - GFLOPS -* 3 Cores: 4.54 GFLOPS ATLAS: - GFLOPS -* 4 Cores: 5.67 GFLOPS ATLAS: 3.88 GFLOPS +* 1 Core: 1.57 GFLOPS ATLAS: 1.59 GFLOPS +* 2 Cores: 3.14 GFLOPS ATLAS: 3.16 GFLOPS +* 3 Cores: 4.56 GFLOPS ATLAS: 4.60 GFLOPS +* 4 Cores: 5.82 GFLOPS ATLAS: 5.41 GFLOPS **************************************************************************************/ #define ASSEMBLER #include "common.h" -#define STACKSIZE 252 +#define STACKSIZE 256 #define OLD_M r0 #define OLD_N r1 @@ -67,17 +67,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * registers *******************************************************/ -#define C [fp, #-248 ] #define LDC [fp, #-252 ] #define M [fp, #-256 ] #define N [fp, #-260 ] #define K [fp, #-264 ] #define A [fp, #-268 ] -#define ALPHA [fp, #-276 ] +#define ALPHA [fp, #-280] #define B [fp, #4 ] -#define OLD_C [fp, #8 ] +#define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define I r0 @@ -93,8 +92,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define K1 r7 #define BC r12 -#define A_PRE 64 -#define B_PRE 64 +#define A_PRE 96 +#define B_PRE 96 #define C_PRE 64 /************************************************************************************** @@ -124,34 +123,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL4x4_I pld [ BO , #B_PRE ] - fldd d8 , [ BO ] - + fldd d0 , [ AO ] pld [ AO , #A_PRE ] - fldmiad AO!, { d0 - d1} + fldd d1 , [ AO, #8 ] fmuld d16 , d0, d8 - fldmiad AO!, { d2 - d3} + fldd d2 , [ AO, #16 ] fmuld d17 , d1, d8 - fldd d9 , [ BO, #8 ] + fldd d3 , [ AO, #24 ] fmuld d18 , d2, d8 - fldd d10, [ BO, #16 ] + fldd d9 , [ BO, #8 ] fmuld d19 , d3, d8 - fldd d11, [ BO, #24 ] + fldd d10, [ BO, #16 ] fmuld d20 , d0, d9 + fldd d11, [ BO, #24 ] fmuld d21 , d1, d9 add BO , BO, #32 + add AO , AO, #32 fmuld d22 , d2, d9 + pld [ BO , #B_PRE ] fldd d12, [ BO ] fmuld d23 , d3, d9 + pld [ AO , #A_PRE ] + fldd d4 , [ AO, #0 ] fmuld d24 , d0, d10 - fldmiad AO!, { d4 - d5 } + fldd d5 , [ AO, #8 ] fmuld d25 , d1, d10 + fldd d6 , [ AO, #16 ] fmuld d26 , d2, d10 - fldmiad AO!, { d6 - d7 } + fldd d7 , [ AO, #24 ] fmuld d27 , d3, d10 fldd d13, [ BO, #8 ] @@ -161,132 +165,91 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d15, [ BO, #24 ] fmuld d30 , d2, d11 fmuld d31 , d3, d11 - add BO , BO, #32 .endm -.macro KERNEL4x4_S - pld [ BO , #B_PRE ] - - fldd d8 , [ BO ] - - pld [ AO , #A_PRE ] - fldmiad AO!, { d0 - d1} - - fmacd d16 , d0, d8 - fldmiad AO!, { d2 - d3} - fmacd d17 , d1, d8 - fldd d9 , [ BO, #8 ] - fmacd d18 , d2, d8 - fldd d10, [ BO, #16 ] - fmacd d19 , d3, d8 - - fldd d11, [ BO, #24 ] - fmacd d20 , d0, d9 - fmacd d21 , d1, d9 - add BO , BO, #32 - fmacd d22 , d2, d9 - - fldd d12, [ BO ] - fmacd d23 , d3, d9 - - fmacd d24 , d0, d10 - fldmiad AO!, { d4 - d5 } - fmacd d25 , d1, d10 - fmacd d26 , d2, d10 - fldmiad AO!, { d6 - d7 } - fmacd d27 , d3, d10 - - fldd d13, [ BO, #8 ] - fmacd d28 , d0, d11 - fldd d14, [ BO, #16 ] - fmacd d29 , d1, d11 - fldd d15, [ BO, #24 ] - fmacd d30 , d2, d11 - fmacd d31 , d3, d11 - add BO , BO, #32 - -.endm - - - -.macro KERNEL4x4_M1 +.macro KERNEL4x4_M2 fmacd d16 , d4, d12 - pld [ AO , #A_PRE ] + pld [ AO , #A_PRE+32 ] fmacd d17 , d5, d12 + fldd d0 , [ AO , #32 ] fmacd d18 , d6, d12 - pld [ BO , #B_PRE 
] + pld [ BO , #B_PRE+32 ] fmacd d19 , d7, d12 + fldd d8 , [ BO , #32 ] fmacd d20 , d4, d13 - fldd d8 , [ BO ] + fldd d1 , [ AO, #40 ] fmacd d21 , d5, d13 + fldd d2 , [ AO, #48 ] fmacd d22 , d6, d13 - fldmiad AO!, { d0 - d1 } + fldd d3 , [ AO, #56 ] fmacd d23 , d7, d13 fmacd d24 , d4, d14 - fldmiad AO!, { d2 - d3 } fmacd d25 , d5, d14 - fldd d9 , [ BO, #8 ] + fldd d9 , [ BO, #40 ] fmacd d26 , d6, d14 - fldd d10, [ BO, #16 ] + fldd d10, [ BO, #48 ] fmacd d27 , d7, d14 - fldd d11, [ BO, #24 ] + fldd d11, [ BO, #56 ] fmacd d28 , d4, d15 fmacd d29 , d5, d15 + add AO , AO, #64 fmacd d30 , d6, d15 - add BO , BO, #32 + add BO , BO, #64 fmacd d31 , d7, d15 .endm -.macro KERNEL4x4_M2 +.macro KERNEL4x4_M1 fmacd d16 , d0, d8 pld [ AO , #A_PRE ] fmacd d17 , d1, d8 - pld [ BO , #B_PRE ] + fldd d4 , [ AO ] fmacd d18 , d2, d8 - fldd d12, [ BO ] + pld [ BO , #B_PRE ] fmacd d19 , d3, d8 + fldd d12, [ BO ] fmacd d20 , d0, d9 - fldmiad AO!, { d4 - d5 } + fldd d5 , [ AO, #8 ] fmacd d21 , d1, d9 + fldd d6 , [ AO, #16 ] fmacd d22 , d2, d9 - fldmiad AO!, { d6 - d7 } + fldd d7 , [ AO, #24 ] fmacd d23 , d3, d9 fmacd d24 , d0, d10 fmacd d25 , d1, d10 + fldd d13, [ BO, #8 ] fmacd d26 , d2, d10 + fldd d14, [ BO, #16 ] fmacd d27 , d3, d10 - fldd d13, [ BO, #8 ] - fmacd d28 , d0, d11 - fldd d14, [ BO, #16 ] - fmacd d29 , d1, d11 fldd d15, [ BO, #24 ] + fmacd d28 , d0, d11 + fmacd d29 , d1, d11 fmacd d30 , d2, d11 fmacd d31 , d3, d11 - add BO , BO, #32 .endm + .macro KERNEL4x4_E fmacd d16 , d4, d12 - pld [ AO , #A_PRE ] fmacd d17 , d5, d12 + add BO , BO, #32 + add AO , AO, #32 fmacd d18 , d6, d12 - pld [ BO , #B_PRE ] fmacd d19 , d7, d12 fmacd d20 , d4, d13 @@ -310,25 +273,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL4x4_SUB - pld [ BO , #B_PRE ] - pld [ AO , #A_PRE ] fldd d8 , [ BO ] + pld [ BO , #B_PRE ] fldd d0 , [ AO ] + pld [ AO , #A_PRE ] fldd d1 , [ AO, #8 ] - fldd d2 , [ AO, #16 ] - fldd d3 , [ AO, #24 ] fmacd d16 , d0, d8 - fldd d9 , [ BO, #8 ] + fldd d2 , [ AO, #16 ] fmacd d17 , d1, d8 - fldd d10, [ BO, #16 ] + fldd d3 , [ AO, #24 ] fmacd d18 , d2, d8 - fldd d11, [ BO, #24 ] + fldd d9 , [ BO, #8 ] fmacd d19 , d3, d8 + fldd d10, [ BO, #16 ] fmacd d20 , d0, d9 + fldd d11, [ BO, #24 ] fmacd d21 , d1, d9 fmacd d22 , d2, d9 fmacd d23 , d3, d9 @@ -924,9 +887,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lsl r3, r3, #3 // ldc = ldc * 8 str r3, LDC - ldr r3, OLD_C - str r3, C - ldr K1, K ldr BC, B @@ -958,140 +918,109 @@ _L4_M4_20: mov BO, BC - asrs L , K1, #5 // L = L / 32 - ble _L4_M4_40 + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L4_M4_30 .align 5 KERNEL4x4_I - KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 + KERNEL4x4_M2 - KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 + KERNEL4x4_M2 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - - - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_E - - subs L, L, #1 - ble _L4_M4_41 + sub L, L, #2 _L4_M4_22: - KERNEL4x4_S KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 + KERNEL4x4_M2 - KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - - - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_E subs L, L, #1 - ble _L4_M4_41 + 
bgt _L4_M4_22 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + b _L4_M4_44 - b _L4_M4_22 +_L4_M4_30: + tst L, #3 + ble _L4_M4_40 + + tst L, #2 + ble _L4_M4_32 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + b _L4_M4_44 + +_L4_M4_32: + + tst L, #1 + ble _L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + b _L4_M4_44 + _L4_M4_40: INIT4x4 -_L4_M4_41: - - tst K1, #31 - ble _L4_M4_100 - - tst K1, #16 - ble _L4_M4_44 - - KERNEL4x4_S - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_E _L4_M4_44: - ands L , K1, #15 // L = L % 16 + ands L , K1, #7 // L = L % 8 ble _L4_M4_100 _L4_M4_46: @@ -1099,7 +1028,7 @@ _L4_M4_46: KERNEL4x4_SUB subs L, L, #1 - bgt _L4_M4_46 + bne _L4_M4_46 _L4_M4_100: @@ -1108,7 +1037,7 @@ _L4_M4_100: _L4_M4_END: subs I, I, #1 - bgt _L4_M4_20 + bne _L4_M4_20 _L4_M2_BEGIN: From 2a1515c9dd561fe8c3dca9dcae9efc22a0c56644 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 12 Oct 2013 16:48:29 +0200 Subject: [PATCH 11/81] added dgemm_ncopy_4_vfpv3.S --- kernel/arm/dgemm_ncopy_4_vfpv3.S | 344 +++++++++++++++++++++++++++++++ 1 file changed, 344 insertions(+) create mode 100644 kernel/arm/dgemm_ncopy_4_vfpv3.S diff --git a/kernel/arm/dgemm_ncopy_4_vfpv3.S b/kernel/arm/dgemm_ncopy_4_vfpv3.S new file mode 100644 index 000000000..bdb63bfdd --- /dev/null +++ b/kernel/arm/dgemm_ncopy_4_vfpv3.S @@ -0,0 +1,344 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All 
rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/10/11 Saar +* BLASTEST : xOK +* CTEST : xOK +* TEST : xOK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define LDA [fp, #-260 ] + +#define B [fp, #4 ] + +#define M r0 +#define N r1 +#define A r2 + +#define BO r5 + +#define AO1 r6 +#define AO2 r7 +#define AO3 r8 +#define AO4 r9 + +#define I r3 +#define J r12 + +#define A_PRE 96 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY4x4 + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO2, #0 ] + fldd d2 , [ AO3, #0 ] + fldd d3 , [ AO4, #0 ] + + fldd d4 , [ AO1, #8 ] + fldd d8 , [ AO1, #16 ] + fldd d12, [ AO1, #24 ] + + fldd d5 , [ AO2, #8 ] + add AO1, AO1, #32 + fldd d9 , [ AO2, #16 ] + fldd d13, [ AO2, #24 ] + + fldd d6 , [ AO3, #8 ] + add AO2, AO2, #32 + fldd d10, [ AO3, #16 ] + fldd d14, [ AO3, #24 ] + + fldd d7 , [ AO4, #8 ] + add AO3, AO3, #32 + fldd d11, [ AO4, #16 ] + fldd d15, [ AO4, #24 ] + + fstmiad BO!, { d0 - d3 } + add AO4, AO4, #32 + fstmiad BO!, { d4 - d7 } + fstmiad BO!, { d8 - d15 } + +.endm + +.macro COPY1x4 + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO2, #0 ] + add AO1, AO1, #8 + fldd d2 , [ AO3, #0 ] + add AO2, AO2, #8 + fldd d3 , [ AO4, #0 ] + + add AO3, AO3, #8 + fstmiad BO!, { d0 - d3 } + add AO4, AO4, #8 + +.endm + +.macro COPY4x2 + + fldd d0 , [ AO1, #0 ] + fldd d2 , [ 
AO1, #8 ] + fldd d4 , [ AO1, #16 ] + fldd d6 , [ AO1, #24 ] + + fldd d1 , [ AO2, #0 ] + fldd d3 , [ AO2, #8 ] + add AO1, AO1, #32 + fldd d5 , [ AO2, #16 ] + fldd d7 , [ AO2, #24 ] + + fstmiad BO!, { d0 - d7 } + add AO2, AO2, #32 + +.endm + + +.macro COPY1x2 + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO2, #0 ] + add AO1, AO1, #8 + + fstmiad BO!, { d0 - d1 } + add AO2, AO2, #8 + +.endm + +.macro COPY4x1 + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO1, #8 ] + fldd d2 , [ AO1, #16 ] + fldd d3 , [ AO1, #24 ] + + fstmiad BO!, { d0 - d3 } + add AO1, AO1, #32 + +.endm + + +.macro COPY1x1 + + fldd d0 , [ AO1, #0 ] + + fstmiad BO!, { d0 } + add AO1, AO1, #8 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + + lsl r3, r3, #3 // lda = lda * 8 + str r3, LDA + + sub r4, fp, #128 + vstm r4, { d8 - d15} // store floating point registers + + ldr BO, B + +_L4_BEGIN: + + asrs J, N, #2 // J = N / 4 + ble _L2_BEGIN + +_L4_M4_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add AO2, AO1, r4 + add AO3, AO2, r4 + add AO4, AO3, r4 + add A , AO4, r4 // A = A + 4 * LDA + + asrs I, M, #2 // I = M / 4 + ble _L4_M4_40 + +_L4_M4_20: + + COPY4x4 + + subs I , I , #1 + bne _L4_M4_20 + + +_L4_M4_40: + + ands I, M , #3 + ble _L4_M4_END + +_L4_M4_60: + + COPY1x4 + + subs I , I , #1 + bne _L4_M4_60 + + +_L4_M4_END: + + subs J , J, #1 // j-- + bne _L4_M4_BEGIN + + + +/*********************************************************************************************/ + +_L2_BEGIN: + + tst N, #3 + ble _L999 + + tst N, #2 + ble _L1_BEGIN + +_L2_M4_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add AO2, AO1, r4 + add A , AO2, r4 // A = A + 2 * LDA + + asrs I, M, #2 // I = M / 4 + ble _L2_M4_40 + +_L2_M4_20: + + COPY4x2 + + subs I , I 
, #1 + bne _L2_M4_20 + + +_L2_M4_40: + + ands I, M , #3 + ble _L2_M4_END + +_L2_M4_60: + + COPY1x2 + + subs I , I , #1 + bne _L2_M4_60 + + +_L2_M4_END: + + +/*********************************************************************************************/ + +_L1_BEGIN: + + tst N, #1 + ble _L999 + + +_L1_M4_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add A , AO1, r4 // A = A + 1 * LDA + + asrs I, M, #2 // I = M / 4 + ble _L1_M4_40 + +_L1_M4_20: + + COPY4x1 + + subs I , I , #1 + bne _L1_M4_20 + + +_L1_M4_40: + + ands I, M , #3 + ble _L1_M4_END + +_L1_M4_60: + + COPY1x1 + + subs I , I , #1 + bne _L1_M4_60 + + +_L1_M4_END: + + + +_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + From 3983011f0b00021f9663009192c89ce12dbac794 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 14 Oct 2013 08:22:27 +0200 Subject: [PATCH 12/81] added sgemm- and strmm_kernel --- kernel/arm/sgemm_kernel_4x4_vfpv3.S | 1491 +++++++++++++++++++++ kernel/arm/strmm_kernel_4x4_vfpv3.S | 1884 +++++++++++++++++++++++++++ 2 files changed, 3375 insertions(+) create mode 100644 kernel/arm/sgemm_kernel_4x4_vfpv3.S create mode 100644 kernel/arm/strmm_kernel_4x4_vfpv3.S diff --git a/kernel/arm/sgemm_kernel_4x4_vfpv3.S b/kernel/arm/sgemm_kernel_4x4_vfpv3.S new file mode 100644 index 000000000..4746a587e --- /dev/null +++ b/kernel/arm/sgemm_kernel_4x4_vfpv3.S @@ -0,0 +1,1491 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/10/13 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/13 Saar +* UNROLL_N 4 +* UNROLL_M 4 +* DGEMM_P 128 +* DGEMM_Q 240 +* DGEMM_R 4096 +* A_PRE 96 +* B_PRE 96 +* C_PRE 64 +* +* Performance on Odroid U2: +* +* 1 Core: 2.60 GFLOPS ATLAS: 2.67 GFLOPS +* 2 Cores: 5.17 GFLOPS ATLAS: 5.25 GFLOPS +* 3 Cores: 7.60 GFLOPS ATLAS: 7.82 GFLOPS +* 4 Cores: 9.98 GFLOPS ATLAS: 9.95 GFLOPS +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA s0 + +/****************************************************** +* [fp, #-128] - [fp, #-32] is reserved +* for store and restore of floating point +* registers (s8 - s31, 24 x 4 bytes) +*******************************************************/ + +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x4 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s22, s16 + vmov.f32 s23, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s26, s16 + vmov.f32 s27, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + vmov.f32 s30, s16 +
vmov.f32 s31, s16 + +.endm + +.macro KERNEL4x4_I + + fldmias AO!, { s0 - s1 } + pld [ AO , #A_PRE-8 ] + fldmias BO!, { s8 - s9 } + pld [ BO , #B_PRE-8 ] + + fmuls s16 , s0, s8 + fldmias AO!, { s2 - s3 } + fmuls s17 , s1, s8 + fmuls s18 , s2, s8 + fldmias BO!, { s10 - s11 } + fmuls s19 , s3, s8 + + fmuls s20 , s0, s9 + fldmias AO!, { s4 - s5 } + fmuls s21 , s1, s9 + fmuls s22 , s2, s9 + fldmias AO!, { s6 - s7 } + fmuls s23 , s3, s9 + + fmuls s24 , s0, s10 + fldmias BO!, { s12 - s13 } + fmuls s25 , s1, s10 + fmuls s26 , s2, s10 + fldmias BO!, { s14 - s15 } + fmuls s27 , s3, s10 + + fmuls s28 , s0, s11 + fmuls s29 , s1, s11 + fmuls s30 , s2, s11 + fmuls s31 , s3, s11 + +.endm + + +.macro KERNEL4x4_M2 + + pld [ AO , #A_PRE ] + fmacs s16 , s4, s12 + fmacs s17 , s5, s12 + fldmias AO!, { s0 - s1 } + fmacs s18 , s6, s12 + pld [ BO , #B_PRE ] + fmacs s19 , s7, s12 + + fmacs s20 , s4, s13 + fldmias AO!, { s2 - s3 } + fmacs s21 , s5, s13 + fmacs s22 , s6, s13 + fldmias BO!, { s8 - s9 } + fmacs s23 , s7, s13 + + fmacs s24 , s4, s14 + fldmias BO!, { s10 - s11 } + fmacs s25 , s5, s14 + fmacs s26 , s6, s14 + fmacs s27 , s7, s14 + + fmacs s28 , s4, s15 + fmacs s29 , s5, s15 + fmacs s30 , s6, s15 + fmacs s31 , s7, s15 + +.endm + + +.macro KERNEL4x4_M1 + + fmacs s16 , s0, s8 + fldmias AO!, { s4 - s5 } + fmacs s17 , s1, s8 + fmacs s18 , s2, s8 + fldmias AO!, { s6 - s7 } + fmacs s19 , s3, s8 + + fmacs s20 , s0, s9 + fldmias BO!, { s12 - s13 } + fmacs s21 , s1, s9 + fmacs s22 , s2, s9 + fldmias BO!, { s14 - s15 } + fmacs s23 , s3, s9 + + fmacs s24 , s0, s10 + fmacs s25 , s1, s10 + fmacs s26 , s2, s10 + fmacs s27 , s3, s10 + + fmacs s28 , s0, s11 + fmacs s29 , s1, s11 + fmacs s30 , s2, s11 + fmacs s31 , s3, s11 + +.endm + + + +.macro KERNEL4x4_E + + fmacs s16 , s4, s12 + fmacs s17 , s5, s12 + fmacs s18 , s6, s12 + fmacs s19 , s7, s12 + + fmacs s20 , s4, s13 + fmacs s21 , s5, s13 + fmacs s22 , s6, s13 + fmacs s23 , s7, s13 + + fmacs s24 , s4, s14 + fmacs s25 , s5, s14 + fmacs s26 , s6, 
s14 + fmacs s27 , s7, s14 + + fmacs s28 , s4, s15 + fmacs s29 , s5, s15 + fmacs s30 , s6, s15 + fmacs s31 , s7, s15 + +.endm + + + + +.macro KERNEL4x4_SUB + + flds s8 , [ BO ] + pld [ BO , #B_PRE ] + + flds s0 , [ AO ] + pld [ AO , #A_PRE ] + flds s1 , [ AO, #4 ] + + fmacs s16 , s0, s8 + flds s2 , [ AO, #8 ] + fmacs s17 , s1, s8 + flds s3 , [ AO, #12 ] + fmacs s18 , s2, s8 + flds s9 , [ BO, #4 ] + fmacs s19 , s3, s8 + + flds s10, [ BO, #8 ] + fmacs s20 , s0, s9 + flds s11, [ BO, #12 ] + fmacs s21 , s1, s9 + fmacs s22 , s2, s9 + fmacs s23 , s3, s9 + + fmacs s24 , s0, s10 + fmacs s25 , s1, s10 + fmacs s26 , s2, s10 + fmacs s27 , s3, s10 + + fmacs s28 , s0, s11 + fmacs s29 , s1, s11 + add AO , AO, #16 + fmacs s30 , s2, s11 + add BO , BO, #16 + fmacs s31 , s3, s11 + +.endm + +.macro SAVE4x4 + pld [ CO1 , #C_PRE ] + + ldr r3 , LDC + add CO2 , CO1, r3 + flds s0, ALPHA + add r4 , CO2, r3 + pld [ CO2 , #C_PRE ] + + fldmias CO1, { s8 - s11 } + pld [ r4 , #C_PRE ] + + fmacs s8 , s0 , s16 + flds s12, [CO2] + fmacs s9 , s0 , s17 + flds s13, [CO2, #4 ] + fmacs s10, s0 , s18 + flds s14, [CO2, #8 ] + fmacs s11, s0 , s19 + flds s15, [CO2, #12 ] + + fmacs s12, s0 , s20 + fsts s8 , [CO1] + fmacs s13, s0 , s21 + fsts s9 , [CO1, #4 ] + fmacs s14, s0 , s22 + fsts s10, [CO1, #8 ] + fmacs s15, s0 , s23 + fsts s11, [CO1, #12 ] + + fldmias r4, { s8 - s11 } + + fmacs s8 , s0 , s24 + fsts s12, [CO2] + fmacs s9 , s0 , s25 + fsts s13, [CO2, #4 ] + fmacs s10, s0 , s26 + fsts s14, [CO2, #8 ] + fmacs s11, s0 , s27 + fsts s15, [CO2, #12 ] + + add CO2, r4 , r3 + + pld [ CO2 , #C_PRE ] + + fldmias CO2, { s12 - s15 } + + fsts s8 , [r4 ] + fmacs s12, s0 , s28 + fsts s9 , [r4 , #4 ] + fmacs s13, s0 , s29 + fsts s10, [r4 , #8 ] + fmacs s14, s0 , s30 + fsts s11, [r4 , #12 ] + fmacs s15, s0 , s31 + + fstmias CO2, { s12 - s15 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT2x4 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 
+ vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + +.endm + + + +.macro KERNEL2x4_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + + fmacs s20 , s0, s9 + fmacs s21 , s1, s9 + + fmacs s24 , s0, s10 + fmacs s25 , s1, s10 + + fmacs s28 , s0, s11 + fmacs s29 , s1, s11 + add AO , AO, #8 + add BO , BO, #16 + +.endm + +.macro SAVE2x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + add r4 , CO2, r3 + + flds s0, ALPHA + + flds s8 , [CO1] + flds s9 , [CO1, #4 ] + + fmacs s8 , s0 , s16 + fmacs s9 , s0 , s17 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + + flds s12, [CO2] + flds s13, [CO2, #4 ] + + fmacs s12, s0 , s20 + fmacs s13, s0 , s21 + + fsts s12, [CO2] + fsts s13, [CO2, #4 ] + + flds s8 , [r4 ] + flds s9 , [r4 , #4 ] + + fmacs s8 , s0 , s24 + fmacs s9 , s0 , s25 + + fsts s8 , [r4 ] + fsts s9 , [r4 , #4 ] + + add CO2, r4 , r3 + + flds s12, [CO2] + flds s13, [CO2, #4 ] + + fmacs s12, s0 , s28 + fmacs s13, s0 , s29 + + fsts s12, [CO2] + fsts s13, [CO2, #4 ] + + add CO1, CO1, #8 + +.endm + + +/******************************************************************************/ + +.macro INIT1x4 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s20, s16 + vmov.f32 s24, s16 + vmov.f32 s28, s16 + +.endm + + + +.macro KERNEL1x4_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + flds s0 , [ AO ] + + fmacs s16 , s0, s8 + fmacs s20 , s0, s9 + fmacs s24 , s0, s10 + fmacs s28 , s0, s11 + + add AO , AO, #4 + add BO , BO, #16 + +.endm + +.macro SAVE1x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + add r4 , CO2, r3 + + flds s0, ALPHA + + flds s8 , [CO1] + fmacs s8 , s0 , s16 + fsts s8 , [CO1] + + flds s12, [CO2] + fmacs s12, s0 , s20 + fsts s12, [CO2] + + flds s8 , [r4 ] + fmacs s8 , s0 , s24 + fsts s8 , [r4 ] + + add CO2, r4 , r3 + + flds s12, [CO2] + fmacs s12, s0 , 
s28 + fsts s12, [CO2] + + add CO1, CO1, #4 + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s22, s16 + vmov.f32 s23, s16 + +.endm + + + +.macro KERNEL4x2_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + fmacs s18 , s2, s8 + fmacs s19 , s3, s8 + + fmacs s20 , s0, s9 + fmacs s21 , s1, s9 + fmacs s22 , s2, s9 + fmacs s23 , s3, s9 + + add AO , AO, #16 + add BO , BO, #8 + +.endm + +.macro SAVE4x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + flds s8 , [CO1] + flds s9 , [CO1, #4 ] + flds s10, [CO1, #8 ] + flds s11, [CO1, #12 ] + + fmacs s8 , s0 , s16 + fmacs s9 , s0 , s17 + fmacs s10, s0 , s18 + fmacs s11, s0 , s19 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + fsts s10, [CO1, #8 ] + fsts s11, [CO1, #12 ] + + flds s12, [CO2] + flds s13, [CO2, #4 ] + flds s14, [CO2, #8 ] + flds s15, [CO2, #12 ] + + fmacs s12, s0 , s20 + fmacs s13, s0 , s21 + fmacs s14, s0 , s22 + fmacs s15, s0 , s23 + + fsts s12, [CO2] + fsts s13, [CO2, #4 ] + fsts s14, [CO2, #8 ] + fsts s15, [CO2, #12 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT2x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + +.endm + + + +.macro KERNEL2x2_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + + fmacs s20 , s0, s9 + fmacs s21 , s1, s9 + + add AO , AO, #8 + add BO , BO, #8 + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + flds s8 , [CO1] + flds s9 
, [CO1, #4 ] + + fmacs s8 , s0 , s16 + fmacs s9 , s0 , s17 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + + flds s12, [CO2] + flds s13, [CO2, #4 ] + + fmacs s12, s0 , s20 + fmacs s13, s0 , s21 + + fsts s12, [CO2] + fsts s13, [CO2, #4 ] + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s20, s16 + +.endm + + + +.macro KERNEL1x2_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + flds s0 , [ AO ] + fmacs s16 , s0, s8 + fmacs s20 , s0, s9 + + add AO , AO, #4 + add BO , BO, #8 + +.endm + +.macro SAVE1x2 // C += alpha * acc for 1 row x 2 columns (s16 -> [CO1], s20 -> [CO1 + LDC]) + + ldr r3 , LDC + add CO2 , CO1, r3 // CO2 = next column of C + + flds s0, ALPHA + + flds s8 , [CO1] + fmacs s8 , s0 , s16 + fsts s8 , [CO1] + + flds s12, [CO2] + fmacs s12, s0 , s20 + fsts s12, [CO2] + + add CO1, CO1, #4 // advance by one float (one row), as in SAVE1x4 / SAVE1x1 + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + +.endm + + + +.macro KERNEL4x1_SUB + + flds s8 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + fmacs s18 , s2, s8 + fmacs s19 , s3, s8 + + add AO , AO, #16 + add BO , BO, #4 + +.endm + +.macro SAVE4x1 + + + flds s0, ALPHA + + flds s8 , [CO1] + flds s9 , [CO1, #4 ] + flds s10, [CO1, #8 ] + flds s11, [CO1, #12 ] + + fmacs s8 , s0 , s16 + fmacs s9 , s0 , s17 + fmacs s10, s0 , s18 + fmacs s11, s0 , s19 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + fsts s10, [CO1, #8 ] + fsts s11, [CO1, #12 ] + + add CO1, CO1, #16 + +.endm + + + + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + +.endm + + + +.macro KERNEL2x1_SUB + + flds s8 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s16
, s0, s8 + fmacs s17 , s1, s8 + + add AO , AO, #8 + add BO , BO, #4 + +.endm + +.macro SAVE2x1 + + + flds s0, ALPHA + + flds s8 , [CO1] + flds s9 , [CO1, #4 ] + + fmacs s8 , s0 , s16 + fmacs s9 , s0 , s17 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f32 s16 , s16 , s16 + +.endm + + + +.macro KERNEL1x1_SUB + + flds s8 , [ BO ] + + flds s0 , [ AO ] + + fmacs s16 , s0, s8 + + add AO , AO, #4 + add BO , BO, #4 + +.endm + +.macro SAVE1x1 + + + flds s0, ALPHA + + flds s8 , [CO1] + fmacs s8 , s0 , s16 + fsts s8 , [CO1] + + add CO1, CO1, #4 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { s8 - s31} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #2 // ldc = ldc * 4 + str r3, LDC + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #2 // J = J / 4 + ble _L2_BEGIN + +_L4_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #2 // LDC * 4 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L4_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L4_M2_BEGIN + +_L4_M4_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L4_M4_30 + .align 5 + + + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + sub L, L, #2 + +_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + 
subs L, L, #1 + bgt _L4_M4_22 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + b _L4_M4_44 + + +_L4_M4_30: + tst L, #3 + ble _L4_M4_40 + + tst L, #2 + ble _L4_M4_32 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + b _L4_M4_44 + +_L4_M4_32: + + tst L, #1 + ble _L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + b _L4_M4_44 + + +_L4_M4_40: + + INIT4x4 + + +_L4_M4_44: + + ands L , K1, #7 // L = L % 8 + ble _L4_M4_100 + +_L4_M4_46: + + KERNEL4x4_SUB + + subs L, L, #1 + bne _L4_M4_46 + +_L4_M4_100: + + SAVE4x4 + +_L4_M4_END: + + subs I, I, #1 + bne _L4_M4_20 + + +_L4_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L4_END + + tst I, #2 // I = I / 2 + ble _L4_M1_BEGIN + +_L4_M2_20: + + INIT2x4 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L4_M2_40 + +_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs L, L, #1 + bgt _L4_M2_22 + + +_L4_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L4_M2_100 + +_L4_M2_42: + + KERNEL2x4_SUB + + subs L, L, #1 + bgt _L4_M2_42 + +_L4_M2_100: + + SAVE2x4 + +_L4_M2_END: + + +_L4_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L4_END + +_L4_M1_20: + + INIT1x4 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L4_M1_40 + +_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs L, L, #1 + bgt _L4_M1_22 + + +_L4_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L4_M1_100 + +_L4_M1_42: + + KERNEL1x4_SUB + + subs L, L, #1 + bgt _L4_M1_42 + +_L4_M1_100: + + SAVE1x4 + + +_L4_END: + + mov r3, BC + mov r4, K1 
+ lsl r4, r4, #4 // k * 4 * 4 + add r3, r3, r4 // B = B + K * 4 * 4 + mov BC, r3 + + subs J , #1 // j-- + bgt _L4_BEGIN + + + +/*********************************************************************************************/ + +_L2_BEGIN: + + ldr J , N + tst J , #3 + ble _L999 + + tst J , #2 + ble _L1_BEGIN + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +_L2_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L2_M2_BEGIN + +_L2_M4_20: + + INIT4x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L2_M4_40 + .align 5 + +_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_22 + + +_L2_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M4_100 + +_L2_M4_42: + + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_42 + +_L2_M4_100: + + SAVE4x2 + +_L2_M4_END: + + subs I, I, #1 + bgt _L2_M4_20 + + +_L2_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L2_END + + tst I, #2 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + + INIT2x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L2_M2_40 + +_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_22 + + +_L2_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_42: + + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_42 + +_L2_M2_100: + + SAVE2x2 + +_L2_M2_END: + + +_L2_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // 
L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + + +_L2_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #3 // k * 2 * 4 + add r3, r3, r4 // B = B + K * 2 * 4 + mov BC, r3 + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +_L1_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L1_M2_BEGIN + +_L1_M4_20: + + INIT4x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L1_M4_40 + .align 5 + +_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_22 + + +_L1_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M4_100 + +_L1_M4_42: + + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_42 + +_L1_M4_100: + + SAVE4x1 + +_L1_M4_END: + + subs I, I, #1 + bgt _L1_M4_20 + + +_L1_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L1_END + + tst I, #2 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + + INIT2x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L1_M2_40 + +_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_22 + + +_L1_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_42: + + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_42 + +_L1_M2_100: + + SAVE2x1 + +_L1_M2_END: + + +_L1_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, 
L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +_L1_END: + + +_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s31} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/strmm_kernel_4x4_vfpv3.S b/kernel/arm/strmm_kernel_4x4_vfpv3.S new file mode 100644 index 000000000..15c866856 --- /dev/null +++ b/kernel/arm/strmm_kernel_4x4_vfpv3.S @@ -0,0 +1,1884 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/10/14 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA s0 + +/****************************************************** +* [fp, #-128] - [fp, #-32] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define KK [fp, #-244 ] +#define KKK [fp, #-248] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x4 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + 
vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s22, s16 + vmov.f32 s23, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s26, s16 + vmov.f32 s27, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + vmov.f32 s30, s16 + vmov.f32 s31, s16 + +.endm + +.macro KERNEL4x4_I + + fldmias AO!, { s0 - s1 } + pld [ AO , #A_PRE-8 ] + fldmias BO!, { s8 - s9 } + pld [ BO , #B_PRE-8 ] + + fmuls s16 , s0, s8 + fldmias AO!, { s2 - s3 } + fmuls s17 , s1, s8 + fmuls s18 , s2, s8 + fldmias BO!, { s10 - s11 } + fmuls s19 , s3, s8 + + fmuls s20 , s0, s9 + fldmias AO!, { s4 - s5 } + fmuls s21 , s1, s9 + fmuls s22 , s2, s9 + fldmias AO!, { s6 - s7 } + fmuls s23 , s3, s9 + + fmuls s24 , s0, s10 + fldmias BO!, { s12 - s13 } + fmuls s25 , s1, s10 + fmuls s26 , s2, s10 + fldmias BO!, { s14 - s15 } + fmuls s27 , s3, s10 + + fmuls s28 , s0, s11 + fmuls s29 , s1, s11 + fmuls s30 , s2, s11 + fmuls s31 , s3, s11 + +.endm + + +.macro KERNEL4x4_M2 + + pld [ AO , #A_PRE ] + fmacs s16 , s4, s12 + fmacs s17 , s5, s12 + fldmias AO!, { s0 - s1 } + fmacs s18 , s6, s12 + pld [ BO , #B_PRE ] + fmacs s19 , s7, s12 + + fmacs s20 , s4, s13 + fldmias AO!, { s2 - s3 } + fmacs s21 , s5, s13 + fmacs s22 , s6, s13 + fldmias BO!, { s8 - s9 } + fmacs s23 , s7, s13 + + fmacs s24 , s4, s14 + fldmias BO!, { s10 - s11 } + fmacs s25 , s5, s14 + fmacs s26 , s6, s14 + fmacs s27 , s7, s14 + + fmacs s28 , s4, s15 + fmacs s29 , s5, s15 + fmacs s30 , s6, s15 + fmacs s31 , s7, s15 + +.endm + + +.macro KERNEL4x4_M1 + + fmacs s16 , s0, s8 + fldmias AO!, { s4 - s5 } + fmacs s17 , s1, s8 + fmacs s18 , s2, s8 + fldmias AO!, { s6 - s7 } + fmacs s19 , s3, s8 + + fmacs s20 , s0, s9 + fldmias BO!, { s12 - s13 } + fmacs s21 , s1, s9 + fmacs s22 , s2, s9 + fldmias BO!, { s14 - s15 } + fmacs s23 , s3, s9 + + fmacs s24 , s0, s10 + fmacs s25 , s1, s10 + fmacs s26 , s2, s10 + fmacs s27 , s3, s10 + + fmacs s28 , s0, s11 + fmacs s29 , s1, s11 + fmacs s30 , s2, s11 + fmacs s31 , s3, s11 + +.endm + + + 
+.macro KERNEL4x4_E + + fmacs s16 , s4, s12 + fmacs s17 , s5, s12 + fmacs s18 , s6, s12 + fmacs s19 , s7, s12 + + fmacs s20 , s4, s13 + fmacs s21 , s5, s13 + fmacs s22 , s6, s13 + fmacs s23 , s7, s13 + + fmacs s24 , s4, s14 + fmacs s25 , s5, s14 + fmacs s26 , s6, s14 + fmacs s27 , s7, s14 + + fmacs s28 , s4, s15 + fmacs s29 , s5, s15 + fmacs s30 , s6, s15 + fmacs s31 , s7, s15 + +.endm + + + + +.macro KERNEL4x4_SUB + + flds s8 , [ BO ] + pld [ BO , #B_PRE ] + + flds s0 , [ AO ] + pld [ AO , #A_PRE ] + flds s1 , [ AO, #4 ] + + fmacs s16 , s0, s8 + flds s2 , [ AO, #8 ] + fmacs s17 , s1, s8 + flds s3 , [ AO, #12 ] + fmacs s18 , s2, s8 + flds s9 , [ BO, #4 ] + fmacs s19 , s3, s8 + + flds s10, [ BO, #8 ] + fmacs s20 , s0, s9 + flds s11, [ BO, #12 ] + fmacs s21 , s1, s9 + fmacs s22 , s2, s9 + fmacs s23 , s3, s9 + + fmacs s24 , s0, s10 + fmacs s25 , s1, s10 + fmacs s26 , s2, s10 + fmacs s27 , s3, s10 + + fmacs s28 , s0, s11 + fmacs s29 , s1, s11 + add AO , AO, #16 + fmacs s30 , s2, s11 + add BO , BO, #16 + fmacs s31 , s3, s11 + +.endm + +.macro SAVE4x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + flds s0, ALPHA + add r4 , CO2, r3 + + + fmuls s8 , s0 , s16 + fmuls s9 , s0 , s17 + fmuls s10, s0 , s18 + fmuls s11, s0 , s19 + + fmuls s12, s0 , s20 + fsts s8 , [CO1] + fmuls s13, s0 , s21 + fsts s9 , [CO1, #4 ] + fmuls s14, s0 , s22 + fsts s10, [CO1, #8 ] + fmuls s15, s0 , s23 + fsts s11, [CO1, #12 ] + + + fmuls s8 , s0 , s24 + fsts s12, [CO2] + fmuls s9 , s0 , s25 + fsts s13, [CO2, #4 ] + fmuls s10, s0 , s26 + fsts s14, [CO2, #8 ] + fmuls s11, s0 , s27 + fsts s15, [CO2, #12 ] + + add CO2, r4 , r3 + + fsts s8 , [r4 ] + fmuls s12, s0 , s28 + fsts s9 , [r4 , #4 ] + fmuls s13, s0 , s29 + fsts s10, [r4 , #8 ] + fmuls s14, s0 , s30 + fsts s11, [r4 , #12 ] + fmuls s15, s0 , s31 + + fstmias CO2, { s12 - s15 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT2x4 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + 
vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + +.endm + + + +.macro KERNEL2x4_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + + fmacs s20 , s0, s9 + fmacs s21 , s1, s9 + + fmacs s24 , s0, s10 + fmacs s25 , s1, s10 + + fmacs s28 , s0, s11 + fmacs s29 , s1, s11 + add AO , AO, #8 + add BO , BO, #16 + +.endm + +.macro SAVE2x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + add r4 , CO2, r3 + + flds s0, ALPHA + + fmuls s8 , s0 , s16 + fmuls s9 , s0 , s17 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + + fmuls s12, s0 , s20 + fmuls s13, s0 , s21 + + fsts s12, [CO2] + fsts s13, [CO2, #4 ] + + + fmuls s8 , s0 , s24 + fmuls s9 , s0 , s25 + + fsts s8 , [r4 ] + fsts s9 , [r4 , #4 ] + + add CO2, r4 , r3 + + fmuls s12, s0 , s28 + fmuls s13, s0 , s29 + + fsts s12, [CO2] + fsts s13, [CO2, #4 ] + + add CO1, CO1, #8 + +.endm + + +/******************************************************************************/ + +.macro INIT1x4 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s20, s16 + vmov.f32 s24, s16 + vmov.f32 s28, s16 + +.endm + + + +.macro KERNEL1x4_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + flds s0 , [ AO ] + + fmacs s16 , s0, s8 + fmacs s20 , s0, s9 + fmacs s24 , s0, s10 + fmacs s28 , s0, s11 + + add AO , AO, #4 + add BO , BO, #16 + +.endm + +.macro SAVE1x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + add r4 , CO2, r3 + + flds s0, ALPHA + + fmuls s8 , s0 , s16 + fsts s8 , [CO1] + + fmuls s12, s0 , s20 + fsts s12, [CO2] + + fmuls s8 , s0 , s24 + fsts s8 , [r4 ] + + add CO2, r4 , r3 + + fmuls s12, s0 , s28 + fsts s12, [CO2] + + add CO1, CO1, #4 + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x2 + + 
vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s22, s16 + vmov.f32 s23, s16 + +.endm + + + +.macro KERNEL4x2_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + fmacs s18 , s2, s8 + fmacs s19 , s3, s8 + + fmacs s20 , s0, s9 + fmacs s21 , s1, s9 + fmacs s22 , s2, s9 + fmacs s23 , s3, s9 + + add AO , AO, #16 + add BO , BO, #8 + +.endm + +.macro SAVE4x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + fmuls s8 , s0 , s16 + fmuls s9 , s0 , s17 + fmuls s10, s0 , s18 + fmuls s11, s0 , s19 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + fsts s10, [CO1, #8 ] + fsts s11, [CO1, #12 ] + + fmuls s12, s0 , s20 + fmuls s13, s0 , s21 + fmuls s14, s0 , s22 + fmuls s15, s0 , s23 + + fsts s12, [CO2] + fsts s13, [CO2, #4 ] + fsts s14, [CO2, #8 ] + fsts s15, [CO2, #12 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT2x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + +.endm + + + +.macro KERNEL2x2_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + + fmacs s20 , s0, s9 + fmacs s21 , s1, s9 + + add AO , AO, #8 + add BO , BO, #8 + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + fmuls s8 , s0 , s16 + fmuls s9 , s0 , s17 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + + fmuls s12, s0 , s20 + fmuls s13, s0 , s21 + + fsts s12, [CO2] + fsts s13, [CO2, #4 ] + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s20, s16 + +.endm + + + +.macro KERNEL1x2_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + flds 
s0 , [ AO ] + fmacs s16 , s0, s8 + fmacs s20 , s0, s9 + + add AO , AO, #4 + add BO , BO, #8 + +.endm + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + fmuls s8 , s0 , s16 + fsts s8 , [CO1] + + fmuls s12, s0 , s20 + fsts s12, [CO2] + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + +.endm + + + +.macro KERNEL4x1_SUB + + flds s8 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + fmacs s18 , s2, s8 + fmacs s19 , s3, s8 + + add AO , AO, #16 + add BO , BO, #4 + +.endm + +.macro SAVE4x1 + + + flds s0, ALPHA + + fmuls s8 , s0 , s16 + fmuls s9 , s0 , s17 + fmuls s10, s0 , s18 + fmuls s11, s0 , s19 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + fsts s10, [CO1, #8 ] + fsts s11, [CO1, #12 ] + + add CO1, CO1, #16 + +.endm + + + + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + +.endm + + + +.macro KERNEL2x1_SUB + + flds s8 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + + add AO , AO, #8 + add BO , BO, #4 + +.endm + +.macro SAVE2x1 + + + flds s0, ALPHA + + fmuls s8 , s0 , s16 + fmuls s9 , s0 , s17 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f32 s16 , s16 , s16 + +.endm + + + +.macro KERNEL1x1_SUB + + flds s8 , [ BO ] + + flds s0 , [ AO ] + + fmacs s16 , s0, s8 + + add AO , AO, #4 + add BO , BO, #4 + +.endm + +.macro SAVE1x1 + + + flds s0, ALPHA + + fmuls s8 , s0 , s16 + fsts s8 , [CO1] + + add CO1, CO1, #4 + +.endm + + + 
+ + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { s8 - s31} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #2 // ldc = ldc * 4 + str r3, LDC + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif + str r3 , KK + + ldr BC, B + + ldr J, N + asrs J, J, #2 // J = J / 4 + ble _L2_BEGIN + +_L4_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #2 // LDC * 4 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L4_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L4_M2_BEGIN + +_L4_M4_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 4 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #4 // number of values in AO +#else + add K1, K1, #4 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L4_M4_30 + .align 5 + + + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + sub L, L, #2 + +_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs 
L, L, #1 + bgt _L4_M4_22 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + b _L4_M4_44 + + +_L4_M4_30: + tst L, #3 + ble _L4_M4_40 + + tst L, #2 + ble _L4_M4_32 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + b _L4_M4_44 + +_L4_M4_32: + + tst L, #1 + ble _L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + b _L4_M4_44 + + +_L4_M4_40: + + INIT4x4 + + +_L4_M4_44: + + ands L , K1, #7 // L = L % 8 + ble _L4_M4_100 + +_L4_M4_46: + + KERNEL4x4_SUB + + subs L, L, #1 + bne _L4_M4_46 + +_L4_M4_100: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 4 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + + +_L4_M4_END: + + subs I, I, #1 + bne _L4_M4_20 + + +_L4_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L4_END + + tst I, #2 // I = I / 2 + ble _L4_M1_BEGIN + +_L4_M2_20: + + INIT2x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 4 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #4 // number of values in BO +#endif + str K1, KKK 
+#endif + + + asrs L , K1, #3 // L = L / 8 + ble _L4_M2_40 + +_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs L, L, #1 + bgt _L4_M2_22 + + +_L4_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L4_M2_100 + +_L4_M2_42: + + KERNEL2x4_SUB + + subs L, L, #1 + bgt _L4_M2_42 + +_L4_M2_100: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 4 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L4_M2_END: + + +_L4_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L4_END + +_L4_M1_20: + + INIT1x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 4 float values + add BO , BO , r4 + lsls r4 , r3 , #2 // 1 float value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #4 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + ble _L4_M1_40 + +_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs L, L, #1 + bgt _L4_M1_22 + + +_L4_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L4_M1_100 + +_L4_M1_42: + + KERNEL1x4_SUB + + subs L, L, #1 + bgt _L4_M1_42 + +_L4_M1_100: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 4 float 
values + add BO , BO , r4 + lsls r4 , r3 , #2 // 1 float value + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L4_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #4 // k * 4 * 4 + add r3, r3, r4 // B = B + K * 4 * 4 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in BO + str r3 , KK +#endif + + subs J , #1 // j-- + bgt _L4_BEGIN + + + +/*********************************************************************************************/ + +_L2_BEGIN: + + ldr J , N + tst J , #3 + ble _L999 + + tst J , #2 + ble _L1_BEGIN + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +_L2_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L2_M2_BEGIN + +_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #4 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + ble _L2_M4_40 + .align 5 + +_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_22 + + +_L2_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M4_100 + +_L2_M4_42: + + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_42 + 
+_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + +_L2_M4_END: + + subs I, I, #1 + bgt _L2_M4_20 + + +_L2_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L2_END + + tst I, #2 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + ble _L2_M2_40 + +_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_22 + + +_L2_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_42: + + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_42 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L2_M2_END: + + +_L2_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + 
INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #2 // 1 float value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #2 // 1 float value + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #3 // k * 2 * 4 + add r3, r3, r4 // B = B + K * 2 * 4 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in BO + str r3 , KK +#endif + + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +_L1_M4_BEGIN: + + ldr I, M 
+ asrs I, I, #2 // I = I / 4 + ble _L1_M2_BEGIN + +_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #4 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + ble _L1_M4_40 + .align 5 + +_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_22 + + +_L1_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M4_100 + +_L1_M4_42: + + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_42 + +_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + + +_L1_M4_END: + + subs I, I, #1 + bgt _L1_M4_20 + + +_L1_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L1_END + + tst I, #2 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK 
+ sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + ble _L1_M2_40 + +_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_22 + + +_L1_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_42: + + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_42 + +_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L1_M2_END: + + +_L1_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #2 // 1 float value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: 
+ + SAVE1x1 + + +_L1_END: + + +_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s31} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + From 85484a42df31257592161ebac7cda80f25133547 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 16 Oct 2013 18:00:41 +0200 Subject: [PATCH 13/81] added kernels for cgemm, ctrmm, zgemm and ztrmm --- kernel/arm/KERNEL.ARMV7 | 35 +- kernel/arm/cgemm_kernel_2x2_vfpv3.S | 1293 ++++++++++++++++++++++ kernel/arm/ctrmm_kernel_2x2_vfpv3.S | 1476 +++++++++++++++++++++++++ kernel/arm/zgemm_kernel_2x2_vfpv3.S | 1329 +++++++++++++++++++++++ kernel/arm/ztrmm_kernel_2x2_vfpv3.S | 1538 +++++++++++++++++++++++++++ 5 files changed, 5656 insertions(+), 15 deletions(-) create mode 100644 kernel/arm/cgemm_kernel_2x2_vfpv3.S create mode 100644 kernel/arm/ctrmm_kernel_2x2_vfpv3.S create mode 100644 kernel/arm/zgemm_kernel_2x2_vfpv3.S create mode 100644 kernel/arm/ztrmm_kernel_2x2_vfpv3.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 8c69ad5cf..43153798e 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -80,36 +80,41 @@ DGEMVTKERNEL = gemv_t.c CGEMVTKERNEL = zgemv_t.c ZGEMVTKERNEL = zgemv_t.c -STRMMKERNEL = ../generic/trmmkernel_2x2.c -DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S -CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +STRMMKERNEL = strmm_kernel_4x4_vfpv3.S +DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S +CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S +ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S -SGEMMKERNEL = ../generic/gemmkernel_2x2.c -SGEMMONCOPY = ../generic/gemm_ncopy_2.c -SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +#SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o 
#DGEMMKERNEL = ../generic/gemmkernel_2x2.c #DGEMMKERNEL = dgemm_kernel_4x2_vfpv2.S DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S -DGEMMINCOPY = -DGEMMITCOPY = -DGEMMONCOPY = ../generic/gemm_ncopy_4.c -DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMINCOPYOBJ = -DGEMMITCOPYOBJ = +DGEMMINCOPY = dgemm_ncopy_4_vfpv3.S +DGEMMITCOPY = ../generic/gemm_tcopy_4.c +DGEMMONCOPY = dgemm_ncopy_4_vfpv3.S +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o -ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMONCOPYOBJ = zgemm_oncopy.o diff --git a/kernel/arm/cgemm_kernel_2x2_vfpv3.S b/kernel/arm/cgemm_kernel_2x2_vfpv3.S new file mode 100644 index 000000000..abbbac831 --- /dev/null +++ b/kernel/arm/cgemm_kernel_2x2_vfpv3.S @@ -0,0 +1,1293 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/10/16 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R s0 +#define OLD_ALPHA_I s1 + +/****************************************************** +* [fp, #-128] - [fp, #-32] is reserved +* for store and restore of floating point +* registers s8 - s31 (24 * 4 bytes) +*******************************************************/ + +#define A [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define ALPHA_R [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO
r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define FADD_R fsubs + #define FADD_I fadds + + #define FMAC_R1 fnmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fnmacs + +#elif defined(CN) || defined(CT) + + #define FADD_R fadds + #define FADD_I fsubs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#elif defined(NC) || defined(TC) + + #define FADD_R fadds + #define FADD_I fsubs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#else + + #define FADD_R fsubs + #define FADD_I fadds + + #define FMAC_R1 fnmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fnmacs + +#endif + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT2x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s22, s16 + vmov.f32 s23, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s26, s16 + vmov.f32 s27, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + vmov.f32 s30, s16 + vmov.f32 s31, s16 + +.endm + +.macro KERNEL2x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldmias AO!, { s0 - s1 } + fldmias BO!, { s8 - s9 } + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fldmias AO!, { s2 - s3 } + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + fldmias BO!, { s10 - s11 } + fmuls s18 , s2, s8 + fmuls s26 , s3, s9 + fldmias AO!, { s4 - s5 } + fmuls s19 , s2, s9 + fmuls s27 , s3, s8 + + fldmias BO!, { s12 - s13 } + fmuls s20 , s0, s10 + fmuls s28 , s1, s11 + fldmias AO!, { s6 - s7 } + fmuls s21 , s0, s11 + fmuls s29 , s1, s10 + + fldmias BO!, { s14 - s15 
} + fmuls s22 , s2, s10 + fmuls s30 , s3, s11 + fmuls s23 , s2, s11 + fmuls s31 , s3, s10 + +.endm + + + +.macro KERNEL2x2_M1 + + fmacs s16 , s0, s8 + fldmias AO!, { s4 - s5 } + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fldmias BO!, { s12 - s13 } + fmacs s25 , s1, s8 + + fmacs s18 , s2, s8 + fldmias AO!, { s6 - s7 } + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fldmias BO!, { s14 - s15 } + fmacs s27 , s3, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + fmacs s22 , s2, s10 + fmacs s30 , s3, s11 + fmacs s23 , s2, s11 + fmacs s31 , s3, s10 + +.endm + +.macro KERNEL2x2_M2 + pld [ AO , #A_PRE ] + + fmacs s16 , s4, s12 + pld [ BO , #B_PRE ] + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fldmias AO!, { s0 - s1 } + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fldmias BO!, { s8 - s9 } + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + + fldmias AO!, { s2 - s3 } + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fldmias BO!, { s10 - s11 } + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + + fmacs s22 , s6, s14 + fmacs s30 , s7, s15 + fmacs s23 , s6, s15 + fmacs s31 , s7, s14 + +.endm + + +.macro KERNEL2x2_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + + fmacs s22 , s6, s14 + fmacs s30 , s7, s15 + fmacs s23 , s6, s15 + fmacs s31 , s7, s14 + +.endm + +.macro KERNEL2x2_SUB + + fldmias AO!, { s0 - s1 } + fldmias BO!, { s8 - s9 } + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fldmias AO!, { s2 - s3 } + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fldmias BO!, { s10 - s11 } + fmacs s18 , s2, s8 + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fmacs s27 , s3, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + fmacs s22 , s2, s10 + fmacs s30 , s3, s11 + fmacs s23 , s2, 
s11 + fmacs s31 , s3, s10 + +.endm + + + + +.macro SAVE2x2 + pld [ CO1 , #C_PRE ] + + ldr r3 , LDC + add CO2 , CO1, r3 + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s7 } + fldmias CO2, { s8 - s11 } + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + FADD_R s18, s26 , s18 + FADD_I s19, s27 , s19 + FADD_R s20, s28 , s20 + FADD_I s21, s29 , s21 + FADD_R s22, s30 , s22 + FADD_I s23, s31 , s23 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + FMAC_R1 s6 , s0 , s18 + FMAC_I1 s7 , s0 , s19 + FMAC_R2 s6 , s1 , s19 + FMAC_I2 s7 , s1 , s18 + + FMAC_R1 s8 , s0 , s20 + FMAC_I1 s9 , s0 , s21 + FMAC_R2 s8 , s1 , s21 + FMAC_I2 s9 , s1 , s20 + + FMAC_R1 s10, s0 , s22 + FMAC_I1 s11, s0 , s23 + FMAC_R2 s10, s1 , s23 + FMAC_I2 s11, s1 , s22 + + fstmias CO1, { s4 - s7 } + fstmias CO2, { s8 - s11 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + +.endm + +.macro KERNEL1x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + fmuls s20 , s0, s10 + fmuls s28 , s1, s11 + fmuls s21 , s0, s11 + fmuls s29 , s1, s10 + + add BO , BO, #16 + add AO , AO, #8 + + pld [ BO , #B_PRE ] + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + flds s14, [ BO, #8 ] + flds s15, [ BO, #12 ] + + add BO , BO, #16 + add AO , AO, #8 +.endm + + + +.macro KERNEL1x2_M1 + pld [ BO , #B_PRE ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs 
s29 , s1, s10 + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + flds s14, [ BO, #8 ] + flds s15, [ BO, #12 ] + + add BO , BO, #16 + add AO , AO, #8 +.endm + +.macro KERNEL1x2_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + + flds s0 , [ AO, #0 ] + flds s1 , [ AO, #4 ] + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + add BO , BO, #16 + add AO , AO, #8 +.endm + + +.macro KERNEL1x2_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + +.endm + +.macro KERNEL1x2_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + add BO , BO, #16 + add AO , AO, #8 + +.endm + + + + +.macro SAVE1x2 + pld [ CO1 , #C_PRE ] + + ldr r3 , LDC + add CO2 , CO1, r3 + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s5 } + fldmias CO2, { s8 - s9 } + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + FADD_R s20, s28 , s20 + FADD_I s21, s29 , s21 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + FMAC_R1 s8 , s0 , s20 + FMAC_I1 s9 , s0 , s21 + FMAC_R2 s8 , s1 , s21 + FMAC_I2 s9 , s1 , s20 + + fstmias CO1, { s4 - s5 } + fstmias CO2, { s8 - s9 } + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + 
vmov.f32 s19, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s26, s16 + vmov.f32 s27, s16 + +.endm + +.macro KERNEL2x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + fmuls s18 , s2, s8 + fmuls s26 , s3, s9 + fmuls s19 , s2, s9 + fmuls s27 , s3, s8 + + add BO , BO, #8 + add AO , AO, #16 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + flds s6 , [ AO, #8 ] + flds s7 , [ AO, #12 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #16 +.endm + + + +.macro KERNEL2x1_M1 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s18 , s2, s8 + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fmacs s27 , s3, s8 + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + flds s6 , [ AO, #8 ] + flds s7 , [ AO, #12 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #16 +.endm + +.macro KERNEL2x1_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + + flds s0 , [ AO, #0 ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #16 +.endm + + +.macro KERNEL2x1_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + +.endm + +.macro KERNEL2x1_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + flds s8 , [ BO ] + 
flds s9 , [ BO, #4 ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s18 , s2, s8 + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fmacs s27 , s3, s8 + + add BO , BO, #8 + add AO , AO, #16 + +.endm + + + + +.macro SAVE2x1 + pld [ CO1 , #C_PRE ] + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s7 } + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + FADD_R s18, s26 , s18 + FADD_I s19, s27 , s19 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + FMAC_R1 s6 , s0 , s18 + FMAC_I1 s7 , s0 , s19 + FMAC_R2 s6 , s1 , s19 + FMAC_I2 s7 , s1 , s18 + + fstmias CO1, { s4 - s7 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + +.endm + +.macro KERNEL1x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + add BO , BO, #8 + add AO , AO, #8 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #8 +.endm + + + +.macro KERNEL1x1_M1 + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #8 +.endm + +.macro KERNEL1x1_M2 + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + flds s0 , [ AO, #0 ] + flds s1 , [ AO, #4 ] + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #8 +.endm + + +.macro KERNEL1x1_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + +.endm + 
+.macro KERNEL1x1_SUB + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + add BO , BO, #8 + add AO , AO, #8 + +.endm + + + + +.macro SAVE1x1 + pld [ CO1 , #C_PRE ] + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s5 } + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + fstmias CO1, { s4 - s5 } + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, ALPHA_I + + sub r3, fp, #128 + vstm r3, { s8 - s31} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 4 * 2 + str r3, LDC + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 ; NOTE(review): asrs does not update V and ble tests (Z set or N != V), so this branch also depends on the V flag inherited from the caller - confirm, cmp J, #0 before ble would be safer + ble _L1_BEGIN + +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L2_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L2_M2_30 + .align 5 + + + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L,
#1 + bgt _L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_30: + tst L, #3 + ble _L2_M2_40 + + tst L, #2 + ble _L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + +_L2_M2_32: + + tst L, #1 + ble _L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_40: + + INIT2x2 + + +_L2_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne _L2_M2_46 + +_L2_M2_100: + + SAVE2x2 + +_L2_M2_END: + + subs I, I, #1 + bne _L2_M2_20 + + +_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + + +_L2_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #4 // k * 2 * 4 * 2 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + + subs J , #1 // j-- + bgt _L2_BEGIN + + + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + +_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + 
blt _L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + +_L1_M2_22: + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, L, #1 + bgt _L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_30: + tst L, #3 + ble _L1_M2_40 + + tst L, #2 + ble _L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + +_L1_M2_32: + + tst L, #1 + ble _L1_M2_40 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_40: + + INIT2x1 + + +_L1_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, #1 + bne _L1_M2_46 + +_L1_M2_100: + + SAVE2x1 + +_L1_M2_END: + + subs I, I, #1 + bne _L1_M2_20 + + +_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +_L1_END: + + + +_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s31} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S 
b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S new file mode 100644 index 000000000..28e555caa --- /dev/null +++ b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S @@ -0,0 +1,1476 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/10/16 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R s0 +#define OLD_ALPHA_I s1 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers (TODO confirm: the range ends at [fp, #-32] if the prologue saves s8 - s31) +*******************************************************/ + +#define KKK [fp, #-240] +#define KK [fp, #-244 ] +#define A [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define ALPHA_R [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define FADD_R fsubs + #define FADD_I fadds + + #define FMAC_R1 fnmuls + #define FMAC_R2 fnmacs + #define FMAC_I1 fmuls + #define FMAC_I2 fnmacs + +#elif defined(CN) || defined(CT) + + #define FADD_R fadds + #define FADD_I fsubs + + #define FMAC_R1 fmuls + #define FMAC_R2 fmacs + #define FMAC_I1 fnmuls + #define FMAC_I2 fmacs + +#elif defined(NC) || defined(TC) + + #define FADD_R fadds + #define FADD_I fsubs + + #define FMAC_R1 fmuls + #define FMAC_R2 fnmacs + #define FMAC_I1 fmuls + #define FMAC_I2 fmacs + +#else + + #define FADD_R fsubs + #define FADD_I fadds + + #define FMAC_R1 fnmuls + #define FMAC_R2 fmacs + #define FMAC_I1 fnmuls + #define FMAC_I2 fnmacs +
+#endif + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT2x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s22, s16 + vmov.f32 s23, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s26, s16 + vmov.f32 s27, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + vmov.f32 s30, s16 + vmov.f32 s31, s16 + +.endm + +.macro KERNEL2x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldmias AO!, { s0 - s1 } + fldmias BO!, { s8 - s9 } + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fldmias AO!, { s2 - s3 } + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + fldmias BO!, { s10 - s11 } + fmuls s18 , s2, s8 + fmuls s26 , s3, s9 + fldmias AO!, { s4 - s5 } + fmuls s19 , s2, s9 + fmuls s27 , s3, s8 + + fldmias BO!, { s12 - s13 } + fmuls s20 , s0, s10 + fmuls s28 , s1, s11 + fldmias AO!, { s6 - s7 } + fmuls s21 , s0, s11 + fmuls s29 , s1, s10 + + fldmias BO!, { s14 - s15 } + fmuls s22 , s2, s10 + fmuls s30 , s3, s11 + fmuls s23 , s2, s11 + fmuls s31 , s3, s10 + +.endm + + + +.macro KERNEL2x2_M1 + + fmacs s16 , s0, s8 + fldmias AO!, { s4 - s5 } + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fldmias BO!, { s12 - s13 } + fmacs s25 , s1, s8 + + fmacs s18 , s2, s8 + fldmias AO!, { s6 - s7 } + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fldmias BO!, { s14 - s15 } + fmacs s27 , s3, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + fmacs s22 , s2, s10 + fmacs s30 , s3, s11 + fmacs s23 , s2, s11 + fmacs s31 , s3, s10 + +.endm + +.macro KERNEL2x2_M2 + pld [ AO , #A_PRE ] + + fmacs s16 , s4, s12 + pld [ BO , #B_PRE ] + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fldmias AO!, { s0 - s1 } + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fldmias BO!, { s8 - s9 } + fmacs s19 , s6, s13 + 
fmacs s27 , s7, s12 + + fldmias AO!, { s2 - s3 } + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fldmias BO!, { s10 - s11 } + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + + fmacs s22 , s6, s14 + fmacs s30 , s7, s15 + fmacs s23 , s6, s15 + fmacs s31 , s7, s14 + +.endm + + +.macro KERNEL2x2_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + + fmacs s22 , s6, s14 + fmacs s30 , s7, s15 + fmacs s23 , s6, s15 + fmacs s31 , s7, s14 + +.endm + +.macro KERNEL2x2_SUB + + fldmias AO!, { s0 - s1 } + fldmias BO!, { s8 - s9 } + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fldmias AO!, { s2 - s3 } + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fldmias BO!, { s10 - s11 } + fmacs s18 , s2, s8 + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fmacs s27 , s3, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + fmacs s22 , s2, s10 + fmacs s30 , s3, s11 + fmacs s23 , s2, s11 + fmacs s31 , s3, s10 + +.endm + + + + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + flds s0, ALPHA_R + flds s1, ALPHA_I + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + FADD_R s18, s26 , s18 + FADD_I s19, s27 , s19 + FADD_R s20, s28 , s20 + FADD_I s21, s29 , s21 + FADD_R s22, s30 , s22 + FADD_I s23, s31 , s23 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + FMAC_R1 s6 , s0 , s18 + FMAC_I1 s7 , s0 , s19 + FMAC_R2 s6 , s1 , s19 + FMAC_I2 s7 , s1 , s18 + + FMAC_R1 s8 , s0 , s20 + FMAC_I1 s9 , s0 , s21 + FMAC_R2 s8 , s1 , s21 + FMAC_I2 s9 , s1 , s20 + + FMAC_R1 s10, s0 , s22 + FMAC_I1 s11, s0 , s23 + FMAC_R2 s10, s1 , s23 + FMAC_I2 s11, s1 , s22 + + fstmias CO1, { s4 - s7 } + fstmias CO2, { s8 - s11 } + + add CO1, CO1, #16 + +.endm + 
+/******************************************************************************/ + +.macro INIT1x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + +.endm + +.macro KERNEL1x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + fmuls s20 , s0, s10 + fmuls s28 , s1, s11 + fmuls s21 , s0, s11 + fmuls s29 , s1, s10 + + add BO , BO, #16 + add AO , AO, #8 + + pld [ BO , #B_PRE ] + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + flds s14, [ BO, #8 ] + flds s15, [ BO, #12 ] + + add BO , BO, #16 + add AO , AO, #8 +.endm + + + +.macro KERNEL1x2_M1 + pld [ BO , #B_PRE ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + flds s14, [ BO, #8 ] + flds s15, [ BO, #12 ] + + add BO , BO, #16 + add AO , AO, #8 +.endm + +.macro KERNEL1x2_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + + flds s0 , [ AO, #0 ] + flds s1 , [ AO, #4 ] + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + add BO , BO, #16 + add AO , AO, #8 +.endm + + +.macro KERNEL1x2_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + +.endm + +.macro KERNEL1x2_SUB + + pld [ AO , #A_PRE 
] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + add BO , BO, #16 + add AO , AO, #8 + +.endm + + + + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + flds s0, ALPHA_R + flds s1, ALPHA_I + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + FADD_R s20, s28 , s20 + FADD_I s21, s29 , s21 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + FMAC_R1 s8 , s0 , s20 + FMAC_I1 s9 , s0 , s21 + FMAC_R2 s8 , s1 , s21 + FMAC_I2 s9 , s1 , s20 + + fstmias CO1, { s4 - s5 } + fstmias CO2, { s8 - s9 } + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s26, s16 + vmov.f32 s27, s16 + +.endm + +.macro KERNEL2x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + fmuls s18 , s2, s8 + fmuls s26 , s3, s9 + fmuls s19 , s2, s9 + fmuls s27 , s3, s8 + + add BO , BO, #8 + add AO , AO, #16 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + flds s6 , [ AO, #8 ] + flds s7 , [ AO, #12 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #16 +.endm + + + +.macro KERNEL2x1_M1 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s18 , s2, s8 + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 
+ fmacs s27 , s3, s8 + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + flds s6 , [ AO, #8 ] + flds s7 , [ AO, #12 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #16 +.endm + +.macro KERNEL2x1_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + + flds s0 , [ AO, #0 ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #16 +.endm + + +.macro KERNEL2x1_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + +.endm + +.macro KERNEL2x1_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s18 , s2, s8 + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fmacs s27 , s3, s8 + + add BO , BO, #8 + add AO , AO, #16 + +.endm + + + + +.macro SAVE2x1 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + FADD_R s18, s26 , s18 + FADD_I s19, s27 , s19 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + FMAC_R1 s6 , s0 , s18 + FMAC_I1 s7 , s0 , s19 + FMAC_R2 s6 , s1 , s19 + FMAC_I2 s7 , s1 , s18 + + fstmias CO1, { s4 - s7 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + +.endm + +.macro KERNEL1x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + 
flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + add BO , BO, #8 + add AO , AO, #8 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #8 +.endm + + + +.macro KERNEL1x1_M1 + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #8 +.endm + +.macro KERNEL1x1_M2 + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + flds s0 , [ AO, #0 ] + flds s1 , [ AO, #4 ] + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #8 +.endm + + +.macro KERNEL1x1_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + +.endm + +.macro KERNEL1x1_SUB + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + add BO , BO, #8 + add AO , AO, #8 + +.endm + + + + +.macro SAVE1x1 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + fstmias CO1, { s4 - s5 } + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, 
ALPHA_I + + sub r3, fp, #128 + vstm r3, { s8 - s31} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 4 * 2 + str r3, LDC + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif + str r3 , KK + + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble _L1_BEGIN + +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L2_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L2_M2_30 + .align 5 + + + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt _L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_30: + tst L, #3 + ble _L2_M2_40 + + tst L, #2 + ble _L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 
+ KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + +_L2_M2_32: + + tst L, #1 + ble _L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_40: + + INIT2x2 + + +_L2_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne _L2_M2_46 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + +_L2_M2_END: + + subs I, I, #1 + bne _L2_M2_20 + + +_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + 
SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #4 // k * 2 * 4 * 2 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in BO + str r3 , KK +#endif + + subs J , #1 // j-- + bgt _L2_BEGIN + + + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + +_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + +_L1_M2_22: + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + 
KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, L, #1 + bgt _L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_30: + tst L, #3 + ble _L1_M2_40 + + tst L, #2 + ble _L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + +_L1_M2_32: + + tst L, #1 + ble _L1_M2_40 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_40: + + INIT2x1 + + +_L1_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, #1 + bne _L1_M2_46 + +_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L1_M2_END: + + subs I, I, #1 + bne _L1_M2_20 + + +_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #1 // number of values 
in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +_L1_END: + + + +_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s31} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/zgemm_kernel_2x2_vfpv3.S b/kernel/arm/zgemm_kernel_2x2_vfpv3.S new file mode 100644 index 000000000..4b01f0429 --- /dev/null +++ b/kernel/arm/zgemm_kernel_2x2_vfpv3.S @@ -0,0 +1,1329 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/10/16 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R d0 +#define OLD_ALPHA_I d1 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define A [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define ALPHA_R [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define FADD_R fsubd + #define FADD_I faddd + + #define FMAC_R1 fnmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fnmacd + +#elif defined(CN) || defined(CT) + + #define FADD_R faddd + #define FADD_I 
fsubd + + #define FMAC_R1 fmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fmacd + +#elif defined(NC) || defined(TC) + + #define FADD_R faddd + #define FADD_I fsubd + + #define FMAC_R1 fmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fmacd + +#else + + #define FADD_R fsubd + #define FADD_I faddd + + #define FMAC_R1 fnmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fnmacd + +#endif + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT2x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d22, d16 + vmov.f64 d23, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d26, d16 + vmov.f64 d27, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + vmov.f64 d30, d16 + vmov.f64 d31, d16 + +.endm + +.macro KERNEL2x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmuld d16 , d0, d8 + fldd d2 , [ AO, #16 ] + fmuld d24 , d1, d9 + fldd d3 , [ AO, #24 ] + fmuld d17 , d0, d9 + fldd d10, [ BO, #16 ] + fmuld d25 , d1, d8 + + fldd d11, [ BO, #24 ] + fmuld d18 , d2, d8 + add BO , BO, #32 + fmuld d26 , d3, d9 + add AO , AO, #32 + fmuld d19 , d2, d9 + pld [ BO , #B_PRE ] + fmuld d27 , d3, d8 + + pld [ AO , #A_PRE ] + fmuld d20 , d0, d10 + fldd d4 , [ AO, #0 ] + fmuld d28 , d1, d11 + fldd d5 , [ AO, #8 ] + fmuld d21 , d0, d11 + fldd d12, [ BO ] + fmuld d29 , d1, d10 + + fldd d13, [ BO, #8 ] + fmuld d22 , d2, d10 + fldd d6 , [ AO, #16 ] + fmuld d30 , d3, d11 + fldd d7 , [ AO, #24 ] + fmuld d23 , d2, d11 + fldd d14, [ BO, #16 ] + fmuld d31 , d3, d10 + fldd d15, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #32 +.endm + + + +.macro KERNEL2x2_M1 + pld [ AO , #A_PRE ] + + fmacd d16 , d0, d8 
+ pld [ BO , #B_PRE ] + fmacd d24 , d1, d9 + fldd d4 , [ AO, #0 ] + fmacd d17 , d0, d9 + fldd d5 , [ AO, #8 ] + fmacd d25 , d1, d8 + + fldd d12, [ BO ] + fmacd d18 , d2, d8 + fldd d13, [ BO, #8 ] + fmacd d26 , d3, d9 + fldd d6 , [ AO, #16 ] + fmacd d19 , d2, d9 + fldd d7 , [ AO, #24 ] + fmacd d27 , d3, d8 + + fmacd d20 , d0, d10 + fldd d14, [ BO, #16 ] + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fldd d15, [ BO, #24 ] + fmacd d29 , d1, d10 + + fmacd d22 , d2, d10 + add BO , BO, #32 + fmacd d30 , d3, d11 + fmacd d23 , d2, d11 + add AO , AO, #32 + fmacd d31 , d3, d10 + +.endm + +.macro KERNEL2x2_M2 + pld [ AO , #A_PRE ] + + fmacd d16 , d4, d12 + pld [ BO , #B_PRE ] + fmacd d24 , d5, d13 + fldd d0 , [ AO, #0 ] + fmacd d17 , d4, d13 + fldd d1 , [ AO, #8 ] + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fldd d8 , [ BO ] + fmacd d26 , d7, d13 + fldd d9 , [ BO, #8 ] + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + + fldd d2 , [ AO, #16 ] + fmacd d20 , d4, d14 + fldd d3 , [ AO, #24 ] + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fldd d10, [ BO, #16 ] + fmacd d29 , d5, d14 + + fldd d11, [ BO, #24 ] + fmacd d22 , d6, d14 + fmacd d30 , d7, d15 + add BO , BO, #32 + fmacd d23 , d6, d15 + add AO , AO, #32 + fmacd d31 , d7, d14 + +.endm + + +.macro KERNEL2x2_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fmacd d26 , d7, d13 + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + + fmacd d20 , d4, d14 + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fmacd d29 , d5, d14 + + fmacd d22 , d6, d14 + fmacd d30 , d7, d15 + fmacd d23 , d6, d15 + fmacd d31 , d7, d14 + +.endm + +.macro KERNEL2x2_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmacd d16 , d0, d8 + fldd d2 , [ AO, #16 ] + fmacd d24 , d1, d9 + fldd d3 , [ AO, #24 ] + fmacd d17 , d0, d9 + fldd d10, [ BO, #16 ] + fmacd d25 , d1, d8 + + fldd d11, [ BO, #24 ] + fmacd d18 , d2, d8 + fmacd 
d26 , d3, d9 + fmacd d19 , d2, d9 + fmacd d27 , d3, d8 + + fmacd d20 , d0, d10 + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fmacd d29 , d1, d10 + + fmacd d22 , d2, d10 + add BO , BO, #32 + fmacd d30 , d3, d11 + fmacd d23 , d2, d11 + add AO , AO, #32 + fmacd d31 , d3, d10 + +.endm + + + + +.macro SAVE2x2 + pld [ CO1 , #C_PRE ] + + ldr r3 , LDC + add CO2 , CO1, r3 + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - d7 } + fldmiad CO2, { d8 - d11 } + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + FADD_R d18, d26 , d18 + FADD_I d19, d27 , d19 + FADD_R d20, d28 , d20 + FADD_I d21, d29 , d21 + FADD_R d22, d30 , d22 + FADD_I d23, d31 , d23 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + FMAC_R1 d6 , d0 , d18 + FMAC_I1 d7 , d0 , d19 + FMAC_R2 d6 , d1 , d19 + FMAC_I2 d7 , d1 , d18 + + FMAC_R1 d8 , d0 , d20 + FMAC_I1 d9 , d0 , d21 + FMAC_R2 d8 , d1 , d21 + FMAC_I2 d9 , d1 , d20 + + FMAC_R1 d10, d0 , d22 + FMAC_I1 d11, d0 , d23 + FMAC_R2 d10, d1 , d23 + FMAC_I2 d11, d1 , d22 + + fstmiad CO1, { d4 - d7 } + fstmiad CO2, { d8 - d11 } + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + +.endm + +.macro KERNEL1x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + fmuld d16 , d0, d8 + fmuld d24 , d1, d9 + fmuld d17 , d0, d9 + fmuld d25 , d1, d8 + + fmuld d20 , d0, d10 + fmuld d28 , d1, d11 + fmuld d21 , d0, d11 + fmuld d29 , d1, d10 + + add BO , BO, #32 + add AO , AO, #16 + + pld [ BO , #B_PRE ] + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + fldd d14, [ BO, #16 ] + fldd d15, [ BO, #24 ] + + 
add BO , BO, #32 + add AO , AO, #16 +.endm + + + +.macro KERNEL1x2_M1 + pld [ BO , #B_PRE ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d20 , d0, d10 + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fmacd d29 , d1, d10 + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + fldd d14, [ BO, #16 ] + fldd d15, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #16 +.endm + +.macro KERNEL1x2_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d20 , d4, d14 + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fmacd d29 , d5, d14 + + fldd d0 , [ AO, #0 ] + fldd d1 , [ AO, #8 ] + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #16 +.endm + + +.macro KERNEL1x2_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d20 , d4, d14 + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fmacd d29 , d5, d14 + +.endm + +.macro KERNEL1x2_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d20 , d0, d10 + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fmacd d29 , d1, d10 + + add BO , BO, #32 + add AO , AO, #16 + +.endm + + + + +.macro SAVE1x2 + pld [ CO1 , #C_PRE ] + + ldr r3 , LDC + add CO2 , CO1, r3 + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - d5 } + fldmiad CO2, { d8 - d9 } + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + FADD_R d20, d28 , d20 + FADD_I d21, d29 , d21 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + FMAC_R1 d8 , d0 , d20 + FMAC_I1 d9 , d0 , d21 + FMAC_R2 d8 , d1 , d21 + FMAC_I2 d9 , d1 , 
d20 + + fstmiad CO1, { d4 - d5 } + fstmiad CO2, { d8 - d9 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d26, d16 + vmov.f64 d27, d16 + +.endm + +.macro KERNEL2x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmuld d16 , d0, d8 + fmuld d24 , d1, d9 + fmuld d17 , d0, d9 + fmuld d25 , d1, d8 + + fmuld d18 , d2, d8 + fmuld d26 , d3, d9 + fmuld d19 , d2, d9 + fmuld d27 , d3, d8 + + add BO , BO, #16 + add AO , AO, #32 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + fldd d6 , [ AO, #16 ] + fldd d7 , [ AO, #24 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #32 +.endm + + + +.macro KERNEL2x1_M1 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d18 , d2, d8 + fmacd d26 , d3, d9 + fmacd d19 , d2, d9 + fmacd d27 , d3, d8 + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + fldd d6 , [ AO, #16 ] + fldd d7 , [ AO, #24 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #32 +.endm + +.macro KERNEL2x1_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fmacd d26 , d7, d13 + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + + fldd d0 , [ AO, #0 ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #32 +.endm + + +.macro KERNEL2x1_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + 
fmacd d18 , d6, d12 + fmacd d26 , d7, d13 + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + +.endm + +.macro KERNEL2x1_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d18 , d2, d8 + fmacd d26 , d3, d9 + fmacd d19 , d2, d9 + fmacd d27 , d3, d8 + + add BO , BO, #16 + add AO , AO, #32 + +.endm + + + + +.macro SAVE2x1 + pld [ CO1 , #C_PRE ] + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - d7 } + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + FADD_R d18, d26 , d18 + FADD_I d19, d27 , d19 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + FMAC_R1 d6 , d0 , d18 + FMAC_I1 d7 , d0 , d19 + FMAC_R2 d6 , d1 , d19 + FMAC_I2 d7 , d1 , d18 + + fstmiad CO1, { d4 - d7 } + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + +.endm + +.macro KERNEL1x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmuld d16 , d0, d8 + fmuld d24 , d1, d9 + fmuld d17 , d0, d9 + fmuld d25 , d1, d8 + + add BO , BO, #16 + add AO , AO, #16 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #16 +.endm + + + +.macro KERNEL1x1_M1 + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #16 +.endm + +.macro KERNEL1x1_M2 + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd 
d25 , d5, d12 + + fldd d0 , [ AO, #0 ] + fldd d1 , [ AO, #8 ] + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #16 +.endm + + +.macro KERNEL1x1_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + +.endm + +.macro KERNEL1x1_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + add BO , BO, #16 + add AO , AO, #16 + +.endm + + + + +.macro SAVE1x1 + pld [ CO1 , #C_PRE ] + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - d5 } + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + fstmiad CO1, { d4 - d5 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, ALPHA_I + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #4 // ldc = ldc * 8 * 2 + str r3, LDC + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble _L1_BEGIN + +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L2_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L2_M2_30 + .align 5 + + + + 
KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt _L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_30: + tst L, #3 + ble _L2_M2_40 + + tst L, #2 + ble _L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + +_L2_M2_32: + + tst L, #1 + ble _L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_40: + + INIT2x2 + + +_L2_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne _L2_M2_46 + +_L2_M2_100: + + SAVE2x2 + +_L2_M2_END: + + subs I, I, #1 + bne _L2_M2_20 + + +_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + + +_L2_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #5 // k * 2 * 8 * 2 + add r3, r3, r4 // B = B + K * 4 * 8 + mov BC, r3 + + subs J , #1 // j-- + bgt _L2_BEGIN + + + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + 
tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + +_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + +_L1_M2_22: + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, L, #1 + bgt _L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_30: + tst L, #3 + ble _L1_M2_40 + + tst L, #2 + ble _L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + +_L1_M2_32: + + tst L, #1 + ble _L1_M2_40 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_40: + + INIT2x1 + + +_L1_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, #1 + bne _L1_M2_46 + +_L1_M2_100: + + SAVE2x1 + +_L1_M2_END: + + subs I, I, #1 + bne _L1_M2_20 + + +_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + 
+_L1_M1_100: + + SAVE1x1 + + +_L1_END: + + + +_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S new file mode 100644 index 000000000..917ce610f --- /dev/null +++ b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S @@ -0,0 +1,1538 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/10/16 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R d0 +#define OLD_ALPHA_I d1 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define KKK [fp, #-240] +#define KK [fp, #-244 ] +#define A [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define ALPHA_R [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define FADD_R fsubd + #define FADD_I faddd + + #define FMAC_R1 fnmuld + #define FMAC_R2 fnmacd + #define FMAC_I1 fmuld + #define FMAC_I2 fnmacd + 
+#elif defined(CN) || defined(CT) + + #define FADD_R faddd + #define FADD_I fsubd + + #define FMAC_R1 fmuld + #define FMAC_R2 fmacd + #define FMAC_I1 fnmuld + #define FMAC_I2 fmacd + +#elif defined(NC) || defined(TC) + + #define FADD_R faddd + #define FADD_I fsubd + + #define FMAC_R1 fmuld + #define FMAC_R2 fnmacd + #define FMAC_I1 fmuld + #define FMAC_I2 fmacd + +#else + + #define FADD_R fsubd + #define FADD_I faddd + + #define FMAC_R1 fnmuld + #define FMAC_R2 fmacd + #define FMAC_I1 fnmuld + #define FMAC_I2 fnmacd + +#endif + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT2x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d22, d16 + vmov.f64 d23, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d26, d16 + vmov.f64 d27, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + vmov.f64 d30, d16 + vmov.f64 d31, d16 + +.endm + +.macro KERNEL2x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmuld d16 , d0, d8 + fldd d2 , [ AO, #16 ] + fmuld d24 , d1, d9 + fldd d3 , [ AO, #24 ] + fmuld d17 , d0, d9 + fldd d10, [ BO, #16 ] + fmuld d25 , d1, d8 + + fldd d11, [ BO, #24 ] + fmuld d18 , d2, d8 + add BO , BO, #32 + fmuld d26 , d3, d9 + add AO , AO, #32 + fmuld d19 , d2, d9 + pld [ BO , #B_PRE ] + fmuld d27 , d3, d8 + + pld [ AO , #A_PRE ] + fmuld d20 , d0, d10 + fldd d4 , [ AO, #0 ] + fmuld d28 , d1, d11 + fldd d5 , [ AO, #8 ] + fmuld d21 , d0, d11 + fldd d12, [ BO ] + fmuld d29 , d1, d10 + + fldd d13, [ BO, #8 ] + fmuld d22 , d2, d10 + fldd d6 , [ AO, #16 ] + fmuld d30 , d3, d11 + fldd d7 , [ AO, #24 ] + fmuld d23 , d2, d11 + fldd d14, [ BO, #16 ] + fmuld d31 , d3, d10 + fldd d15, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #32 
+.endm + + + +.macro KERNEL2x2_M1 + pld [ AO , #A_PRE ] + + fmacd d16 , d0, d8 + pld [ BO , #B_PRE ] + fmacd d24 , d1, d9 + fldd d4 , [ AO, #0 ] + fmacd d17 , d0, d9 + fldd d5 , [ AO, #8 ] + fmacd d25 , d1, d8 + + fldd d12, [ BO ] + fmacd d18 , d2, d8 + fldd d13, [ BO, #8 ] + fmacd d26 , d3, d9 + fldd d6 , [ AO, #16 ] + fmacd d19 , d2, d9 + fldd d7 , [ AO, #24 ] + fmacd d27 , d3, d8 + + fmacd d20 , d0, d10 + fldd d14, [ BO, #16 ] + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fldd d15, [ BO, #24 ] + fmacd d29 , d1, d10 + + fmacd d22 , d2, d10 + add BO , BO, #32 + fmacd d30 , d3, d11 + fmacd d23 , d2, d11 + add AO , AO, #32 + fmacd d31 , d3, d10 + +.endm + +.macro KERNEL2x2_M2 + pld [ AO , #A_PRE ] + + fmacd d16 , d4, d12 + pld [ BO , #B_PRE ] + fmacd d24 , d5, d13 + fldd d0 , [ AO, #0 ] + fmacd d17 , d4, d13 + fldd d1 , [ AO, #8 ] + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fldd d8 , [ BO ] + fmacd d26 , d7, d13 + fldd d9 , [ BO, #8 ] + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + + fldd d2 , [ AO, #16 ] + fmacd d20 , d4, d14 + fldd d3 , [ AO, #24 ] + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fldd d10, [ BO, #16 ] + fmacd d29 , d5, d14 + + fldd d11, [ BO, #24 ] + fmacd d22 , d6, d14 + fmacd d30 , d7, d15 + add BO , BO, #32 + fmacd d23 , d6, d15 + add AO , AO, #32 + fmacd d31 , d7, d14 + +.endm + + +.macro KERNEL2x2_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fmacd d26 , d7, d13 + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + + fmacd d20 , d4, d14 + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fmacd d29 , d5, d14 + + fmacd d22 , d6, d14 + fmacd d30 , d7, d15 + fmacd d23 , d6, d15 + fmacd d31 , d7, d14 + +.endm + +.macro KERNEL2x2_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmacd d16 , d0, d8 + fldd d2 , [ AO, #16 ] + fmacd d24 , d1, d9 + fldd d3 , [ AO, #24 ] + fmacd d17 , d0, d9 + fldd d10, [ BO, 
#16 ] + fmacd d25 , d1, d8 + + fldd d11, [ BO, #24 ] + fmacd d18 , d2, d8 + fmacd d26 , d3, d9 + fmacd d19 , d2, d9 + fmacd d27 , d3, d8 + + fmacd d20 , d0, d10 + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fmacd d29 , d1, d10 + + fmacd d22 , d2, d10 + add BO , BO, #32 + fmacd d30 , d3, d11 + fmacd d23 , d2, d11 + add AO , AO, #32 + fmacd d31 , d3, d10 + +.endm + + + + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + FADD_R d18, d26 , d18 + FADD_I d19, d27 , d19 + FADD_R d20, d28 , d20 + FADD_I d21, d29 , d21 + FADD_R d22, d30 , d22 + FADD_I d23, d31 , d23 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + FMAC_R1 d6 , d0 , d18 + FMAC_I1 d7 , d0 , d19 + FMAC_R2 d6 , d1 , d19 + FMAC_I2 d7 , d1 , d18 + + FMAC_R1 d8 , d0 , d20 + FMAC_I1 d9 , d0 , d21 + FMAC_R2 d8 , d1 , d21 + FMAC_I2 d9 , d1 , d20 + + FMAC_R1 d10, d0 , d22 + FMAC_I1 d11, d0 , d23 + FMAC_R2 d10, d1 , d23 + FMAC_I2 d11, d1 , d22 + + fstmiad CO1, { d4 - d7 } + fstmiad CO2, { d8 - d11 } + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + +.endm + +.macro KERNEL1x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + fmuld d16 , d0, d8 + fmuld d24 , d1, d9 + fmuld d17 , d0, d9 + fmuld d25 , d1, d8 + + fmuld d20 , d0, d10 + fmuld d28 , d1, d11 + fmuld d21 , d0, d11 + fmuld d29 , d1, d10 + + add BO , BO, #32 + add AO , AO, #16 + + pld [ BO , #B_PRE ] + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + fldd d14, [ BO, #16 ] + fldd d15, [ BO, #24 ] + 
+ add BO , BO, #32 + add AO , AO, #16 +.endm + + + +.macro KERNEL1x2_M1 + pld [ BO , #B_PRE ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d20 , d0, d10 + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fmacd d29 , d1, d10 + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + fldd d14, [ BO, #16 ] + fldd d15, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #16 +.endm + +.macro KERNEL1x2_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d20 , d4, d14 + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fmacd d29 , d5, d14 + + fldd d0 , [ AO, #0 ] + fldd d1 , [ AO, #8 ] + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #16 +.endm + + +.macro KERNEL1x2_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d20 , d4, d14 + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fmacd d29 , d5, d14 + +.endm + +.macro KERNEL1x2_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d20 , d0, d10 + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fmacd d29 , d1, d10 + + add BO , BO, #32 + add AO , AO, #16 + +.endm + + + + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + FADD_R d20, d28 , d20 + FADD_I d21, d29 , d21 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + FMAC_R1 d8 , d0 , d20 + FMAC_I1 d9 , d0 , d21 + FMAC_R2 d8 , d1 , d21 + FMAC_I2 d9 , d1 , d20 + + fstmiad CO1, { d4 - d5 } + fstmiad CO2, { d8 - d9 } + + add CO1, CO1, 
#16 + +.endm + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d26, d16 + vmov.f64 d27, d16 + +.endm + +.macro KERNEL2x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmuld d16 , d0, d8 + fmuld d24 , d1, d9 + fmuld d17 , d0, d9 + fmuld d25 , d1, d8 + + fmuld d18 , d2, d8 + fmuld d26 , d3, d9 + fmuld d19 , d2, d9 + fmuld d27 , d3, d8 + + add BO , BO, #16 + add AO , AO, #32 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + fldd d6 , [ AO, #16 ] + fldd d7 , [ AO, #24 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #32 +.endm + + + +.macro KERNEL2x1_M1 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d18 , d2, d8 + fmacd d26 , d3, d9 + fmacd d19 , d2, d9 + fmacd d27 , d3, d8 + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + fldd d6 , [ AO, #16 ] + fldd d7 , [ AO, #24 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #32 +.endm + +.macro KERNEL2x1_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fmacd d26 , d7, d13 + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + + fldd d0 , [ AO, #0 ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #32 +.endm + + +.macro KERNEL2x1_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fmacd d26 , d7, d13 + fmacd d19 , d6, d13 + fmacd d27 , d7, 
d12 + +.endm + +.macro KERNEL2x1_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d18 , d2, d8 + fmacd d26 , d3, d9 + fmacd d19 , d2, d9 + fmacd d27 , d3, d8 + + add BO , BO, #16 + add AO , AO, #32 + +.endm + + + + +.macro SAVE2x1 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + FADD_R d18, d26 , d18 + FADD_I d19, d27 , d19 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + FMAC_R1 d6 , d0 , d18 + FMAC_I1 d7 , d0 , d19 + FMAC_R2 d6 , d1 , d19 + FMAC_I2 d7 , d1 , d18 + + fstmiad CO1, { d4 - d7 } + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + +.endm + +.macro KERNEL1x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmuld d16 , d0, d8 + fmuld d24 , d1, d9 + fmuld d17 , d0, d9 + fmuld d25 , d1, d8 + + add BO , BO, #16 + add AO , AO, #16 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #16 +.endm + + + +.macro KERNEL1x1_M1 + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #16 +.endm + +.macro KERNEL1x1_M2 + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fldd d0 , [ AO, #0 ] + fldd d1 , [ AO, #8 ] + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + add BO , BO, #16 + add 
AO , AO, #16 +.endm + + +.macro KERNEL1x1_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + +.endm + +.macro KERNEL1x1_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + add BO , BO, #16 + add AO , AO, #16 + +.endm + + + + +.macro SAVE1x1 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + fstmiad CO1, { d4 - d5 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, ALPHA_I + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #4 // ldc = ldc * 8 * 2 + str r3, LDC + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif + str r3 , KK + + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble _L1_BEGIN + +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L2_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #5 // 2 * 8 * 2 
double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L2_M2_30 + .align 5 + + + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt _L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_30: + tst L, #3 + ble _L2_M2_40 + + tst L, #2 + ble _L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + +_L2_M2_32: + + tst L, #1 + ble _L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_40: + + INIT2x2 + + +_L2_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne _L2_M2_46 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , 
#2 // number of values in AO + str r3 , KK +#endif + + + + +_L2_M2_END: + + subs I, I, #1 + bne _L2_M2_20 + + +_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #5 // k * 2 * 8 * 2 + add r3, r3, r4 // B = B + K * 4 * 8 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in BO + str r3 , KK +#endif + + + subs J , #1 // j-- + bgt _L2_BEGIN + + + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 
+ + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + + ldr AO, A // AO = A + +_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + +_L1_M2_22: + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, L, #1 + bgt _L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_30: + tst L, #3 + ble _L1_M2_40 + + tst L, #2 + ble _L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + +_L1_M2_32: + + tst L, #1 + ble _L1_M2_40 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_40: + + INIT2x1 + + +_L1_M2_44: + + ands L , K1, #7 // L = L % 8 + 
ble _L1_M2_100 + +_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, #1 + bne _L1_M2_46 + +_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L1_M2_END: + + subs I, I, #1 + bne _L1_M2_20 + + +_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK 
+#endif + + + +_L1_END: + + + +_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + From 04391e6d9c99fec8d6314ef24550d0f2f9f836c1 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 16 Oct 2013 18:04:34 +0200 Subject: [PATCH 14/81] optimized param.h --- param.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/param.h b/param.h index cf1665438..8d9d0fc47 100644 --- a/param.h +++ b/param.h @@ -1802,8 +1802,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 2 -#define SGEMM_DEFAULT_UNROLL_N 2 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 4 @@ -1814,17 +1814,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 -#define SGEMM_DEFAULT_P 64 +#define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 24 #define ZGEMM_DEFAULT_P 20 -#define SGEMM_DEFAULT_Q 192 +#define SGEMM_DEFAULT_Q 240 #define DGEMM_DEFAULT_Q 96 #define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_Q 64 -#define SGEMM_DEFAULT_R 512 +#define SGEMM_DEFAULT_R 4096 #define DGEMM_DEFAULT_R 512 #define CGEMM_DEFAULT_R 512 #define ZGEMM_DEFAULT_R 512 From 2d49db2f5bebbd727cb860cff023705ecd4bfda3 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 16 Oct 2013 19:04:42 +0200 Subject: [PATCH 15/81] moved compiler flags from Makefile.rule to Makefile.arm --- Makefile.arm | 8 ++++++-- Makefile.rule | 14 +++++++------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/Makefile.arm b/Makefile.arm index 05ea9c679..6cdeb2f75 100644 --- a/Makefile.arm +++ b/Makefile.arm @@ -1,3 +1,7 @@ -ifdef BINARY64 -else + +ifeq ($(CORE), ARMV7) +CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard +FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard endif + + diff --git a/Makefile.rule b/Makefile.rule index a7aa0873d..534f4d1a2 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -19,14 +19,14 @@ TARGET = ARMV7 # C compiler including binary type(32bit / 64bit). Default is gcc. # Don't use Intel Compiler or PGI, it won't generate right codes as I expect. -# CC = gcc +CC = gcc # Fortran compiler. Default is g77. -# FC = gfortran +FC = gfortran # Even you can specify cross compiler. Meanwhile, please set HOSTCC. -CC = arm-linux-gnueabihf-gcc -FC = arm-linux-gnueabihf-gfortran +#CC = arm-linux-gnueabihf-gcc +#FC = arm-linux-gnueabihf-gfortran # If you use the cross compiler, please set this host compiler. HOSTCC = gcc @@ -38,7 +38,7 @@ HOSTCC = gcc # specify it. 
# For force setting for single threaded, specify USE_THREAD = 0 # For force setting for multi threaded, specify USE_THREAD = 1 -USE_THREAD = 0 +#USE_THREAD = 0 # If you're going to use this library with OpenMP, please comment it in. # USE_OPENMP = 1 @@ -46,7 +46,7 @@ USE_THREAD = 0 # You can define maximum number of threads. Basically it should be # less than actual number of cores. If you don't specify one, it's # automatically detected by the the script. -NUM_THREADS = 4 +NUM_THREADS = 16 # if you don't need generate the shared library, please comment it in. # NO_SHARED = 1 @@ -123,7 +123,7 @@ NO_AFFINITY = 1 # Common Optimization Flag; # The default -O2 is enough. -COMMON_OPT = -O0 -marm -mfpu=vfpv3 -fno-omit-frame-pointer +#COMMON_OPT = -O3 -marm -mfpu=vfpv3 -mfloat-abi=hard # Profiling flags COMMON_PROF = -pg From 02bc36ac79b06142b97e2c7a93632f668f3e6b4e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 1 Nov 2013 18:22:27 +0100 Subject: [PATCH 16/81] added sgemm_ncopy routine and made some improvements on cgemm_kernel for ARMV7 --- common_arm.h | 34 +-- kernel/arm/KERNEL.ARMV7 | 10 +- kernel/arm/cgemm_kernel_2x2_vfpv3.S | 18 +- kernel/arm/sgemm_ncopy_4_vfpv3.S | 344 ++++++++++++++++++++++++++++ param.h | 16 +- 5 files changed, 393 insertions(+), 29 deletions(-) create mode 100644 kernel/arm/sgemm_ncopy_4_vfpv3.S diff --git a/common_arm.h b/common_arm.h index b61efd7c1..e3d1d4079 100644 --- a/common_arm.h +++ b/common_arm.h @@ -80,31 +80,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifndef ASSEMBLER -static void INLINE blas_lock(volatile unsigned long *address){ +static void __inline blas_lock(volatile BLASULONG *address){ + + int register ret; -// long int ret, val = 1; -/* do { while (*address) {YIELDING;}; __asm__ __volatile__( - "1: ll %0, %3\n" - " ori %2, %0, 1\n" - " sc %2, %1\n" - " beqz %2, 1b\n" - " andi %2, %0, 1\n" - " sync\n" - : "=&r" (val), "=m" (address), "=&r" (ret) - : "m" (address) - : "memory"); + "ldrex r2, [%1] \n\t" + "mov r2, #0 \n\t" + "strex r3, r2, [%1] \n\t" + "mov %0 , r3 \n\t" + : "=r"(ret), "=r"(address) + : "1"(address) + : "memory", "r2" , "r3" + + + ); } while (ret); -*/ + } -static inline unsigned int rpcc(void){ - unsigned long ret=0; +static inline BLASULONG rpcc(void){ + BLASULONG ret=0; + struct timeval tv; + gettimeofday(&tv,NULL); + ret=1000000* tv.tv_sec + tv.tv_usec; return ret; } diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 43153798e..ec692d5c2 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -89,7 +89,7 @@ ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S SGEMMINCOPY = SGEMMITCOPY = -SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMONCOPY = sgemm_ncopy_4_vfpv3.S SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMINCOPYOBJ = SGEMMITCOPYOBJ = @@ -99,12 +99,12 @@ SGEMMOTCOPYOBJ = sgemm_otcopy.o #DGEMMKERNEL = ../generic/gemmkernel_2x2.c #DGEMMKERNEL = dgemm_kernel_4x2_vfpv2.S DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S -DGEMMINCOPY = dgemm_ncopy_4_vfpv3.S -DGEMMITCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPY = +DGEMMITCOPY = DGEMMONCOPY = dgemm_ncopy_4_vfpv3.S DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMINCOPYOBJ = dgemm_incopy.o -DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o diff --git a/kernel/arm/cgemm_kernel_2x2_vfpv3.S b/kernel/arm/cgemm_kernel_2x2_vfpv3.S index abbbac831..4cebcab77 100644 --- a/kernel/arm/cgemm_kernel_2x2_vfpv3.S +++ 
b/kernel/arm/cgemm_kernel_2x2_vfpv3.S @@ -26,11 +26,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/10/16 Saar +* 2013/11/01 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * +* 2013/11/01 Saar +* UNROLL_N 2 +* UNROLL_M 2 +* CGEMM_P 96 +* CGEMM_Q 120 +* CGEMM_R 4096 +* A_PRE 96 +* B_PRE 96 +* C_PRE 64 +* +* Performance on Odroid U2: +* +* 1 Core: 2.59 GFLOPS ATLAS: 2.37 GFLOPS +* 2 Cores: 5.17 GFLOPS ATLAS: 4.46 GFLOPS +* 3 Cores: 7.69 GFLOPS ATLAS: 6.50 GFLOPS +* 4 Cores: 10.22 GFLOPS ATLAS: 8.18 GFLOPS **************************************************************************************/ #define ASSEMBLER diff --git a/kernel/arm/sgemm_ncopy_4_vfpv3.S b/kernel/arm/sgemm_ncopy_4_vfpv3.S new file mode 100644 index 000000000..34fbb3252 --- /dev/null +++ b/kernel/arm/sgemm_ncopy_4_vfpv3.S @@ -0,0 +1,344 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/10/11 Saar +* BLASTEST : xOK +* CTEST : xOK +* TEST : xOK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define LDA [fp, #-260 ] + +#define B [fp, #4 ] + +#define M r0 +#define N r1 +#define A r2 + +#define BO r5 + +#define AO1 r6 +#define AO2 r7 +#define AO3 r8 +#define AO4 r9 + +#define I r3 +#define J r12 + +#define A_PRE 96 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY4x4 + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO2, #0 ] + flds s2 , [ AO3, #0 ] + flds s3 , [ AO4, #0 ] 
+ + flds s4 , [ AO1, #4 ] + flds s8 , [ AO1, #8 ] + flds s12, [ AO1, #12 ] + + flds s5 , [ AO2, #4 ] + add AO1, AO1, #16 + flds s9 , [ AO2, #8 ] + flds s13, [ AO2, #12 ] + + flds s6 , [ AO3, #4 ] + add AO2, AO2, #16 + flds s10, [ AO3, #8 ] + flds s14, [ AO3, #12 ] + + flds s7 , [ AO4, #4 ] + add AO3, AO3, #16 + flds s11, [ AO4, #8 ] + flds s15, [ AO4, #12 ] + + fstmias BO!, { s0 - s3 } + add AO4, AO4, #16 + fstmias BO!, { s4 - s7 } + fstmias BO!, { s8 - s15 } + +.endm + +.macro COPY1x4 + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO2, #0 ] + add AO1, AO1, #4 + flds s2 , [ AO3, #0 ] + add AO2, AO2, #4 + flds s3 , [ AO4, #0 ] + + add AO3, AO3, #4 + fstmias BO!, { s0 - s3 } + add AO4, AO4, #4 + +.endm + +.macro COPY4x2 + + flds s0 , [ AO1, #0 ] + flds s2 , [ AO1, #4 ] + flds s4 , [ AO1, #8 ] + flds s6 , [ AO1, #12 ] + + flds s1 , [ AO2, #0 ] + flds s3 , [ AO2, #4 ] + add AO1, AO1, #16 + flds s5 , [ AO2, #8 ] + flds s7 , [ AO2, #12 ] + + fstmias BO!, { s0 - s7 } + add AO2, AO2, #16 + +.endm + + +.macro COPY1x2 + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO2, #0 ] + add AO1, AO1, #4 + + fstmias BO!, { s0 - s1 } + add AO2, AO2, #4 + +.endm + +.macro COPY4x1 + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO1, #4 ] + flds s2 , [ AO1, #8 ] + flds s3 , [ AO1, #12 ] + + fstmias BO!, { s0 - s3 } + add AO1, AO1, #16 + +.endm + + +.macro COPY1x1 + + flds s0 , [ AO1, #0 ] + + fstmias BO!, { s0 } + add AO1, AO1, #4 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + + lsl r3, r3, #2 // lda = lda * 4 + str r3, LDA + + sub r4, fp, #128 + vstm r4, { s8 - s15} // store floating point registers + + ldr BO, B + +_L4_BEGIN: + + asrs J, N, #2 // J = N / 4 + ble _L2_BEGIN + +_L4_M4_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , 
LDA + add AO2, AO1, r4 + add AO3, AO2, r4 + add AO4, AO3, r4 + add A , AO4, r4 // A = A + 4 * LDA + + asrs I, M, #2 // I = M / 4 + ble _L4_M4_40 + +_L4_M4_20: + + COPY4x4 + + subs I , I , #1 + bne _L4_M4_20 + + +_L4_M4_40: + + ands I, M , #3 + ble _L4_M4_END + +_L4_M4_60: + + COPY1x4 + + subs I , I , #1 + bne _L4_M4_60 + + +_L4_M4_END: + + subs J , J, #1 // j-- + bne _L4_M4_BEGIN + + + +/*********************************************************************************************/ + +_L2_BEGIN: + + tst N, #3 + ble _L999 + + tst N, #2 + ble _L1_BEGIN + +_L2_M4_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add AO2, AO1, r4 + add A , AO2, r4 // A = A + 2 * LDA + + asrs I, M, #2 // I = M / 4 + ble _L2_M4_40 + +_L2_M4_20: + + COPY4x2 + + subs I , I , #1 + bne _L2_M4_20 + + +_L2_M4_40: + + ands I, M , #3 + ble _L2_M4_END + +_L2_M4_60: + + COPY1x2 + + subs I , I , #1 + bne _L2_M4_60 + + +_L2_M4_END: + + +/*********************************************************************************************/ + +_L1_BEGIN: + + tst N, #1 + ble _L999 + + +_L1_M4_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add A , AO1, r4 // A = A + 1 * LDA + + asrs I, M, #2 // I = M / 4 + ble _L1_M4_40 + +_L1_M4_20: + + COPY4x1 + + subs I , I , #1 + bne _L1_M4_20 + + +_L1_M4_40: + + ands I, M , #3 + ble _L1_M4_END + +_L1_M4_60: + + COPY1x1 + + subs I , I , #1 + bne _L1_M4_60 + + +_L1_M4_END: + + + +_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/param.h b/param.h index 8d9d0fc47..d6284f5cb 100644 --- a/param.h +++ b/param.h @@ -1814,19 +1814,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 -#define SGEMM_DEFAULT_P 128 +#define SGEMM_DEFAULT_P 192 #define DGEMM_DEFAULT_P 128 -#define CGEMM_DEFAULT_P 24 +#define CGEMM_DEFAULT_P 96 #define ZGEMM_DEFAULT_P 20 -#define SGEMM_DEFAULT_Q 240 -#define DGEMM_DEFAULT_Q 96 -#define CGEMM_DEFAULT_Q 128 +#define SGEMM_DEFAULT_Q 120 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 #define ZGEMM_DEFAULT_Q 64 -#define SGEMM_DEFAULT_R 4096 -#define DGEMM_DEFAULT_R 512 -#define CGEMM_DEFAULT_R 512 +#define SGEMM_DEFAULT_R 16384 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 512 From b3eab8fcb7c3e5d0b90e9d2864c0df6257eb68b0 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 2 Nov 2013 09:43:53 +0100 Subject: [PATCH 17/81] minor optimizations on zgemm_kernel for ARMV7 --- kernel/arm/zgemm_kernel_2x2_vfpv3.S | 18 +++++++++++++++++- param.h | 6 +++--- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/kernel/arm/zgemm_kernel_2x2_vfpv3.S b/kernel/arm/zgemm_kernel_2x2_vfpv3.S index 4b01f0429..9c14aec10 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/zgemm_kernel_2x2_vfpv3.S @@ -26,11 +26,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** -* 2013/10/16 Saar +* 2013/11/02 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * +* 2013/11/02 Saar +* UNROLL_N 2 +* UNROLL_M 2 +* ZGEMM_P 64 +* ZGEMM_Q 120 +* ZGEMM_R 4096 +* A_PRE 96 +* B_PRE 96 +* C_PRE 64 +* +* Performance on Odroid U2: +* +* 1 Core: 1.62 GFLOPS ATLAS: 1.39 GFLOPS +* 2 Cores: 3.20 GFLOPS ATLAS: 2.54 GFLOPS +* 3 Cores: 4.72 GFLOPS ATLAS: 3.76 GFLOPS +* 4 Cores: 5.93 GFLOPS ATLAS: 4.88 GFLOPS **************************************************************************************/ #define ASSEMBLER diff --git a/param.h b/param.h index d6284f5cb..f6895d90b 100644 --- a/param.h +++ b/param.h @@ -1817,17 +1817,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_P 192 #define DGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 96 -#define ZGEMM_DEFAULT_P 20 +#define ZGEMM_DEFAULT_P 64 #define SGEMM_DEFAULT_Q 120 #define DGEMM_DEFAULT_Q 120 #define CGEMM_DEFAULT_Q 120 -#define ZGEMM_DEFAULT_Q 64 +#define ZGEMM_DEFAULT_Q 120 #define SGEMM_DEFAULT_R 16384 #define DGEMM_DEFAULT_R 8192 #define CGEMM_DEFAULT_R 4096 -#define ZGEMM_DEFAULT_R 512 +#define ZGEMM_DEFAULT_R 4096 From 2b801a00a56c8e270e0de628c5e55e93adaeb1b3 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 2 Nov 2013 13:06:11 +0100 Subject: [PATCH 18/81] small optimizations on sgemm_kernel for ARMV7 --- kernel/arm/sgemm_kernel_4x4_vfpv3.S | 131 ++++++++-------------------- kernel/arm/sgemm_ncopy_4_vfpv3.S | 17 +++- param.h | 6 +- 3 files changed, 54 insertions(+), 100 deletions(-) diff --git a/kernel/arm/sgemm_kernel_4x4_vfpv3.S b/kernel/arm/sgemm_kernel_4x4_vfpv3.S index 4746a587e..8bc3e5325 100644 --- a/kernel/arm/sgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/sgemm_kernel_4x4_vfpv3.S @@ -26,28 +26,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** -* 2013/10/13 Saar +* 2013/11/02 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * * -* 2013/10/13 Saar +* 2013/11/02 Saar * UNROLL_N 4 * UNROLL_M 4 * DGEMM_P 128 * DGEMM_Q 240 -* DGEMM_R 4096 -* A_PRE 96 -* B_PRE 96 -* C_PRE 64 +* DGEMM_R 12288 +* A_PRE 128 +* B_PRE 128 +* C_PRE 32 * -* Performance on Odroid U2: +* Performance on Odroid U2: * -* 1 Core: 2.60 GFLOPS ATLAS: 2.67 GFLOPS -* 2 Cores: 5.17 GFLOPS ATLAS: 5.25 GFLOPS -* 3 Cores: 7.60 GFLOPS ATLAS: 7.82 GFLOPS -* 4 Cores: 9.98 GFLOPS ATLAS: 9.95 GFLOPS +* 3072x3072 1 Core: 2.62 GFLOPS ATLAS: 2.69 GFLOPS +* 3072x3072 2 Cores: 5.23 GFLOPS ATLAS: 5.27 GFLOPS +* 3072x3072 3 Cores: 7.78 GFLOPS ATLAS: 7.87 GFLOPS +* 3072x3072 4 Cores: 10.10 GFLOPS ATLAS: 9.98 GFLOPS **************************************************************************************/ #define ASSEMBLER @@ -92,9 +92,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define K1 r7 #define BC r12 -#define A_PRE 96 -#define B_PRE 96 -#define C_PRE 64 +#define A_PRE 128 +#define B_PRE 128 +#define C_PRE 32 /************************************************************************************** * Macro definitions @@ -123,10 +123,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_I + pld [ AO , #A_PRE ] fldmias AO!, { s0 - s1 } - pld [ AO , #A_PRE-8 ] + pld [ BO , #B_PRE ] fldmias BO!, { s8 - s9 } - pld [ BO , #B_PRE-8 ] fmuls s16 , s0, s8 fldmias AO!, { s2 - s3 } @@ -162,20 +162,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
pld [ AO , #A_PRE ] fmacs s16 , s4, s12 fmacs s17 , s5, s12 - fldmias AO!, { s0 - s1 } + fldmias AO!, { s0 - s3 } fmacs s18 , s6, s12 pld [ BO , #B_PRE ] fmacs s19 , s7, s12 fmacs s20 , s4, s13 - fldmias AO!, { s2 - s3 } + fldmias BO!, { s8 - s11 } fmacs s21 , s5, s13 fmacs s22 , s6, s13 - fldmias BO!, { s8 - s9 } + //fldmias AO!, { s2 - s3 } fmacs s23 , s7, s13 fmacs s24 , s4, s14 - fldmias BO!, { s10 - s11 } + //fldmias BO!, { s10 - s11 } fmacs s25 , s5, s14 fmacs s26 , s6, s14 fmacs s27 , s7, s14 @@ -191,17 +191,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_M1 fmacs s16 , s0, s8 - fldmias AO!, { s4 - s5 } + fldmias AO!, { s4 - s7 } fmacs s17 , s1, s8 fmacs s18 , s2, s8 - fldmias AO!, { s6 - s7 } + fldmias BO!, { s12 - s15 } + //fldmias AO!, { s6 - s7 } fmacs s19 , s3, s8 fmacs s20 , s0, s9 - fldmias BO!, { s12 - s13 } fmacs s21 , s1, s9 fmacs s22 , s2, s9 - fldmias BO!, { s14 - s15 } + //fldmias BO!, { s14 - s15 } fmacs s23 , s3, s9 fmacs s24 , s0, s10 @@ -248,10 +248,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_SUB flds s8 , [ BO ] - pld [ BO , #B_PRE ] flds s0 , [ AO ] - pld [ AO , #A_PRE ] flds s1 , [ AO, #4 ] fmacs s16 , s0, s8 @@ -284,16 +282,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x4 - pld [ CO1 , #C_PRE ] ldr r3 , LDC add CO2 , CO1, r3 flds s0, ALPHA add r4 , CO2, r3 - pld [ CO2 , #C_PRE ] fldmias CO1, { s8 - s11 } - pld [ r4 , #C_PRE ] fmacs s8 , s0 , s16 flds s12, [CO2] @@ -313,6 +308,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmacs s15, s0 , s23 fsts s11, [CO1, #12 ] + pld [ CO1 , #C_PRE ] + fldmias r4, { s8 - s11 } fmacs s8 , s0 , s24 @@ -324,9 +321,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fmacs s11, s0 , s27 fsts s15, [CO2, #12 ] + pld [ CO2 , #C_PRE ] + add CO2, r4 , r3 - pld [ CO2 , #C_PRE ] fldmias CO2, { s12 - s15 } @@ -339,7 +337,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fsts s11, [r4 , #12 ] fmacs s15, s0 , s31 + pld [ r4 , #C_PRE ] fstmias CO2, { s12 - s15 } + pld [ CO2 , #C_PRE ] add CO1, CO1, #16 @@ -891,78 +891,29 @@ _L4_M4_20: mov BO, BC - asrs L , K1, #3 // L = L / 8 - cmp L , #3 - blt _L4_M4_30 - .align 5 + asrs L , K1, #1 // L = L / 8 + cmp L , #2 + blt _L4_M4_32 KERNEL4x4_I KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - sub L, L, #2 + subs L, L, #2 + ble _L4_M4_22a + .align 5 _L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 subs L, L, #1 bgt _L4_M4_22 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 +_L4_M4_22a: - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - b _L4_M4_44 - - -_L4_M4_30: - tst L, #3 - ble _L4_M4_40 - - tst L, #2 - ble _L4_M4_32 - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - - KERNEL4x4_M1 - KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_E @@ -974,13 +925,7 @@ _L4_M4_32: ble _L4_M4_40 KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 KERNEL4x4_E b _L4_M4_44 @@ -993,7 +938,7 @@ _L4_M4_40: _L4_M4_44: - ands L , K1, #7 // L = L % 8 + ands L , K1, #1 // L = L % 8 ble _L4_M4_100 _L4_M4_46: diff --git a/kernel/arm/sgemm_ncopy_4_vfpv3.S b/kernel/arm/sgemm_ncopy_4_vfpv3.S index 34fbb3252..8af7ed8f2 100644 --- a/kernel/arm/sgemm_ncopy_4_vfpv3.S +++ b/kernel/arm/sgemm_ncopy_4_vfpv3.S @@ -26,10 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** -* 2013/10/11 Saar -* BLASTEST : xOK -* CTEST : xOK -* TEST : xOK +* 2013/11/02 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK * **************************************************************************************/ @@ -218,6 +218,15 @@ _L4_M4_BEGIN: _L4_M4_20: + pld [ AO1, #A_PRE ] + pld [ AO2, #A_PRE ] + pld [ AO3, #A_PRE ] + pld [ AO4, #A_PRE ] + COPY4x4 + + subs I , I , #1 + ble _L4_M4_40 + COPY4x4 subs I , I , #1 diff --git a/param.h b/param.h index f6895d90b..ab0ed91b7 100644 --- a/param.h +++ b/param.h @@ -1814,17 +1814,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 -#define SGEMM_DEFAULT_P 192 +#define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 96 #define ZGEMM_DEFAULT_P 64 -#define SGEMM_DEFAULT_Q 120 +#define SGEMM_DEFAULT_Q 240 #define DGEMM_DEFAULT_Q 120 #define CGEMM_DEFAULT_Q 120 #define ZGEMM_DEFAULT_Q 120 -#define SGEMM_DEFAULT_R 16384 +#define SGEMM_DEFAULT_R 12288 #define DGEMM_DEFAULT_R 8192 #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 From e31186efd49370280120f3f97b28ec182f71545b Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 2 Nov 2013 13:12:21 +0100 Subject: [PATCH 19/81] deleted obsolete dgemm_kernel and dtrmm_kernel --- kernel/arm/dgemm_kernel_8x2_vfpv3.S | 1112 -------------------- kernel/arm/dtrmm_kernel_8x2_vfpv3.S | 1521 --------------------------- 2 files changed, 2633 deletions(-) delete mode 100644 kernel/arm/dgemm_kernel_8x2_vfpv3.S delete mode 100644 kernel/arm/dtrmm_kernel_8x2_vfpv3.S diff --git a/kernel/arm/dgemm_kernel_8x2_vfpv3.S b/kernel/arm/dgemm_kernel_8x2_vfpv3.S deleted file mode 100644 index 6c1b0f5fd..000000000 --- a/kernel/arm/dgemm_kernel_8x2_vfpv3.S +++ /dev/null @@ -1,1112 +0,0 @@ 
-/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -/************************************************************************************** -* 2013/09/30 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* -* 2013/09/30 Saar -* UNROLL_N 2 -* UNROLL_M 8 -* DGEMM_P 64 -* DGEMM_Q 64 -* DGEMM_R 512 -* A_PRE 192 -* B_PRE 32 -* C_PRE 64 -* -* Performance on Odroid U2: -* -* 1 Core: 1.42 GFLOPS ATLAS: 1.58 GFLOPS -* 2 Cores: 2.81 GFLOPS ATLAS: - GFLOPS -* 3 Cores: 4.05 GFLOPS ATLAS: - GFLOPS -* 4 Cores: 5.40 GFLOPS ATLAS: 3.88 GFLOPS -**************************************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define STACKSIZE 252 - -#define OLD_M r0 -#define OLD_N r1 -#define OLD_K r2 -#define OLD_A r3 -#define OLD_ALPHA d0 - -/****************************************************** -* [fp, #-128] - [fp, #-64] is reserved -* for store and restore of floating point -* registers -*******************************************************/ - -#define C [fp, #-248 ] -#define LDC [fp, #-252 ] -#define M [fp, #-256 ] -#define N [fp, #-260 ] -#define K [fp, #-264 ] -#define A [fp, #-268 ] - -#define ALPHA [fp, #-276 ] - -#define B [fp, #4 ] -#define OLD_C [fp, #8 ] -#define OLD_LDC [fp, #12 ] - -#define I r0 -#define J r1 -#define L r2 - -#define AO r5 -#define BO r6 - -#define CO1 r8 -#define CO2 r9 - -#define K1 r7 -#define BC r12 - -#define A_PRE 192 -#define A_PRE1 224 -#define B_PRE 32 -#define C_PRE 64 - -/************************************************************************************** -* Macro definitions -**************************************************************************************/ - -.macro INIT8x2 - - vsub.f64 d8 , d8 , d8 - vmov.f64 d9 , d8 - vmov.f64 d10, d8 - vmov.f64 d11, d8 - vmov.f64 d12, d8 - vmov.f64 d13, d8 - vmov.f64 d14, d8 - vmov.f64 d15, d8 - vmov.f64 d16, d8 - vmov.f64 d17, d8 - vmov.f64 d18, d8 - vmov.f64 d19, d8 - vmov.f64 d20, d8 - vmov.f64 d21, d8 - 
vmov.f64 d22, d8 - vmov.f64 d23, d8 - -.endm - - - -.macro KERNEL8x2 - - fldmiad BO!, { d24 , d25} - fldd d0, [ AO ] - fmacd d8 , d0, d24 - fldd d1, [ AO , #8 ] - fmacd d16 , d0, d25 - fldd d2, [ AO , #16 ] - fmacd d9 , d1, d24 - fmacd d17 , d1, d25 - fldd d3, [ AO , #24 ] - fmacd d10 , d2, d24 - fmacd d18 , d2, d25 - fldd d4, [ AO , #32 ] - fmacd d11 , d3, d24 - pld [AO , #A_PRE] - fmacd d19 , d3, d25 - fldd d5, [ AO , #40 ] - - fmacd d12 , d4, d24 - fmacd d20 , d4, d25 - fldd d6, [ AO , #48 ] - fmacd d13 , d5, d24 - fmacd d21 , d5, d25 - - fldd d7, [ AO , #56 ] - fmacd d14 , d6, d24 - fmacd d22 , d6, d25 - pld [AO , #A_PRE+32] - fmacd d15 , d7, d24 - add AO, AO, #64 - fmacd d23 , d7, d25 - -.endm - -.macro SAVE8x2 - - vldr d0, ALPHA - - fldd d24, [CO1] - fldd d25, [CO1, #8 ] - - fmacd d24, d0 , d8 - fldd d8 , [CO2] - fldd d26, [CO1, #16] - fmacd d25, d0 , d9 - fldd d9 , [CO2, #8 ] - fldd d27, [CO1, #24] - fmacd d26, d0 , d10 - fldd d10 , [CO2, #16 ] - fldd d28, [CO1, #32] - fmacd d27, d0 , d11 - fldd d11 , [CO2, #24 ] - fldd d29, [CO1, #40] - fmacd d28, d0 , d12 - fldd d12 , [CO2, #32 ] - fldd d30, [CO1, #48] - fmacd d29, d0 , d13 - fldd d13 , [CO2, #40 ] - fldd d31, [CO1, #56] - fmacd d30, d0 , d14 - fldd d14 , [CO2, #48 ] - fmacd d31, d0 , d15 - fldd d15 , [CO2, #56 ] - - - fmacd d8 , d0 , d16 - fstd d24, [CO1] - fmacd d9 , d0 , d17 - fstd d25, [CO1, #8 ] - fstd d8 , [CO2] - fmacd d10, d0 , d18 - fstd d26, [CO1, #16 ] - fstd d9 , [CO2, #8 ] - fmacd d11, d0 , d19 - fstd d27, [CO1, #24 ] - fstd d10, [CO2, #16 ] - fmacd d12, d0 , d20 - fstd d28, [CO1, #32 ] - fstd d11, [CO2, #24 ] - fmacd d13, d0 , d21 - fstd d29, [CO1, #40 ] - fstd d12, [CO2, #32 ] - fmacd d14, d0 , d22 - fstd d30, [CO1, #48 ] - fstd d13, [CO2, #40 ] - fmacd d15, d0 , d23 - fstd d31, [CO1, #56 ] - fstd d14, [CO2, #48 ] - - add CO1, CO1, #64 - fstd d15, [CO2, #56 ] - add CO2, CO2, #64 - - -.endm - -.macro SAVE8x2_BAD - - vldr d0, ALPHA - vldm CO2, { d24, d25, d26 , d27 , d28 , d29 , d30 , d31 } - 
- vmul.f64 d8 , d0 , d8 - vmul.f64 d9 , d0 , d9 - vmul.f64 d10, d0 , d10 - vmul.f64 d11, d0 , d11 - vmul.f64 d12, d0 , d12 - vmul.f64 d13, d0 , d13 - vmul.f64 d14, d0 , d14 - vmul.f64 d15, d0 , d15 - - vmul.f64 d16, d0 , d16 - vmul.f64 d17, d0 , d17 - vmul.f64 d18, d0 , d18 - vmul.f64 d19, d0 , d19 - vmul.f64 d20, d0 , d20 - vmul.f64 d21, d0 , d21 - vmul.f64 d22, d0 , d22 - vmul.f64 d23, d0 , d23 - - vldm CO1, { d0 , d1 , d2 , d3 , d4 , d5 , d6 , d7 } - - vadd.f64 d16, d16, d24 - vadd.f64 d17, d17, d25 - vadd.f64 d18, d18, d26 - vadd.f64 d19, d19, d27 - - vadd.f64 d20, d20, d28 - vadd.f64 d21, d21, d29 - vadd.f64 d22, d22, d30 - vadd.f64 d23, d23, d31 - - vadd.f64 d8 , d8 , d0 - vadd.f64 d9 , d9 , d1 - vadd.f64 d10, d10, d2 - vadd.f64 d11, d11, d3 - - vadd.f64 d12, d12, d4 - vadd.f64 d13, d13, d5 - vadd.f64 d14, d14, d6 - vadd.f64 d15, d15, d7 - - vstm CO2!, { d16, d17, d18 , d19 , d20 , d21 , d22 , d23 } - vstm CO1!, { d8 , d9 , d10 , d11 , d12, d13 , d14 , d15 } - -.endm - - - -/*************************************************************************************/ - - -.macro INIT4x2 - - vsub.f64 d8 , d8 , d8 - vsub.f64 d9 , d9 , d9 - vsub.f64 d10, d10, d10 - vsub.f64 d11, d11, d11 - vsub.f64 d12, d12, d12 - vsub.f64 d13, d13, d13 - vsub.f64 d14, d14, d14 - vsub.f64 d15, d15, d15 - -.endm - -.macro KERNEL4x2 - - vldm AO!, { d0, d1 , d2, d3 } - vldm BO!, { d4, d5 } - - vmul.f64 d6 , d0 , d4 - vmul.f64 d7 , d1 , d4 - vadd.f64 d8 , d8 , d6 - vadd.f64 d9 , d9 , d7 - - vmul.f64 d6 , d2 , d4 - vmul.f64 d7 , d3 , d4 - vadd.f64 d10, d10, d6 - vadd.f64 d11, d11, d7 - - vmul.f64 d6 , d0 , d5 - vmul.f64 d7 , d1 , d5 - vadd.f64 d12, d12, d6 - vadd.f64 d13, d13, d7 - - vmul.f64 d6 , d2 , d5 - vmul.f64 d7 , d3 , d5 - vadd.f64 d14, d14, d6 - vadd.f64 d15, d15, d7 - -.endm - -.macro SAVE4x2 - - vldr d0, ALPHA - - vmul.f64 d8 , d0 , d8 - vmul.f64 d9 , d0 , d9 - vmul.f64 d10, d0 , d10 - vmul.f64 d11, d0 , d11 - vmul.f64 d12, d0 , d12 - vmul.f64 d13, d0 , d13 - vmul.f64 d14, d0 , 
d14 - vmul.f64 d15, d0 , d15 - - vldm CO1, { d0, d1 , d2 , d3 } - vldm CO2, { d4, d5 , d6 , d7 } - - vadd.f64 d8 , d8 , d0 - vadd.f64 d9 , d9 , d1 - vadd.f64 d10, d10, d2 - vadd.f64 d11, d11, d3 - - vadd.f64 d12, d12, d4 - vadd.f64 d13, d13, d5 - vadd.f64 d14, d14, d6 - vadd.f64 d15, d15, d7 - - vstm CO1!, { d8 , d9 , d10 , d11 } - vstm CO2!, { d12, d13 ,d14 , d15 } - -.endm - - - -/*************************************************************************************/ - -.macro INIT2x2 - - vsub.f64 d8 , d8 , d8 - vsub.f64 d9 , d9 , d9 - vsub.f64 d12, d12, d12 - vsub.f64 d13, d13, d13 - -.endm - -.macro KERNEL2x2 - - vldm AO!, { d0, d1 } - vldm BO!, { d4, d5 } - - vmul.f64 d6 , d0 , d4 - vmul.f64 d7 , d1 , d4 - vadd.f64 d8 , d8 , d6 - vadd.f64 d9 , d9 , d7 - - vmul.f64 d6 , d0 , d5 - vmul.f64 d7 , d1 , d5 - vadd.f64 d12, d12, d6 - vadd.f64 d13, d13, d7 - -.endm - -.macro SAVE2x2 - - vldr d0, ALPHA - - vmul.f64 d8 , d0 , d8 - vmul.f64 d9 , d0 , d9 - vmul.f64 d12, d0 , d12 - vmul.f64 d13, d0 , d13 - - vldm CO1, { d0, d1 } - vldm CO2, { d4, d5 } - - vadd.f64 d8 , d8 , d0 - vadd.f64 d9 , d9 , d1 - - vadd.f64 d12, d12, d4 - vadd.f64 d13, d13, d5 - - vstm CO1!, { d8 , d9 } - vstm CO2!, { d12, d13 } - -.endm - -/*************************************************************************************/ - -.macro INIT1x2 - - vsub.f64 d8 , d8 , d8 - vsub.f64 d12, d12, d12 - -.endm - -.macro KERNEL1x2 - - vldm AO!, { d0 } - vldm BO!, { d4, d5 } - - vmul.f64 d6 , d0 , d4 - vadd.f64 d8 , d8 , d6 - - vmul.f64 d6 , d0 , d5 - vadd.f64 d12, d12, d6 - -.endm - -.macro SAVE1x2 - - vldr d0, ALPHA - - vmul.f64 d8 , d0 , d8 - vmul.f64 d12, d0 , d12 - - vldm CO1, { d0 } - vldm CO2, { d4 } - - vadd.f64 d8 , d8 , d0 - vadd.f64 d12, d12, d4 - - vstm CO1!, { d8 } - vstm CO2!, { d12} - -.endm - -/*************************************************************************************/ - -.macro INIT8x1 - - vsub.f64 d8 , d8 , d8 - vsub.f64 d9 , d9 , d9 - vsub.f64 d10, d10, d10 - vsub.f64 d11, d11, d11 
- vsub.f64 d12, d12, d12 - vsub.f64 d13, d13, d13 - vsub.f64 d14, d14, d14 - vsub.f64 d15, d15, d15 - -.endm - -.macro KERNEL8x1 - - vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } - vldm BO!, { d24 } - - vmul.f64 d26 , d0 , d24 - vmul.f64 d27 , d1 , d24 - vadd.f64 d8 , d8 , d26 - vadd.f64 d9 , d9 , d27 - - vmul.f64 d28 , d2 , d24 - vmul.f64 d29 , d3 , d24 - vadd.f64 d10 , d10, d28 - vadd.f64 d11 , d11, d29 - - vmul.f64 d26 , d4 , d24 - vmul.f64 d27 , d5 , d24 - vadd.f64 d12 , d12, d26 - vadd.f64 d13 , d13, d27 - - vmul.f64 d28 , d6 , d24 - vmul.f64 d29 , d7 , d24 - vadd.f64 d14 , d14, d28 - vadd.f64 d15 , d15, d29 - - -.endm - -.macro SAVE8x1 - - vldr d0, ALPHA - - vmul.f64 d8 , d0 , d8 - vmul.f64 d9 , d0 , d9 - vmul.f64 d10, d0 , d10 - vmul.f64 d11, d0 , d11 - vmul.f64 d12, d0 , d12 - vmul.f64 d13, d0 , d13 - vmul.f64 d14, d0 , d14 - vmul.f64 d15, d0 , d15 - - vldm CO1, { d0, d1 , d2 , d3 , d4 , d5 , d6 , d7 } - - vadd.f64 d8 , d8 , d0 - vadd.f64 d9 , d9 , d1 - vadd.f64 d10, d10, d2 - vadd.f64 d11, d11, d3 - - vadd.f64 d12, d12, d4 - vadd.f64 d13, d13, d5 - vadd.f64 d14, d14, d6 - vadd.f64 d15, d15, d7 - - vstm CO1!, { d8 , d9 , d10 , d11 , d12, d13 ,d14 , d15 } - -.endm - - -/*************************************************************************************/ - -.macro INIT4x1 - - vsub.f64 d8 , d8 , d8 - vsub.f64 d9 , d9 , d9 - vsub.f64 d10, d10, d10 - vsub.f64 d11, d11, d11 - -.endm - -.macro KERNEL4x1 - - vldm AO!, { d0, d1 , d2, d3 } - vldm BO!, { d4 } - - vmul.f64 d6 , d0 , d4 - vmul.f64 d7 , d1 , d4 - vadd.f64 d8 , d8 , d6 - vadd.f64 d9 , d9 , d7 - - vmul.f64 d6 , d2 , d4 - vmul.f64 d7 , d3 , d4 - vadd.f64 d10, d10, d6 - vadd.f64 d11, d11, d7 - -.endm - -.macro SAVE4x1 - - vldr d0, ALPHA - - vmul.f64 d8 , d0 , d8 - vmul.f64 d9 , d0 , d9 - vmul.f64 d10, d0 , d10 - vmul.f64 d11, d0 , d11 - - vldm CO1, { d0, d1 , d2 , d3 } - - vadd.f64 d8 , d8 , d0 - vadd.f64 d9 , d9 , d1 - vadd.f64 d10, d10, d2 - vadd.f64 d11, d11, d3 - - vstm CO1!, { d8 , d9 , d10 , d11 } 
- -.endm - -/*************************************************************************************/ - -.macro INIT2x1 - - vsub.f64 d8 , d8 , d8 - vsub.f64 d9 , d9 , d9 - -.endm - -.macro KERNEL2x1 - - vldm AO!, { d0, d1 } - vldm BO!, { d4 } - - vmul.f64 d6 , d0 , d4 - vmul.f64 d7 , d1 , d4 - vadd.f64 d8 , d8 , d6 - vadd.f64 d9 , d9 , d7 - -.endm - -.macro SAVE2x1 - - vldr d0, ALPHA - - vmul.f64 d8 , d0 , d8 - vmul.f64 d9 , d0 , d9 - - vldm CO1, { d0, d1 } - - vadd.f64 d8 , d8 , d0 - vadd.f64 d9 , d9 , d1 - - vstm CO1!, { d8 , d9 } - -.endm - -/*************************************************************************************/ - -.macro INIT1x1 - - vsub.f64 d8 , d8 , d8 - -.endm - -.macro KERNEL1x1 - - vldm AO!, { d0 } - vldm BO!, { d4 } - - vmul.f64 d6 , d0 , d4 - vadd.f64 d8 , d8 , d6 - -.endm - -.macro SAVE1x1 - - vldr d0, ALPHA - - vmul.f64 d8 , d0 , d8 - - vldm CO1, { d0 } - - vadd.f64 d8 , d8 , d0 - - vstm CO1!, { d8 } - -.endm - - - -/************************************************************************************** -* End of macro definitions -**************************************************************************************/ - - PROLOGUE - - .align 5 - - push {r4 - r9, fp} - add fp, sp, #24 - sub sp, sp, #STACKSIZE // reserve stack - - str OLD_M, M - str OLD_N, N - str OLD_K, K - str OLD_A, A - vstr OLD_ALPHA, ALPHA - - sub r3, fp, #128 - vstm r3, { d8 - d15} // store floating point registers - - ldr r3, OLD_LDC - lsl r3, r3, #3 // ldc = ldc * 8 - str r3, LDC - - ldr r3, OLD_C - str r3, C - - ldr K1, K - ldr BC, B - - ldr J, N - asrs J, J, #1 // J = J / 2 - ble _L1_BEGIN - -_L2_BEGIN: - - ldr CO1, C // CO1 = C - ldr r4 , LDC - add CO2, CO1, r4 // CO2 = C + LDC - add r3 , CO2, r4 // C = CO2 + LDC - str r3 , C // store C - - ldr AO, A // AO = A - pld [AO , #A_PRE-96] - pld [AO , #A_PRE-64] - pld [AO , #A_PRE-32] - -_L2_M8_BEGIN: - - ldr I, M - asrs I, I, #3 // I = I / 8 - ble _L2_M4_BEGIN - -_L2_M8_20: - - pld [CO1, #C_PRE] - pld [CO1, #C_PRE+32] - 
pld [CO2, #C_PRE] - pld [CO2, #C_PRE+32] - INIT8x2 - - mov BO, BC - asrs L , K1, #3 // L = L / 8 - ble _L2_M8_40 - .align 5 - -_L2_M8_22: - - pld [BO , #B_PRE] - KERNEL8x2 - KERNEL8x2 - pld [BO , #B_PRE] - KERNEL8x2 - KERNEL8x2 - - pld [BO , #B_PRE] - KERNEL8x2 - KERNEL8x2 - pld [BO , #B_PRE] - KERNEL8x2 - KERNEL8x2 - - subs L, L, #1 - bgt _L2_M8_22 - - -_L2_M8_40: - - ands L , K1, #7 // L = L % 8 - ble _L2_M8_100 - -_L2_M8_42: - - KERNEL8x2 - - subs L, L, #1 - bgt _L2_M8_42 - -_L2_M8_100: - - SAVE8x2 - -_L2_M8_END: - - subs I, I, #1 - bgt _L2_M8_20 - - -_L2_M4_BEGIN: - - ldr I, M - tst I , #7 - ble _L2_END - - tst I , #4 - ble _L2_M2_BEGIN - -_L2_M4_20: - - INIT4x2 - - mov BO, BC - asrs L , K1, #3 // L = L / 8 - ble _L2_M4_40 - .align 5 - -_L2_M4_22: - - KERNEL4x2 - KERNEL4x2 - KERNEL4x2 - KERNEL4x2 - - KERNEL4x2 - KERNEL4x2 - KERNEL4x2 - KERNEL4x2 - - subs L, L, #1 - bgt _L2_M4_22 - - -_L2_M4_40: - - ands L , K1, #7 // L = L % 8 - ble _L2_M4_100 - -_L2_M4_42: - - KERNEL4x2 - - subs L, L, #1 - bgt _L2_M4_42 - -_L2_M4_100: - - SAVE4x2 - -_L2_M4_END: - - - -_L2_M2_BEGIN: - - tst I, #2 // I = I / 2 - ble _L2_M1_BEGIN - -_L2_M2_20: - - INIT2x2 - - mov BO, BC - asrs L , K1, #2 // L = L / 4 - ble _L2_M2_40 - -_L2_M2_22: - - KERNEL2x2 - KERNEL2x2 - KERNEL2x2 - KERNEL2x2 - - subs L, L, #1 - bgt _L2_M2_22 - - -_L2_M2_40: - - ands L , K1, #3 // L = L % 4 - ble _L2_M2_100 - -_L2_M2_42: - - KERNEL2x2 - - subs L, L, #1 - bgt _L2_M2_42 - -_L2_M2_100: - - SAVE2x2 - -_L2_M2_END: - - -_L2_M1_BEGIN: - - tst I, #1 // I = I % 2 - ble _L2_END - -_L2_M1_20: - - INIT1x2 - - mov BO, BC - asrs L , K1, #2 // L = L / 4 - ble _L2_M1_40 - -_L2_M1_22: - - KERNEL1x2 - KERNEL1x2 - KERNEL1x2 - KERNEL1x2 - - subs L, L, #1 - bgt _L2_M1_22 - - -_L2_M1_40: - - ands L , K1, #3 // L = L % 4 - ble _L2_M1_100 - -_L2_M1_42: - - KERNEL1x2 - - subs L, L, #1 - bgt _L2_M1_42 - -_L2_M1_100: - - SAVE1x2 - - -_L2_END: - - mov r3, BC - mov r4, K1 - lsl r4, r4, #4 // k * 2 * 8 - add r3, r3, r4 // B = B + K * 2 * 8 
- mov BC, r3 - - subs J , #1 // j-- - bgt _L2_BEGIN - - -_L1_BEGIN: - - ldr J, N - tst J , #1 // J = J % 2 - ble _L999 - - ldr CO1, C // CO1 = C - ldr r4 , LDC - add r3 , CO1, r4 // C = CO1 + LDC - str r3 , C // store C - - ldr AO, A // AO = A - - - -_L1_M8_BEGIN: - - ldr I, M - asrs I, I, #3 // I = I / 8 - ble _L1_M4_BEGIN - -_L1_M8_20: - - INIT8x1 - - mov BO, BC - asrs L , K1, #3 // L = L / 8 - ble _L1_M8_40 - -_L1_M8_22: - - KERNEL8x1 - KERNEL8x1 - KERNEL8x1 - KERNEL8x1 - KERNEL8x1 - KERNEL8x1 - KERNEL8x1 - KERNEL8x1 - - subs L, L, #1 - bgt _L1_M8_22 - - -_L1_M8_40: - - ands L , K1, #7 // L = L % 8 - ble _L1_M8_100 - -_L1_M8_42: - - KERNEL8x1 - - subs L, L, #1 - bgt _L1_M8_42 - -_L1_M8_100: - - SAVE8x1 - -_L1_M8_END: - - subs I, I, #1 - bgt _L1_M8_20 - - - - -_L1_M4_BEGIN: - - ldr I, M - tst I, #7 // I = I % 8 - ble _L1_END - - tst I, #4 // I = I % 8 - ble _L1_M2_BEGIN - -_L1_M4_20: - - INIT4x1 - - mov BO, BC - asrs L , K1, #2 // L = L / 4 - ble _L1_M4_40 - -_L1_M4_22: - - KERNEL4x1 - KERNEL4x1 - KERNEL4x1 - KERNEL4x1 - - subs L, L, #1 - bgt _L1_M4_22 - - -_L1_M4_40: - - ands L , K1, #3 // L = L % 4 - ble _L1_M4_100 - -_L1_M4_42: - - KERNEL4x1 - - subs L, L, #1 - bgt _L1_M4_42 - -_L1_M4_100: - - SAVE4x1 - -_L1_M4_END: - - - - -_L1_M2_BEGIN: - - tst I, #2 // I = I % 4 - ble _L1_M1_BEGIN - -_L1_M2_20: - - INIT2x1 - - mov BO, BC - asrs L , K1, #2 // L = L / 4 - ble _L1_M2_40 - -_L1_M2_22: - - KERNEL2x1 - KERNEL2x1 - KERNEL2x1 - KERNEL2x1 - - subs L, L, #1 - bgt _L1_M2_22 - - -_L1_M2_40: - - ands L , K1, #3 // L = L % 4 - ble _L1_M2_100 - -_L1_M2_42: - - KERNEL2x1 - - subs L, L, #1 - bgt _L1_M2_42 - -_L1_M2_100: - - SAVE2x1 - -_L1_M2_END: - - - -_L1_M1_BEGIN: - - tst I, #1 // I = I % 4 - ble _L1_END - -_L1_M1_20: - - INIT1x1 - - mov BO, BC - asrs L , K1, #2 // L = L / 4 - ble _L1_M1_40 - -_L1_M1_22: - - KERNEL1x1 - KERNEL1x1 - KERNEL1x1 - KERNEL1x1 - - subs L, L, #1 - bgt _L1_M1_22 - - -_L1_M1_40: - - ands L , K1, #3 // L = L % 4 - ble _L1_M1_100 - -_L1_M1_42: - - 
KERNEL1x1 - - subs L, L, #1 - bgt _L1_M1_42 - -_L1_M1_100: - - SAVE1x1 - - -_L1_END: - - - -_L999: - - sub r3, fp, #128 - vldm r3, { d8 - d15} // restore floating point registers - - movs r0, #0 // set return value - sub sp, fp, #24 - pop {r4 - r9, fp} - bx lr - - EPILOGUE - diff --git a/kernel/arm/dtrmm_kernel_8x2_vfpv3.S b/kernel/arm/dtrmm_kernel_8x2_vfpv3.S deleted file mode 100644 index 930616635..000000000 --- a/kernel/arm/dtrmm_kernel_8x2_vfpv3.S +++ /dev/null @@ -1,1521 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* 2013/09/28 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -**************************************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define STACKSIZE 252 - -#define OLD_M r0 -#define OLD_N r1 -#define OLD_K r2 -#define OLD_A r3 -#define OLD_ALPHA d0 - -/****************************************************** -* [fp, #-128] - [fp, #-64] is reserved -* for store and restore of floating point -* registers -*******************************************************/ - - -#define KK [fp, #-240 ] -#define KKK [fp, #-244] -#define C [fp, #-248 ] -#define LDC [fp, #-252 ] -#define M [fp, #-256 ] -#define N [fp, #-260 ] -#define K [fp, #-264 ] -#define A [fp, #-268 ] - -#define ALPHA [fp, #-276 ] - -#define B [fp, #4 ] -#define OLD_C [fp, #8 ] -#define OLD_LDC [fp, #12 ] -#define OFFSET [fp, #16 ] - -#define I r0 -#define J r1 -#define L r2 - -#define AO r5 -#define BO r6 - -#define CO1 r8 -#define CO2 r9 - -#define K1 r7 -#define BC r12 - -#define A_PRE 128 -#define A_PRE1 160 -#define B_PRE 128 -#define C_PRE 32 - -/************************************************************************************** -* Macro definitions -**************************************************************************************/ - -.macro INIT8x2 - - 
vsub.f64 d8 , d8 , d8 - vmov.f64 d9 , d8 - vmov.f64 d10, d8 - vmov.f64 d11, d8 - vmov.f64 d12, d8 - vmov.f64 d13, d8 - vmov.f64 d14, d8 - vmov.f64 d15, d8 - vmov.f64 d16, d8 - vmov.f64 d17, d8 - vmov.f64 d18, d8 - vmov.f64 d19, d8 - vmov.f64 d20, d8 - vmov.f64 d21, d8 - vmov.f64 d22, d8 - vmov.f64 d23, d8 - -.endm - -.macro KERNEL8x2_START - - vldm BO!, { d24 , d25} - vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } - - vmul.f64 d26 , d0 , d24 - vmul.f64 d29 , d0 , d25 - vmul.f64 d27 , d1 , d24 - vmul.f64 d30 , d1 , d25 - vmul.f64 d28 , d2 , d24 - vmul.f64 d31 , d2 , d25 - - pld [AO , #A_PRE] - - vadd.f64 d8 , d8 , d26 - vadd.f64 d16 , d16, d29 - vadd.f64 d9 , d9 , d27 - vadd.f64 d17 , d17, d30 - vadd.f64 d10 , d10, d28 - vadd.f64 d18 , d18, d31 - - vmul.f64 d26 , d3 , d24 - vmul.f64 d27 , d4 , d24 - vmul.f64 d28 , d5 , d24 - vmul.f64 d29 , d3 , d25 - vmul.f64 d30 , d4 , d25 - vmul.f64 d31 , d5 , d25 - - pld [AO , #A_PRE1] - - vadd.f64 d11 , d11, d26 - vadd.f64 d12 , d12, d27 - vadd.f64 d13 , d13, d28 - vadd.f64 d19 , d19, d29 - vadd.f64 d20 , d20, d30 - vadd.f64 d21 , d21, d31 - - vmul.f64 d26 , d6 , d24 - vmul.f64 d27 , d7 , d24 - vmul.f64 d29 , d6 , d25 - vmul.f64 d30 , d7 , d25 - - vldm BO!, { d24 , d25} - vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } - - vadd.f64 d14 , d14, d26 - vadd.f64 d15 , d15, d27 - vadd.f64 d22 , d22, d29 - vadd.f64 d23 , d23, d30 - -.endm - - -.macro KERNEL8x2_M - - vmul.f64 d26 , d0 , d24 - vmul.f64 d29 , d0 , d25 - vmul.f64 d27 , d1 , d24 - vmul.f64 d30 , d1 , d25 - vmul.f64 d28 , d2 , d24 - vmul.f64 d31 , d2 , d25 - - pld [AO , #A_PRE] - - vadd.f64 d8 , d8 , d26 - vadd.f64 d16 , d16, d29 - vadd.f64 d9 , d9 , d27 - vadd.f64 d17 , d17, d30 - vadd.f64 d10 , d10, d28 - vadd.f64 d18 , d18, d31 - - vmul.f64 d26 , d3 , d24 - vmul.f64 d27 , d4 , d24 - vmul.f64 d28 , d5 , d24 - vmul.f64 d29 , d3 , d25 - vmul.f64 d30 , d4 , d25 - vmul.f64 d31 , d5 , d25 - - pld [AO , #A_PRE1] - - vadd.f64 d11 , d11, d26 - vadd.f64 d12 , d12, d27 - vadd.f64 
d13 , d13, d28 - vadd.f64 d19 , d19, d29 - vadd.f64 d20 , d20, d30 - vadd.f64 d21 , d21, d31 - - vmul.f64 d26 , d6 , d24 - vmul.f64 d27 , d7 , d24 - vmul.f64 d29 , d6 , d25 - vmul.f64 d30 , d7 , d25 - - vldm BO!, { d24 , d25} - vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } - - vadd.f64 d14 , d14, d26 - vadd.f64 d15 , d15, d27 - vadd.f64 d22 , d22, d29 - vadd.f64 d23 , d23, d30 - -.endm - - -.macro KERNEL8x2_END - - vmul.f64 d26 , d0 , d24 - vmul.f64 d29 , d0 , d25 - vmul.f64 d27 , d1 , d24 - vmul.f64 d30 , d1 , d25 - vmul.f64 d28 , d2 , d24 - vmul.f64 d31 , d2 , d25 - - pld [AO , #A_PRE] - - vadd.f64 d8 , d8 , d26 - vadd.f64 d16 , d16, d29 - vadd.f64 d9 , d9 , d27 - vadd.f64 d17 , d17, d30 - vadd.f64 d10 , d10, d28 - vadd.f64 d18 , d18, d31 - - vmul.f64 d26 , d3 , d24 - vmul.f64 d27 , d4 , d24 - vmul.f64 d28 , d5 , d24 - vmul.f64 d29 , d3 , d25 - vmul.f64 d30 , d4 , d25 - vmul.f64 d31 , d5 , d25 - - pld [AO , #A_PRE1] - - vadd.f64 d11 , d11, d26 - vadd.f64 d12 , d12, d27 - vadd.f64 d13 , d13, d28 - vadd.f64 d19 , d19, d29 - vadd.f64 d20 , d20, d30 - vadd.f64 d21 , d21, d31 - - vmul.f64 d26 , d6 , d24 - vmul.f64 d27 , d7 , d24 - vmul.f64 d29 , d6 , d25 - vmul.f64 d30 , d7 , d25 - - vadd.f64 d14 , d14, d26 - vadd.f64 d15 , d15, d27 - vadd.f64 d22 , d22, d29 - vadd.f64 d23 , d23, d30 - -.endm - - - - -.macro KERNEL8x2 - - vldm BO!, { d24 , d25} - vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } - - vmul.f64 d26 , d0 , d24 - vmul.f64 d29 , d0 , d25 - vmul.f64 d27 , d1 , d24 - vmul.f64 d30 , d1 , d25 - vmul.f64 d28 , d2 , d24 - vmul.f64 d31 , d2 , d25 - - pld [AO , #A_PRE] - - vadd.f64 d8 , d8 , d26 - vadd.f64 d16 , d16, d29 - vadd.f64 d9 , d9 , d27 - vadd.f64 d17 , d17, d30 - vadd.f64 d10 , d10, d28 - vadd.f64 d18 , d18, d31 - - vmul.f64 d26 , d3 , d24 - vmul.f64 d27 , d4 , d24 - vmul.f64 d28 , d5 , d24 - vmul.f64 d29 , d3 , d25 - vmul.f64 d30 , d4 , d25 - vmul.f64 d31 , d5 , d25 - - pld [AO , #A_PRE1] - - vadd.f64 d11 , d11, d26 - vadd.f64 d12 , d12, d27 - vadd.f64 
d13 , d13, d28 - vadd.f64 d19 , d19, d29 - vadd.f64 d20 , d20, d30 - vadd.f64 d21 , d21, d31 - - vmul.f64 d26 , d6 , d24 - vmul.f64 d27 , d7 , d24 - vmul.f64 d29 , d6 , d25 - vmul.f64 d30 , d7 , d25 - - vadd.f64 d14 , d14, d26 - vadd.f64 d15 , d15, d27 - vadd.f64 d22 , d22, d29 - vadd.f64 d23 , d23, d30 - -.endm - -.macro SAVE8x2 - - vldr d0, ALPHA - - vmul.f64 d8 , d0 , d8 - vmul.f64 d9 , d0 , d9 - vmul.f64 d10, d0 , d10 - vmul.f64 d11, d0 , d11 - vmul.f64 d12, d0 , d12 - vmul.f64 d13, d0 , d13 - vmul.f64 d14, d0 , d14 - vmul.f64 d15, d0 , d15 - - vmul.f64 d16, d0 , d16 - vmul.f64 d17, d0 , d17 - vmul.f64 d18, d0 , d18 - vmul.f64 d19, d0 , d19 - vmul.f64 d20, d0 , d20 - vmul.f64 d21, d0 , d21 - vmul.f64 d22, d0 , d22 - vmul.f64 d23, d0 , d23 - - vstm CO2!, { d16, d17, d18 , d19 , d20 , d21 , d22 , d23 } - vstm CO1!, { d8 , d9 , d10 , d11 , d12, d13 , d14 , d15 } - -.endm - - -/*************************************************************************************/ - - -.macro INIT4x2 - - vsub.f64 d8 , d8 , d8 - vsub.f64 d9 , d9 , d9 - vsub.f64 d10, d10, d10 - vsub.f64 d11, d11, d11 - vsub.f64 d12, d12, d12 - vsub.f64 d13, d13, d13 - vsub.f64 d14, d14, d14 - vsub.f64 d15, d15, d15 - -.endm - -.macro KERNEL4x2 - - vldm AO!, { d0, d1 , d2, d3 } - vldm BO!, { d4, d5 } - - vmul.f64 d6 , d0 , d4 - vmul.f64 d7 , d1 , d4 - vadd.f64 d8 , d8 , d6 - vadd.f64 d9 , d9 , d7 - - vmul.f64 d6 , d2 , d4 - vmul.f64 d7 , d3 , d4 - vadd.f64 d10, d10, d6 - vadd.f64 d11, d11, d7 - - vmul.f64 d6 , d0 , d5 - vmul.f64 d7 , d1 , d5 - vadd.f64 d12, d12, d6 - vadd.f64 d13, d13, d7 - - vmul.f64 d6 , d2 , d5 - vmul.f64 d7 , d3 , d5 - vadd.f64 d14, d14, d6 - vadd.f64 d15, d15, d7 - -.endm - -.macro SAVE4x2 - - vldr d0, ALPHA - - vmul.f64 d8 , d0 , d8 - vmul.f64 d9 , d0 , d9 - vmul.f64 d10, d0 , d10 - vmul.f64 d11, d0 , d11 - vmul.f64 d12, d0 , d12 - vmul.f64 d13, d0 , d13 - vmul.f64 d14, d0 , d14 - vmul.f64 d15, d0 , d15 - - vstm CO1!, { d8 , d9 , d10 , d11 } - vstm CO2!, { d12, d13 ,d14 , d15 } - 
-.endm - - - -/*************************************************************************************/ - -.macro INIT2x2 - - vsub.f64 d8 , d8 , d8 - vsub.f64 d9 , d9 , d9 - vsub.f64 d12, d12, d12 - vsub.f64 d13, d13, d13 - -.endm - -.macro KERNEL2x2 - - vldm AO!, { d0, d1 } - vldm BO!, { d4, d5 } - - vmul.f64 d6 , d0 , d4 - vmul.f64 d7 , d1 , d4 - vadd.f64 d8 , d8 , d6 - vadd.f64 d9 , d9 , d7 - - vmul.f64 d6 , d0 , d5 - vmul.f64 d7 , d1 , d5 - vadd.f64 d12, d12, d6 - vadd.f64 d13, d13, d7 - -.endm - -.macro SAVE2x2 - - vldr d0, ALPHA - - vmul.f64 d8 , d0 , d8 - vmul.f64 d9 , d0 , d9 - vmul.f64 d12, d0 , d12 - vmul.f64 d13, d0 , d13 - - vstm CO1!, { d8 , d9 } - vstm CO2!, { d12, d13 } - -.endm - -/*************************************************************************************/ - -.macro INIT1x2 - - vsub.f64 d8 , d8 , d8 - vsub.f64 d12, d12, d12 - -.endm - -.macro KERNEL1x2 - - vldm AO!, { d0 } - vldm BO!, { d4, d5 } - - vmul.f64 d6 , d0 , d4 - vadd.f64 d8 , d8 , d6 - - vmul.f64 d6 , d0 , d5 - vadd.f64 d12, d12, d6 - -.endm - -.macro SAVE1x2 - - vldr d0, ALPHA - - vmul.f64 d8 , d0 , d8 - vmul.f64 d12, d0 , d12 - - vstm CO1!, { d8 } - vstm CO2!, { d12} - -.endm - -/*************************************************************************************/ - -.macro INIT8x1 - - vsub.f64 d8 , d8 , d8 - vsub.f64 d9 , d9 , d9 - vsub.f64 d10, d10, d10 - vsub.f64 d11, d11, d11 - vsub.f64 d12, d12, d12 - vsub.f64 d13, d13, d13 - vsub.f64 d14, d14, d14 - vsub.f64 d15, d15, d15 - -.endm - -.macro KERNEL8x1 - - vldm AO!, { d0, d1 , d2, d3, d4 , d5 , d6 , d7 } - vldm BO!, { d24 } - - vmul.f64 d26 , d0 , d24 - vmul.f64 d27 , d1 , d24 - vadd.f64 d8 , d8 , d26 - vadd.f64 d9 , d9 , d27 - - vmul.f64 d28 , d2 , d24 - vmul.f64 d29 , d3 , d24 - vadd.f64 d10 , d10, d28 - vadd.f64 d11 , d11, d29 - - vmul.f64 d26 , d4 , d24 - vmul.f64 d27 , d5 , d24 - vadd.f64 d12 , d12, d26 - vadd.f64 d13 , d13, d27 - - vmul.f64 d28 , d6 , d24 - vmul.f64 d29 , d7 , d24 - vadd.f64 d14 , d14, d28 - vadd.f64 
d15 , d15, d29 - - -.endm - -.macro SAVE8x1 - - vldr d0, ALPHA - - vmul.f64 d8 , d0 , d8 - vmul.f64 d9 , d0 , d9 - vmul.f64 d10, d0 , d10 - vmul.f64 d11, d0 , d11 - vmul.f64 d12, d0 , d12 - vmul.f64 d13, d0 , d13 - vmul.f64 d14, d0 , d14 - vmul.f64 d15, d0 , d15 - - vstm CO1!, { d8 , d9 , d10 , d11 , d12, d13 ,d14 , d15 } - -.endm - - -/*************************************************************************************/ - -.macro INIT4x1 - - vsub.f64 d8 , d8 , d8 - vsub.f64 d9 , d9 , d9 - vsub.f64 d10, d10, d10 - vsub.f64 d11, d11, d11 - -.endm - -.macro KERNEL4x1 - - vldm AO!, { d0, d1 , d2, d3 } - vldm BO!, { d4 } - - vmul.f64 d6 , d0 , d4 - vmul.f64 d7 , d1 , d4 - vadd.f64 d8 , d8 , d6 - vadd.f64 d9 , d9 , d7 - - vmul.f64 d6 , d2 , d4 - vmul.f64 d7 , d3 , d4 - vadd.f64 d10, d10, d6 - vadd.f64 d11, d11, d7 - -.endm - -.macro SAVE4x1 - - vldr d0, ALPHA - - vmul.f64 d8 , d0 , d8 - vmul.f64 d9 , d0 , d9 - vmul.f64 d10, d0 , d10 - vmul.f64 d11, d0 , d11 - - vstm CO1!, { d8 , d9 , d10 , d11 } - -.endm - -/*************************************************************************************/ - -.macro INIT2x1 - - vsub.f64 d8 , d8 , d8 - vsub.f64 d9 , d9 , d9 - -.endm - -.macro KERNEL2x1 - - vldm AO!, { d0, d1 } - vldm BO!, { d4 } - - vmul.f64 d6 , d0 , d4 - vmul.f64 d7 , d1 , d4 - vadd.f64 d8 , d8 , d6 - vadd.f64 d9 , d9 , d7 - -.endm - -.macro SAVE2x1 - - vldr d0, ALPHA - - vmul.f64 d8 , d0 , d8 - vmul.f64 d9 , d0 , d9 - - vstm CO1!, { d8 , d9 } - -.endm - -/*************************************************************************************/ - -.macro INIT1x1 - - vsub.f64 d8 , d8 , d8 - -.endm - -.macro KERNEL1x1 - - vldm AO!, { d0 } - vldm BO!, { d4 } - - vmul.f64 d6 , d0 , d4 - vadd.f64 d8 , d8 , d6 - -.endm - -.macro SAVE1x1 - - vldr d0, ALPHA - - vmul.f64 d8 , d0 , d8 - - vstm CO1!, { d8 } - -.endm - - - -/************************************************************************************** -* End of macro definitions 
-**************************************************************************************/ - - PROLOGUE - - .align 5 - - push {r4 - r9, fp} - add fp, sp, #24 - sub sp, sp, #STACKSIZE // reserve stack - - str OLD_M, M - str OLD_N, N - str OLD_K, K - str OLD_A, A - vstr OLD_ALPHA, ALPHA - - sub r3, fp, #128 - vstm r3, { d8 - d15} // store floating point registers - - ldr r3, OLD_LDC - lsl r3, r3, #3 // ldc = ldc * 8 - str r3, LDC - - ldr r3, OLD_C - str r3, C - - ldr BC, B - - ldr r3, OFFSET -#ifndef LEFT - neg r3 , r3 -#endif - str r3 , KK - - ldr J, N - asrs J, J, #1 // J = J / 2 - ble _L1_BEGIN - -_L2_BEGIN: - - ldr CO1, C // CO1 = C - ldr r4 , LDC - add CO2, CO1, r4 // CO2 = C + LDC - add r3 , CO2, r4 // C = CO2 + LDC - str r3 , C // store C - -#if defined(LEFT) - ldr r3 , OFFSET - str r3 , KK -#endif - - ldr AO, A // AO = A - -_L2_M8_BEGIN: - - ldr I, M - asrs I, I, #3 // I = I / 8 - ble _L2_M4_BEGIN - -_L2_M8_20: - - pld [CO1, #C_PRE] - pld [CO2, #C_PRE] - - INIT8x2 - -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - - mov BO, BC -#else - mov BO, BC - ldr r3 , KK - lsls r4 , r3 , #4 // 2 double values - add BO , BO , r4 - lsls r4 , r3 , #6 // 8 double values - add AO , AO , r4 - -#endif - -#ifndef TRMMKERNEL - ldr L , K -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - ldr L , K - ldr r3, KK - sub L , L, r3 - str L , KKK -#else - ldr L , KK -#ifdef LEFT - add L , L , #8 // number of values in AO -#else - add L , L , #2 // number of values in BO -#endif - str L , KKK -#endif - - asrs K1 , L, #3 // L = L / 8 - ble _L2_M8_40 - .align 5 - -_L2_M8_22: - - pld [BO , #B_PRE] - KERNEL8x2_START - KERNEL8x2_M - pld [BO , #B_PRE] - KERNEL8x2_M - KERNEL8x2_M - - pld [BO , #B_PRE] - KERNEL8x2_M - KERNEL8x2_M - pld [BO , #B_PRE] - KERNEL8x2_M - KERNEL8x2_END - - subs K1, K1, #1 - bgt _L2_M8_22 - - -_L2_M8_40: - - ands K1 , L, #7 // L = L % 8 - ble _L2_M8_100 - -_L2_M8_42: - - KERNEL8x2 - - subs K1, K1, #1 - 
bgt _L2_M8_42 - -_L2_M8_100: - - SAVE8x2 - -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - ldr r3 , K - ldr r4 , KKK - sub r3 , r3 , r4 - lsls r4 , r3 , #4 // 2 double values - add BO , BO , r4 - lsls r4 , r3 , #6 // 8 double values - add AO , AO , r4 -#endif - -#if defined(LEFT) - ldr r3 , KK - add r3 , r3 , #8 // number of values in AO - str r3 , KK -#endif - - -_L2_M8_END: - - subs I, I, #1 - bgt _L2_M8_20 - - -_L2_M4_BEGIN: - - ldr I, M - tst I , #7 - ble _L2_END - - tst I , #4 - ble _L2_M2_BEGIN - -_L2_M4_20: - - INIT4x2 - -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - - mov BO, BC -#else - mov BO, BC - ldr r3 , KK - lsls r4 , r3 , #4 // 2 double values - add BO , BO , r4 - lsls r4 , r3 , #5 // 4 double values - add AO , AO , r4 - -#endif - -#ifndef TRMMKERNEL - ldr L , K -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - ldr L , K - ldr r3, KK - sub L , L, r3 - str L , KKK -#else - ldr L , KK -#ifdef LEFT - add L , L , #4 // number of values in AO -#else - add L , L , #2 // number of values in BO -#endif - str L , KKK -#endif - - asrs K1, L, #3 // L = L / 8 - ble _L2_M4_40 - .align 5 - -_L2_M4_22: - - KERNEL4x2 - KERNEL4x2 - KERNEL4x2 - KERNEL4x2 - - KERNEL4x2 - KERNEL4x2 - KERNEL4x2 - KERNEL4x2 - - subs K1, K1, #1 - bgt _L2_M4_22 - - -_L2_M4_40: - - ands K1, L, #7 // L = L % 8 - ble _L2_M4_100 - -_L2_M4_42: - - KERNEL4x2 - - subs K1, K1, #1 - bgt _L2_M4_42 - -_L2_M4_100: - - SAVE4x2 - -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - ldr r3 , K - ldr r4 , KKK - sub r3 , r3 , r4 - lsls r4 , r3 , #4 // 2 double values - add BO , BO , r4 - lsls r4 , r3 , #5 // 4 double values - add AO , AO , r4 -#endif - -#if defined(LEFT) - ldr r3 , KK - add r3 , r3 , #4 // number of values in AO - str r3 , KK -#endif - -_L2_M4_END: - - - -_L2_M2_BEGIN: - - tst I, #2 // I = I / 2 - ble _L2_M1_BEGIN - -_L2_M2_20: - - INIT2x2 - -#if 
(defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - - mov BO, BC -#else - mov BO, BC - ldr r3 , KK - lsls r4 , r3 , #4 // 2 double values - add BO , BO , r4 - lsls r4 , r3 , #4 // 2 double values - add AO , AO , r4 - -#endif - -#ifndef TRMMKERNEL - ldr L , K -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - ldr L , K - ldr r3, KK - sub L , L, r3 - str L , KKK -#else - ldr L , KK -#ifdef LEFT - add L , L , #2 // number of values in AO -#else - add L , L , #2 // number of values in BO -#endif - str L , KKK -#endif - - asrs K1, L, #2 // L = L / 4 - ble _L2_M2_40 - -_L2_M2_22: - - KERNEL2x2 - KERNEL2x2 - KERNEL2x2 - KERNEL2x2 - - subs K1, K1, #1 - bgt _L2_M2_22 - - -_L2_M2_40: - - ands K1, L, #3 // L = L % 4 - ble _L2_M2_100 - -_L2_M2_42: - - KERNEL2x2 - - subs K1, K1, #1 - bgt _L2_M2_42 - -_L2_M2_100: - - SAVE2x2 - -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - ldr r3 , K - ldr r4 , KKK - sub r3 , r3 , r4 - lsls r4 , r3 , #4 // 2 double values - add BO , BO , r4 - lsls r4 , r3 , #4 // 2 double values - add AO , AO , r4 -#endif - -#if defined(LEFT) - ldr r3 , KK - add r3 , r3 , #2 // number of values in AO - str r3 , KK -#endif - - -_L2_M2_END: - - -_L2_M1_BEGIN: - - tst I, #1 // I = I % 2 - ble _L2_END - -_L2_M1_20: - - INIT1x2 - -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - - mov BO, BC -#else - mov BO, BC - ldr r3 , KK - lsls r4 , r3 , #4 // 2 double values - add BO , BO , r4 - lsls r4 , r3 , #3 // 1 double value - add AO , AO , r4 - -#endif - -#ifndef TRMMKERNEL - ldr L , K -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - ldr L , K - ldr r3, KK - sub L , L, r3 - str L , KKK -#else - ldr L , KK -#ifdef LEFT - add L , L , #1 // number of values in AO -#else - add L , L , #2 // number of values in BO -#endif - str L , KKK -#endif - - asrs K1, L, #2 // L = L / 4 - ble _L2_M1_40 - -_L2_M1_22: - - 
KERNEL1x2 - KERNEL1x2 - KERNEL1x2 - KERNEL1x2 - - subs K1, K1, #1 - bgt _L2_M1_22 - - -_L2_M1_40: - - ands K1, L, #3 // L = L % 4 - ble _L2_M1_100 - -_L2_M1_42: - - KERNEL1x2 - - subs K1, K1, #1 - bgt _L2_M1_42 - -_L2_M1_100: - - SAVE1x2 - -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - ldr r3 , K - ldr r4 , KKK - sub r3 , r3 , r4 - lsls r4 , r3 , #4 // 2 double values - add BO , BO , r4 - lsls r4 , r3 , #3 // 1 double value - add AO , AO , r4 -#endif - -#if defined(LEFT) - ldr r3 , KK - add r3 , r3 , #1 // number of values in AO - str r3 , KK -#endif - - - -_L2_END: - - mov r3, BC - ldr r4, K - lsl r4, r4, #4 // k * 2 * 8 - add r3, r3, r4 // B = B + K * 2 * 8 - mov BC, r3 - -#if !defined(LEFT) - ldr r3 , KK - add r3 , r3 , #2 // number of values in BO - str r3 , KK -#endif - - subs J , #1 // j-- - bgt _L2_BEGIN - - -_L1_BEGIN: - - ldr J, N - tst J , #1 // J = J % 2 - ble _L999 - - ldr CO1, C // CO1 = C - ldr r4 , LDC - add r3 , CO1, r4 // C = CO1 + LDC - str r3 , C // store C - -#if defined(LEFT) - ldr r3 , OFFSET - str r3 , KK -#endif - - ldr AO, A // AO = A - - - -_L1_M8_BEGIN: - - ldr I, M - asrs I, I, #3 // I = I / 8 - ble _L1_M4_BEGIN - -_L1_M8_20: - - INIT8x1 - -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - - mov BO, BC -#else - mov BO, BC - ldr r3 , KK - lsls r4 , r3 , #3 // 1 double value - add BO , BO , r4 - lsls r4 , r3 , #6 // 8 double values - add AO , AO , r4 - -#endif - -#ifndef TRMMKERNEL - ldr L , K -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - ldr L , K - ldr r3, KK - sub L , L, r3 - str L , KKK -#else - ldr L , KK -#ifdef LEFT - add L , L , #8 // number of values in AO -#else - add L , L , #1 // number of values in BO -#endif - str L , KKK -#endif - - asrs K1, L, #3 // L = L / 8 - ble _L1_M8_40 - -_L1_M8_22: - - KERNEL8x1 - KERNEL8x1 - KERNEL8x1 - KERNEL8x1 - KERNEL8x1 - KERNEL8x1 - KERNEL8x1 - KERNEL8x1 - - subs K1, K1, #1 - bgt 
_L1_M8_22 - - -_L1_M8_40: - - ands K1, L, #7 // L = L % 8 - ble _L1_M8_100 - -_L1_M8_42: - - KERNEL8x1 - - subs K1, K1, #1 - bgt _L1_M8_42 - -_L1_M8_100: - - SAVE8x1 - -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - ldr r3 , K - ldr r4 , KKK - sub r3 , r3 , r4 - lsls r4 , r3 , #3 // 1 double value - add BO , BO , r4 - lsls r4 , r3 , #6 // 8 double values - add AO , AO , r4 -#endif - -#if defined(LEFT) - ldr r3 , KK - add r3 , r3 , #8 // number of values in AO - str r3 , KK -#endif - - -_L1_M8_END: - - subs I, I, #1 - bgt _L1_M8_20 - - - - -_L1_M4_BEGIN: - - ldr I, M - tst I, #7 // I = I % 8 - ble _L1_END - - tst I, #4 // I = I % 8 - ble _L1_M2_BEGIN - -_L1_M4_20: - - INIT4x1 - -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - - mov BO, BC -#else - mov BO, BC - ldr r3 , KK - lsls r4 , r3 , #3 // 1 double value - add BO , BO , r4 - lsls r4 , r3 , #5 // 4 double values - add AO , AO , r4 - -#endif - -#ifndef TRMMKERNEL - ldr L , K -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - ldr L , K - ldr r3, KK - sub L , L, r3 - str L , KKK -#else - ldr L , KK -#ifdef LEFT - add L , L , #4 // number of values in AO -#else - add L , L , #1 // number of values in BO -#endif - str L , KKK -#endif - - asrs K1, L, #2 // L = L / 4 - ble _L1_M4_40 - -_L1_M4_22: - - KERNEL4x1 - KERNEL4x1 - KERNEL4x1 - KERNEL4x1 - - subs K1, K1, #1 - bgt _L1_M4_22 - - -_L1_M4_40: - - ands K1, L, #3 // L = L % 4 - ble _L1_M4_100 - -_L1_M4_42: - - KERNEL4x1 - - subs K1, K1, #1 - bgt _L1_M4_42 - -_L1_M4_100: - - SAVE4x1 - -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - ldr r3 , K - ldr r4 , KKK - sub r3 , r3 , r4 - lsls r4 , r3 , #3 // 1 double value - add BO , BO , r4 - lsls r4 , r3 , #5 // 4 double values - add AO , AO , r4 -#endif - -#if defined(LEFT) - ldr r3 , KK - add r3 , r3 , #4 // number of values in AO - str r3 , KK -#endif - - -_L1_M4_END: - - - - 
-_L1_M2_BEGIN: - - tst I, #2 // I = I % 4 - ble _L1_M1_BEGIN - -_L1_M2_20: - - INIT2x1 - -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - - mov BO, BC -#else - mov BO, BC - ldr r3 , KK - lsls r4 , r3 , #3 // 1 double value - add BO , BO , r4 - lsls r4 , r3 , #4 // 2 double values - add AO , AO , r4 - -#endif - -#ifndef TRMMKERNEL - ldr L , K -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - ldr L , K - ldr r3, KK - sub L , L, r3 - str L , KKK -#else - ldr L , KK -#ifdef LEFT - add L , L , #2 // number of values in AO -#else - add L , L , #1 // number of values in BO -#endif - str L , KKK -#endif - - asrs K1, L, #2 // L = L / 4 - ble _L1_M2_40 - -_L1_M2_22: - - KERNEL2x1 - KERNEL2x1 - KERNEL2x1 - KERNEL2x1 - - subs K1, K1, #1 - bgt _L1_M2_22 - - -_L1_M2_40: - - ands K1 , L, #3 // L = L % 4 - ble _L1_M2_100 - -_L1_M2_42: - - KERNEL2x1 - - subs K1, K1, #1 - bgt _L1_M2_42 - -_L1_M2_100: - - SAVE2x1 - -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - ldr r3 , K - ldr r4 , KKK - sub r3 , r3 , r4 - lsls r4 , r3 , #3 // 1 double value - add BO , BO , r4 - lsls r4 , r3 , #4 // 2 double values - add AO , AO , r4 -#endif - -#if defined(LEFT) - ldr r3 , KK - add r3 , r3 , #2 // number of values in AO - str r3 , KK -#endif - - -_L1_M2_END: - - - -_L1_M1_BEGIN: - - tst I, #1 // I = I % 4 - ble _L1_END - -_L1_M1_20: - - INIT1x1 - -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - - mov BO, BC -#else - mov BO, BC - ldr r3 , KK - lsls r4 , r3 , #3 // 1 double value - add BO , BO , r4 - lsls r4 , r3 , #3 // 1 double value - add AO , AO , r4 - -#endif - -#ifndef TRMMKERNEL - ldr L , K -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - ldr L , K - ldr r3, KK - sub L , L, r3 - str L , KKK -#else - ldr L , KK -#ifdef LEFT - add L , L , #1 // number of values in AO -#else - add L , L , #1 // number of values in BO 
-#endif - str L , KKK -#endif - - asrs K1, L, #2 // L = L / 4 - ble _L1_M1_40 - -_L1_M1_22: - - KERNEL1x1 - KERNEL1x1 - KERNEL1x1 - KERNEL1x1 - - subs K1, K1, #1 - bgt _L1_M1_22 - - -_L1_M1_40: - - ands K1 , L, #3 // L = L % 4 - ble _L1_M1_100 - -_L1_M1_42: - - KERNEL1x1 - - subs K1, K1, #1 - bgt _L1_M1_42 - -_L1_M1_100: - - SAVE1x1 - -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - ldr r3 , K - ldr r4 , KKK - sub r3 , r3 , r4 - lsls r4 , r3 , #3 // 1 double value - add BO , BO , r4 - lsls r4 , r3 , #3 // 1 double value - add AO , AO , r4 -#endif - -#if defined(LEFT) - ldr r3 , KK - add r3 , r3 , #1 // number of values in AO - str r3 , KK -#endif - - - -_L1_END: - - - -_L999: - - sub r3, fp, #128 - vldm r3, { d8 - d15} // restore floating point registers - - movs r0, #0 // set return value - sub sp, fp, #24 - pop {r4 - r9, fp} - bx lr - - EPILOGUE - From 5400a9f4e4c30d7a961983f733b81b607d09ba2d Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 3 Nov 2013 10:34:04 +0100 Subject: [PATCH 20/81] redefined functions for TIMING and YIELDING for ARMV7 processor --- common.h | 8 +++++ common_arm.h | 8 +++-- driver/level3/level3.c | 22 +++++++++++- driver/level3/level3_thread.c | 66 +++++++++++++++++++++++++++++++++++ 4 files changed, 100 insertions(+), 4 deletions(-) diff --git a/common.h b/common.h index 418ed25f5..a2775520f 100644 --- a/common.h +++ b/common.h @@ -310,10 +310,18 @@ typedef int blasint; #define YIELDING SwitchToThread() #endif + +#ifdef ARMV7 +#define YIELDING asm volatile ("nop;nop;nop;nop;nop;nop;nop;nop; \n"); +#endif + + #ifndef YIELDING #define YIELDING sched_yield() #endif + + /*** To alloc job_t on heap or statck. 
please https://github.com/xianyi/OpenBLAS/issues/246 diff --git a/common_arm.h b/common_arm.h index e3d1d4079..8c9752d9f 100644 --- a/common_arm.h +++ b/common_arm.h @@ -104,11 +104,13 @@ static void __inline blas_lock(volatile BLASULONG *address){ } -static inline BLASULONG rpcc(void){ - BLASULONG ret=0; +static inline unsigned long long rpcc(void){ + unsigned long long ret=0; + double v; struct timeval tv; gettimeofday(&tv,NULL); - ret=1000000* tv.tv_sec + tv.tv_usec; + v=(double) tv.tv_sec + (double) tv.tv_usec * 1e-6; + ret = (unsigned long long) ( v * 1000.0 ); /* wall-clock time in milliseconds; plain double literal, the 'd' suffix is a non-portable GNU extension */ return ret; } diff --git a/driver/level3/level3.c b/driver/level3/level3.c index 2fe889527..d87c5f546 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -36,6 +36,8 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ +// #define TIMING 1 + /* This file is a template for level 3 operation */ #ifndef BETA_OPERATION @@ -341,8 +343,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; -#else +#elif defined(ARMV7) + if (min_jj >= 32) min_jj = 32; + else + if (min_jj >= 16) min_jj = 16; + else + if (min_jj >= 8) min_jj = 8; + else + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; +#else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif @@ -402,12 +412,22 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #ifdef TIMING total = (double)outercost + (double)innercost + (double)kernelcost; +#ifdef ARMV7 + + printf( "Copy A : %5.2f Copy B: %5.2f Kernel : %5.2f\n", + innercost / total * 100., outercost / total * 100., + kernelcost / total * 100.); + + +#else + printf( "Copy A : %5.2f Copy B: %5.2f Kernel : %5.2f kernel Effi. : %5.2f Total Effi. 
: %5.2f\n", innercost / total * 100., outercost / total * 100., kernelcost / total * 100., (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / (double)kernelcost * 100. * (double)COMPSIZE / 2., (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / total * 100. * (double)COMPSIZE / 2.); +#endif #endif return 0; diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 3242790fa..56c4d6eca 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -36,6 +36,8 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ +// #define TIMING 1 + #ifndef CACHE_LINE_SIZE #define CACHE_LINE_SIZE 8 #endif @@ -233,6 +235,21 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, BLASLONG l1stride, l2size; #ifdef TIMING + +#ifdef ARMV7 + + unsigned long long rpcc_counter; + unsigned long long copy_A = 0; + unsigned long long copy_B = 0; + unsigned long long kernel = 0; + unsigned long long waiting1 = 0; + unsigned long long waiting2 = 0; + unsigned long long waiting3 = 0; + unsigned long long waiting6[MAX_CPU_NUMBER]; + unsigned long long ops = 0; + +#else + BLASULONG rpcc_counter; BLASULONG copy_A = 0; BLASULONG copy_B = 0; @@ -243,6 +260,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, BLASULONG waiting6[MAX_CPU_NUMBER]; BLASULONG ops = 0; +#endif + for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0; #endif @@ -320,15 +339,35 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, min_l = k - ls; +#ifdef ARMV7_1 + if (min_l >= GEMM_Q / 4 * 2) { + min_l = GEMM_Q / 4; + } else { + if (min_l > GEMM_Q / 4) min_l = (min_l + 1) / 2; + } + +#else if (min_l >= GEMM_Q * 2) { min_l = GEMM_Q; } else { if (min_l > GEMM_Q) min_l = (min_l + 1) / 2; } +#endif l1stride = 1; min_i = m_to - m_from; +#ifdef ARMV7_1 + if (min_i >= GEMM_P / 4 * 2) { + min_i = GEMM_P / 
4; + } else { + if (min_i > GEMM_P / 4) { + min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1); + } else { + if (args -> nthreads == 1) l1stride = 0; + } + } +#else if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else { @@ -339,6 +378,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } } +#endif + START_RPCC(); ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); @@ -375,6 +416,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; +#elif defined(ARMV7) + if (min_jj >= 16) min_jj = 16; + else + if (min_jj >= 8) min_jj = 8; + else + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + + #else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; @@ -506,6 +555,21 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, STOP_RPCC(waiting3); #ifdef TIMING + +#ifdef ARMV7 + + unsigned long long waiting = waiting1 + waiting2 + waiting3; + unsigned long long total = copy_A + copy_B + kernel + waiting; + + fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2f Copy_B : %6.2f Wait1 : %6.2f Wait2 : %6.2f Wait3 : %6.2f Kernel : %6.2f", + mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100., + (double)waiting1 /(double)total * 100., + (double)waiting2 /(double)total * 100., + (double)waiting3 /(double)total * 100., + (double)kernel /(double)total * 100.); + +#else + BLASLONG waiting = waiting1 + waiting2 + waiting3; BLASLONG total = copy_A + copy_B + kernel + waiting; @@ -516,6 +580,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, (double)waiting3 /(double)total * 100., (double)ops/(double)kernel / 4. 
* 100.); +#endif + #if 0 fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2ld Copy_B : %6.2ld Wait : %6.2ld\n", mypos, copy_A, copy_B, waiting); From cba97daf3c7dbb8d44e4fb52005bb843ada9df0d Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 3 Nov 2013 11:04:16 +0100 Subject: [PATCH 21/81] added missing file cblas_noconst.h to the armv7 branch --- cblas_noconst.h | 303 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 303 insertions(+) create mode 100644 cblas_noconst.h diff --git a/cblas_noconst.h b/cblas_noconst.h new file mode 100644 index 000000000..fd2e940c0 --- /dev/null +++ b/cblas_noconst.h @@ -0,0 +1,303 @@ +#ifndef CBLAS_H +#define CBLAS_H + +#include <stddef.h> /* size_t, which CBLAS_INDEX expands to below */ +#include "common.h" + +#ifdef __cplusplus +extern "C" { + /* Assume C declarations for C++ */ +#endif /* __cplusplus */ + +/*Set the number of threads on runtime.*/ +void openblas_set_num_threads(int num_threads); +void goto_set_num_threads(int num_threads); + +/*Get the build configure on runtime.*/ +char* openblas_get_config(void); + +/* Get the parallelization type which is used by OpenBLAS */ +int openblas_get_parallel(void); +/* OpenBLAS is compiled for sequential use */ +#define OPENBLAS_SEQUENTIAL 0 +/* OpenBLAS is compiled using normal threading model */ +#define OPENBLAS_THREAD 1 +/* OpenBLAS is compiled using OpenMP threading model */ +#define OPENBLAS_OPENMP 2 + + +#define CBLAS_INDEX size_t + +typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER; +typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE; +typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO; +typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG; +typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE; + +float cblas_sdsdot(blasint n, float alpha, float *x, blasint incx, float *y, blasint incy); +double cblas_dsdot (blasint n, float *x, blasint incx, float *y, blasint incy); +float
cblas_sdot(blasint n, float *x, blasint incx, float *y, blasint incy); +double cblas_ddot(blasint n, double *x, blasint incx, double *y, blasint incy); + +openblas_complex_float cblas_cdotu(blasint n, float *x, blasint incx, float *y, blasint incy); +openblas_complex_float cblas_cdotc(blasint n, float *x, blasint incx, float *y, blasint incy); +openblas_complex_double cblas_zdotu(blasint n, double *x, blasint incx, double *y, blasint incy); +openblas_complex_double cblas_zdotc(blasint n, double *x, blasint incx, double *y, blasint incy); + +void cblas_cdotu_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret); +void cblas_cdotc_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret); +void cblas_zdotu_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret); +void cblas_zdotc_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret); + +float cblas_sasum (blasint n, float *x, blasint incx); +double cblas_dasum (blasint n, double *x, blasint incx); +float cblas_scasum(blasint n, float *x, blasint incx); +double cblas_dzasum(blasint n, double *x, blasint incx); + +float cblas_snrm2 (blasint N, float *X, blasint incX); +double cblas_dnrm2 (blasint N, double *X, blasint incX); +float cblas_scnrm2(blasint N, float *X, blasint incX); +double cblas_dznrm2(blasint N, double *X, blasint incX); + +CBLAS_INDEX cblas_isamax(blasint n, float *x, blasint incx); +CBLAS_INDEX cblas_idamax(blasint n, double *x, blasint incx); +CBLAS_INDEX cblas_icamax(blasint n, float *x, blasint incx); +CBLAS_INDEX cblas_izamax(blasint n, double *x, blasint incx); + +void cblas_saxpy(blasint n, float alpha, float *x, blasint incx, float *y, blasint incy); +void cblas_daxpy(blasint n, double alpha, double *x, blasint incx, double *y, blasint incy); +void cblas_caxpy(blasint n, float *alpha, float *x, blasint incx, float *y, blasint incy); +void 
cblas_zaxpy(blasint n, double *alpha, double *x, blasint incx, double *y, blasint incy); + +void cblas_scopy(blasint n, float *x, blasint incx, float *y, blasint incy); +void cblas_dcopy(blasint n, double *x, blasint incx, double *y, blasint incy); +void cblas_ccopy(blasint n, float *x, blasint incx, float *y, blasint incy); +void cblas_zcopy(blasint n, double *x, blasint incx, double *y, blasint incy); + +void cblas_sswap(blasint n, float *x, blasint incx, float *y, blasint incy); +void cblas_dswap(blasint n, double *x, blasint incx, double *y, blasint incy); +void cblas_cswap(blasint n, float *x, blasint incx, float *y, blasint incy); +void cblas_zswap(blasint n, double *x, blasint incx, double *y, blasint incy); + +void cblas_srot(blasint N, float *X, blasint incX, float *Y, blasint incY, float c, float s); +void cblas_drot(blasint N, double *X, blasint incX, double *Y, blasint incY, double c, double s); + +void cblas_srotg(float *a, float *b, float *c, float *s); +void cblas_drotg(double *a, double *b, double *c, double *s); + +void cblas_srotm(blasint N, float *X, blasint incX, float *Y, blasint incY, float *P); +void cblas_drotm(blasint N, double *X, blasint incX, double *Y, blasint incY, double *P); + +void cblas_srotmg(float *d1, float *d2, float *b1, float b2, float *P); +void cblas_drotmg(double *d1, double *d2, double *b1, double b2, double *P); + +void cblas_sscal(blasint N, float alpha, float *X, blasint incX); +void cblas_dscal(blasint N, double alpha, double *X, blasint incX); +void cblas_cscal(blasint N, float *alpha, float *X, blasint incX); +void cblas_zscal(blasint N, double *alpha, double *X, blasint incX); +void cblas_csscal(blasint N, float alpha, float *X, blasint incX); +void cblas_zdscal(blasint N, double alpha, double *X, blasint incX); + +void cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, + float alpha, float *a, blasint lda, float *x, blasint incx, float beta, float *y, blasint incy); +void 
cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, + double alpha, double *a, blasint lda, double *x, blasint incx, double beta, double *y, blasint incy); +void cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, + float *alpha, float *a, blasint lda, float *x, blasint incx, float *beta, float *y, blasint incy); +void cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, + double *alpha, double *a, blasint lda, double *x, blasint incx, double *beta, double *y, blasint incy); + +void cblas_sger (enum CBLAS_ORDER order, blasint M, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); +void cblas_dger (enum CBLAS_ORDER order, blasint M, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); +void cblas_cgeru(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); +void cblas_cgerc(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); +void cblas_zgeru(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); +void cblas_zgerc(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); + +void cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); +void cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); +void cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float 
*X, blasint incX); +void cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); + +void cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); +void cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); +void cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); +void cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); + +void cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); +void cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); +void cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); +void cblas_zher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); + +void cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,blasint N, float alpha, float *X, + blasint incX, float *Y, blasint incY, float *A, blasint lda); +void cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, + blasint incX, double *Y, blasint incY, double *A, blasint lda); +void cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, + float *Y, blasint incY, float *A, blasint lda); +void cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO 
Uplo, blasint N, double *alpha, double *X, blasint incX, + double *Y, blasint incY, double *A, blasint lda); + +void cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, + blasint KL, blasint KU, float alpha, float *A, blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); +void cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, + blasint KL, blasint KU, double alpha, double *A, blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); +void cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, + blasint KL, blasint KU, float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); +void cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, + blasint KL, blasint KU, double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); + +void cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, float alpha, float *A, + blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); +void cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, double alpha, double *A, + blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); + + +void cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); +void cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); +void cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); +void cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, 
enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); + +void cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); +void cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); +void cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); +void cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); + +void cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, float *Ap, float *X, blasint incX); +void cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, double *Ap, double *X, blasint incX); +void cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, float *Ap, float *X, blasint incX); +void cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, double *Ap, double *X, blasint incX); + +void cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, float *Ap, float *X, blasint incX); +void cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, double *Ap, double *X, blasint incX); +void cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint 
N, float *Ap, float *X, blasint incX); +void cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, double *Ap, double *X, blasint incX); + +void cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *A, + blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); +void cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *A, + blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); +void cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *A, + blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); +void cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *A, + blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); + + +void cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *Ap, + float *X, blasint incX, float beta, float *Y, blasint incY); +void cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *Ap, + double *X, blasint incX, double beta, double *Y, blasint incY); + +void cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Ap); +void cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Ap); + +void cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A); +void cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,blasint incX, double *A); + +void cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A); +void cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, 
blasint incX, double *Y, blasint incY, double *A); +void cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *Ap); +void cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *Ap); + +void cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, + float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); +void cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, + double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); + +void cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, + float *alpha, float *Ap, float *X, blasint incX, float *beta, float *Y, blasint incY); +void cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, + double *alpha, double *Ap, double *X, blasint incX, double *beta, double *Y, blasint incY); + +void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, + float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); +void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, + double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); +void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, + float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); +void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, + double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint 
ldc); + +void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); +void cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); +void cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); +void cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); + +void cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); +void cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); +void cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, float *alpha, float *A, blasint lda, float *beta, float *C, blasint ldc); +void cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, double *alpha, double *A, blasint lda, double *beta, double *C, blasint ldc); + +void cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); +void cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint 
ldb, double beta, double *C, blasint ldc); +void cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); +void cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); + +void cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); +void cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); +void cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); +void cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); + +void cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); +void cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); +void cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float 
*A, blasint lda, float *B, blasint ldb); +void cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); + +void cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); +void cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); + +void cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, + float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); +void cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, + double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); + +void cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, + float *alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); +void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, + double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); + +void cblas_xerbla(blasint p, char *rout, char *form, ...); + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif From 95aedfa0ffd57d02cc271d579ed32a56ab6758a5 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 3 Nov 2013 11:19:32 +0100 Subject: [PATCH 22/81] added missing file arm/Makefile in lapack/laswp --- lapack/laswp/arm/Makefile | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 lapack/laswp/arm/Makefile diff --git 
a/lapack/laswp/arm/Makefile b/lapack/laswp/arm/Makefile new file mode 100644 index 000000000..434c82a84 --- /dev/null +++ b/lapack/laswp/arm/Makefile @@ -0,0 +1,33 @@ +TOPDIR = ../../.. +include ../../../Makefile.system + +ifeq ($(CORE), CORE2) +LASWP = ../generic/laswp_k_2.c +ZLASWP = ../generic/zlaswp_k_2.c +endif + +ifeq ($(CORE), OPTERON) +LASWP = ../generic/laswp_k_1.c +ZLASWP = ../generic/zlaswp_k_1.c +endif + +ifeq ($(CORE), PRESCOTT) +LASWP = ../generic/laswp_k_1.c +ZLASWP = ../generic/zlaswp_k_1.c +endif + +ifeq ($(DYNAMIC_ARCH), 1) +LASWP = ../generic/laswp_k_4.c +ZLASWP = ../generic/zlaswp_k_4.c +endif + +ifndef LASWP +LASWP = ../generic/laswp_k.c +endif + +ifndef ZLASWP +ZLASWP = ../generic/zlaswp_k.c +endif + +include ../generic/Makefile + From 370e3834a93e9d74a5ffacd5b92b9ecc6ed0411c Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 3 Nov 2013 11:54:39 +0100 Subject: [PATCH 23/81] added missing file kernel/arm/Makefile --- kernel/arm/Makefile | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 kernel/arm/Makefile diff --git a/kernel/arm/Makefile b/kernel/arm/Makefile new file mode 100644 index 000000000..efae70d7b --- /dev/null +++ b/kernel/arm/Makefile @@ -0,0 +1,2 @@ +clean :: + From 82015beaefcc8c702393bb9d20d4552a694753f3 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 5 Nov 2013 19:31:22 +0100 Subject: [PATCH 24/81] added zgemm_ncopy_2_vfpv3.S and made assembler labels unique --- kernel/arm/KERNEL.ARMV7 | 2 +- kernel/arm/dgemm_kernel_4x4_vfpv3.S | 307 ++++++++++++---------------- kernel/arm/dgemm_ncopy_4_vfpv3.S | 87 ++++---- kernel/arm/zgemm_kernel_2x2_vfpv3.S | 150 +++++++------- kernel/arm/zgemm_ncopy_2_vfpv3.S | 254 +++++++++++++++++++++++ 5 files changed, 512 insertions(+), 288 deletions(-) create mode 100644 kernel/arm/zgemm_ncopy_2_vfpv3.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index ec692d5c2..e30261698 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -115,7 +115,7 @@ CGEMMONCOPYOBJ = 
cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S -ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMONCOPY = zgemm_ncopy_2_vfpv3.S ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o diff --git a/kernel/arm/dgemm_kernel_4x4_vfpv3.S b/kernel/arm/dgemm_kernel_4x4_vfpv3.S index dfe3e3634..7d83def94 100644 --- a/kernel/arm/dgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/dgemm_kernel_4x4_vfpv3.S @@ -77,7 +77,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define B [fp, #4 ] #define C [fp, #8 ] -#define OLD_LDC [fp, #12 ] +#define OLDdgemm_kernel_LDC [fp, #12 ] #define I r0 #define J r1 @@ -883,7 +883,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. sub r3, fp, #128 vstm r3, { d8 - d15} // store floating point registers - ldr r3, OLD_LDC + ldr r3, OLDdgemm_kernel_LDC lsl r3, r3, #3 // ldc = ldc * 8 str r3, LDC @@ -892,9 +892,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ldr J, N asrs J, J, #2 // J = J / 4 - ble _L2_BEGIN + ble dgemm_kernel_L2_BEGIN -_L4_BEGIN: +dgemm_kernel_L4_BEGIN: ldr CO1, C // CO1 = C ldr r4 , LDC @@ -908,21 +908,19 @@ _L4_BEGIN: -_L4_M4_BEGIN: +dgemm_kernel_L4_M4_BEGIN: ldr I, M asrs I, I, #2 // I = I / 4 - ble _L4_M2_BEGIN + ble dgemm_kernel_L4_M2_BEGIN -_L4_M4_20: +dgemm_kernel_L4_M4_20: mov BO, BC asrs L , K1, #3 // L = L / 8 - cmp L , #3 - blt _L4_M4_30 - .align 5 - + cmp L , #2 + blt dgemm_kernel_L4_M4_32 KERNEL4x4_I @@ -935,9 +933,11 @@ _L4_M4_20: KERNEL4x4_M1 KERNEL4x4_M2 - sub L, L, #2 + subs L, L, #2 + ble dgemm_kernel_L4_M4_22a + .align 5 -_L4_M4_22: +dgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 @@ -950,7 +950,9 @@ _L4_M4_22: KERNEL4x4_M2 subs L, L, #1 - bgt _L4_M4_22 + bgt dgemm_kernel_L4_M4_22 + +dgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_M2 @@ -962,43 +964,12 @@ _L4_M4_22: KERNEL4x4_M1 KERNEL4x4_E - b _L4_M4_44 + b dgemm_kernel_L4_M4_44 - -_L4_M4_30: - tst L, #3 - ble _L4_M4_40 - - tst L, #2 - ble _L4_M4_32 - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - b _L4_M4_44 - -_L4_M4_32: +dgemm_kernel_L4_M4_32: tst L, #1 - ble _L4_M4_40 + ble dgemm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_M2 @@ -1010,54 +981,54 @@ _L4_M4_32: KERNEL4x4_M1 KERNEL4x4_E - b _L4_M4_44 + b dgemm_kernel_L4_M4_44 -_L4_M4_40: +dgemm_kernel_L4_M4_40: INIT4x4 -_L4_M4_44: +dgemm_kernel_L4_M4_44: ands L , K1, #7 // L = L % 8 - ble _L4_M4_100 + ble dgemm_kernel_L4_M4_100 -_L4_M4_46: +dgemm_kernel_L4_M4_46: KERNEL4x4_SUB subs L, L, #1 - bne _L4_M4_46 + bne dgemm_kernel_L4_M4_46 -_L4_M4_100: +dgemm_kernel_L4_M4_100: SAVE4x4 -_L4_M4_END: +dgemm_kernel_L4_M4_END: subs I, I, #1 - bne _L4_M4_20 + bne dgemm_kernel_L4_M4_20 -_L4_M2_BEGIN: +dgemm_kernel_L4_M2_BEGIN: ldr I, M tst I , #3 - ble _L4_END + ble dgemm_kernel_L4_END tst I, #2 // 
I = I / 2 - ble _L4_M1_BEGIN + ble dgemm_kernel_L4_M1_BEGIN -_L4_M2_20: +dgemm_kernel_L4_M2_20: INIT2x4 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L4_M2_40 + ble dgemm_kernel_L4_M2_40 -_L4_M2_22: +dgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1070,42 +1041,42 @@ _L4_M2_22: KERNEL2x4_SUB subs L, L, #1 - bgt _L4_M2_22 + bgt dgemm_kernel_L4_M2_22 -_L4_M2_40: +dgemm_kernel_L4_M2_40: ands L , K1, #7 // L = L % 8 - ble _L4_M2_100 + ble dgemm_kernel_L4_M2_100 -_L4_M2_42: +dgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs L, L, #1 - bgt _L4_M2_42 + bgt dgemm_kernel_L4_M2_42 -_L4_M2_100: +dgemm_kernel_L4_M2_100: SAVE2x4 -_L4_M2_END: +dgemm_kernel_L4_M2_END: -_L4_M1_BEGIN: +dgemm_kernel_L4_M1_BEGIN: tst I, #1 // I = I % 2 - ble _L4_END + ble dgemm_kernel_L4_END -_L4_M1_20: +dgemm_kernel_L4_M1_20: INIT1x4 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L4_M1_40 + ble dgemm_kernel_L4_M1_40 -_L4_M1_22: +dgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1117,27 +1088,27 @@ _L4_M1_22: KERNEL1x4_SUB subs L, L, #1 - bgt _L4_M1_22 + bgt dgemm_kernel_L4_M1_22 -_L4_M1_40: +dgemm_kernel_L4_M1_40: ands L , K1, #7 // L = L % 8 - ble _L4_M1_100 + ble dgemm_kernel_L4_M1_100 -_L4_M1_42: +dgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs L, L, #1 - bgt _L4_M1_42 + bgt dgemm_kernel_L4_M1_42 -_L4_M1_100: +dgemm_kernel_L4_M1_100: SAVE1x4 -_L4_END: +dgemm_kernel_L4_END: mov r3, BC mov r4, K1 @@ -1146,20 +1117,20 @@ _L4_END: mov BC, r3 subs J , #1 // j-- - bgt _L4_BEGIN + bgt dgemm_kernel_L4_BEGIN /*********************************************************************************************/ -_L2_BEGIN: +dgemm_kernel_L2_BEGIN: ldr J , N tst J , #3 - ble _L999 + ble dgemm_kernel_L999 tst J , #2 - ble _L1_BEGIN + ble dgemm_kernel_L1_BEGIN ldr CO1, C // CO1 = C ldr r4 , LDC @@ -1168,28 +1139,25 @@ _L2_BEGIN: str r3 , C // store C ldr AO, A // AO = A - //pld [AO , #A_PRE-96] - //pld [AO , #A_PRE-64] - //pld [AO , #A_PRE-32] -_L2_M4_BEGIN: +dgemm_kernel_L2_M4_BEGIN: ldr I, M asrs I, I, 
#2 // I = I / 4 - ble _L2_M2_BEGIN + ble dgemm_kernel_L2_M2_BEGIN -_L2_M4_20: +dgemm_kernel_L2_M4_20: INIT4x2 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L2_M4_40 + ble dgemm_kernel_L2_M4_40 .align 5 -_L2_M4_22: +dgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1201,49 +1169,49 @@ _L2_M4_22: KERNEL4x2_SUB subs L, L, #1 - bgt _L2_M4_22 + bgt dgemm_kernel_L2_M4_22 -_L2_M4_40: +dgemm_kernel_L2_M4_40: ands L , K1, #7 // L = L % 8 - ble _L2_M4_100 + ble dgemm_kernel_L2_M4_100 -_L2_M4_42: +dgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs L, L, #1 - bgt _L2_M4_42 + bgt dgemm_kernel_L2_M4_42 -_L2_M4_100: +dgemm_kernel_L2_M4_100: SAVE4x2 -_L2_M4_END: +dgemm_kernel_L2_M4_END: subs I, I, #1 - bgt _L2_M4_20 + bgt dgemm_kernel_L2_M4_20 -_L2_M2_BEGIN: +dgemm_kernel_L2_M2_BEGIN: ldr I, M tst I , #3 - ble _L2_END + ble dgemm_kernel_L2_END tst I, #2 // I = I / 2 - ble _L2_M1_BEGIN + ble dgemm_kernel_L2_M1_BEGIN -_L2_M2_20: +dgemm_kernel_L2_M2_20: INIT2x2 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L2_M2_40 + ble dgemm_kernel_L2_M2_40 -_L2_M2_22: +dgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1256,42 +1224,42 @@ _L2_M2_22: KERNEL2x2_SUB subs L, L, #1 - bgt _L2_M2_22 + bgt dgemm_kernel_L2_M2_22 -_L2_M2_40: +dgemm_kernel_L2_M2_40: ands L , K1, #7 // L = L % 8 - ble _L2_M2_100 + ble dgemm_kernel_L2_M2_100 -_L2_M2_42: +dgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs L, L, #1 - bgt _L2_M2_42 + bgt dgemm_kernel_L2_M2_42 -_L2_M2_100: +dgemm_kernel_L2_M2_100: SAVE2x2 -_L2_M2_END: +dgemm_kernel_L2_M2_END: -_L2_M1_BEGIN: +dgemm_kernel_L2_M1_BEGIN: tst I, #1 // I = I % 2 - ble _L2_END + ble dgemm_kernel_L2_END -_L2_M1_20: +dgemm_kernel_L2_M1_20: INIT1x2 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L2_M1_40 + ble dgemm_kernel_L2_M1_40 -_L2_M1_22: +dgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1303,27 +1271,27 @@ _L2_M1_22: KERNEL1x2_SUB subs L, L, #1 - bgt _L2_M1_22 + bgt dgemm_kernel_L2_M1_22 -_L2_M1_40: +dgemm_kernel_L2_M1_40: ands L , K1, #7 // 
L = L % 8 - ble _L2_M1_100 + ble dgemm_kernel_L2_M1_100 -_L2_M1_42: +dgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs L, L, #1 - bgt _L2_M1_42 + bgt dgemm_kernel_L2_M1_42 -_L2_M1_100: +dgemm_kernel_L2_M1_100: SAVE1x2 -_L2_END: +dgemm_kernel_L2_END: mov r3, BC mov r4, K1 @@ -1333,11 +1301,11 @@ _L2_END: /*********************************************************************************************/ -_L1_BEGIN: +dgemm_kernel_L1_BEGIN: ldr J , N tst J , #1 - ble _L999 + ble dgemm_kernel_L999 ldr CO1, C // CO1 = C @@ -1346,28 +1314,25 @@ _L1_BEGIN: str r3 , C // store C ldr AO, A // AO = A - //pld [AO , #A_PRE-96] - //pld [AO , #A_PRE-64] - //pld [AO , #A_PRE-32] -_L1_M4_BEGIN: +dgemm_kernel_L1_M4_BEGIN: ldr I, M asrs I, I, #2 // I = I / 4 - ble _L1_M2_BEGIN + ble dgemm_kernel_L1_M2_BEGIN -_L1_M4_20: +dgemm_kernel_L1_M4_20: INIT4x1 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L1_M4_40 + ble dgemm_kernel_L1_M4_40 .align 5 -_L1_M4_22: +dgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1379,49 +1344,49 @@ _L1_M4_22: KERNEL4x1_SUB subs L, L, #1 - bgt _L1_M4_22 + bgt dgemm_kernel_L1_M4_22 -_L1_M4_40: +dgemm_kernel_L1_M4_40: ands L , K1, #7 // L = L % 8 - ble _L1_M4_100 + ble dgemm_kernel_L1_M4_100 -_L1_M4_42: +dgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs L, L, #1 - bgt _L1_M4_42 + bgt dgemm_kernel_L1_M4_42 -_L1_M4_100: +dgemm_kernel_L1_M4_100: SAVE4x1 -_L1_M4_END: +dgemm_kernel_L1_M4_END: subs I, I, #1 - bgt _L1_M4_20 + bgt dgemm_kernel_L1_M4_20 -_L1_M2_BEGIN: +dgemm_kernel_L1_M2_BEGIN: ldr I, M tst I , #3 - ble _L1_END + ble dgemm_kernel_L1_END tst I, #2 // I = I / 2 - ble _L1_M1_BEGIN + ble dgemm_kernel_L1_M1_BEGIN -_L1_M2_20: +dgemm_kernel_L1_M2_20: INIT2x1 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L1_M2_40 + ble dgemm_kernel_L1_M2_40 -_L1_M2_22: +dgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1434,42 +1399,42 @@ _L1_M2_22: KERNEL2x1_SUB subs L, L, #1 - bgt _L1_M2_22 + bgt dgemm_kernel_L1_M2_22 -_L1_M2_40: +dgemm_kernel_L1_M2_40: ands L , K1, #7 // 
L = L % 8 - ble _L1_M2_100 + ble dgemm_kernel_L1_M2_100 -_L1_M2_42: +dgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs L, L, #1 - bgt _L1_M2_42 + bgt dgemm_kernel_L1_M2_42 -_L1_M2_100: +dgemm_kernel_L1_M2_100: SAVE2x1 -_L1_M2_END: +dgemm_kernel_L1_M2_END: -_L1_M1_BEGIN: +dgemm_kernel_L1_M1_BEGIN: tst I, #1 // I = I % 2 - ble _L1_END + ble dgemm_kernel_L1_END -_L1_M1_20: +dgemm_kernel_L1_M1_20: INIT1x1 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L1_M1_40 + ble dgemm_kernel_L1_M1_40 -_L1_M1_22: +dgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1481,30 +1446,30 @@ _L1_M1_22: KERNEL1x1_SUB subs L, L, #1 - bgt _L1_M1_22 + bgt dgemm_kernel_L1_M1_22 -_L1_M1_40: +dgemm_kernel_L1_M1_40: ands L , K1, #7 // L = L % 8 - ble _L1_M1_100 + ble dgemm_kernel_L1_M1_100 -_L1_M1_42: +dgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs L, L, #1 - bgt _L1_M1_42 + bgt dgemm_kernel_L1_M1_42 -_L1_M1_100: +dgemm_kernel_L1_M1_100: SAVE1x1 -_L1_END: +dgemm_kernel_L1_END: -_L999: +dgemm_kernel_L999: sub r3, fp, #128 vldm r3, { d8 - d15} // restore floating point registers diff --git a/kernel/arm/dgemm_ncopy_4_vfpv3.S b/kernel/arm/dgemm_ncopy_4_vfpv3.S index bdb63bfdd..ad6692e50 100644 --- a/kernel/arm/dgemm_ncopy_4_vfpv3.S +++ b/kernel/arm/dgemm_ncopy_4_vfpv3.S @@ -26,10 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/10/11 Saar -* BLASTEST : xOK -* CTEST : xOK -* TEST : xOK +* 2013/11/05 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK * **************************************************************************************/ @@ -68,7 +68,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define I r3 #define J r12 -#define A_PRE 96 +#define A_PRE 256 /************************************************************************************** * Macro definitions @@ -76,6 +76,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY4x4 + pld [ AO1, #A_PRE ] + pld [ AO2, #A_PRE ] + pld [ AO3, #A_PRE ] + pld [ AO4, #A_PRE ] + fldd d0 , [ AO1, #0 ] fldd d1 , [ AO2, #0 ] fldd d2 , [ AO3, #0 ] @@ -199,12 +204,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr BO, B -_L4_BEGIN: +dgemm_ncopy_L4_BEGIN: asrs J, N, #2 // J = N / 4 - ble _L2_BEGIN + ble dgemm_ncopy_L2_BEGIN -_L4_M4_BEGIN: +dgemm_ncopy_L4_M4_BEGIN: mov AO1, A // AO1 = A ldr r4 , LDA @@ -214,47 +219,47 @@ _L4_M4_BEGIN: add A , AO4, r4 // A = A + 4 * LDA asrs I, M, #2 // I = M / 4 - ble _L4_M4_40 + ble dgemm_ncopy_L4_M4_40 -_L4_M4_20: +dgemm_ncopy_L4_M4_20: COPY4x4 subs I , I , #1 - bne _L4_M4_20 + bne dgemm_ncopy_L4_M4_20 -_L4_M4_40: +dgemm_ncopy_L4_M4_40: ands I, M , #3 - ble _L4_M4_END + ble dgemm_ncopy_L4_M4_END -_L4_M4_60: +dgemm_ncopy_L4_M4_60: COPY1x4 subs I , I , #1 - bne _L4_M4_60 + bne dgemm_ncopy_L4_M4_60 -_L4_M4_END: +dgemm_ncopy_L4_M4_END: subs J , J, #1 // j-- - bne _L4_M4_BEGIN + bne dgemm_ncopy_L4_M4_BEGIN /*********************************************************************************************/ -_L2_BEGIN: +dgemm_ncopy_L2_BEGIN: tst N, #3 - ble _L999 + ble dgemm_ncopy_L999 tst N, #2 - ble _L1_BEGIN + ble dgemm_ncopy_L1_BEGIN -_L2_M4_BEGIN: +dgemm_ncopy_L2_M4_BEGIN: mov AO1, A // AO1 = A ldr r4 , LDA @@ -262,75 +267,75 @@ _L2_M4_BEGIN: add A , AO2, r4 // A = A + 2 * LDA asrs I, M, #2 // I = M / 4 - ble _L2_M4_40 + ble dgemm_ncopy_L2_M4_40 -_L2_M4_20: +dgemm_ncopy_L2_M4_20: COPY4x2 subs I , I , #1 - bne _L2_M4_20 + bne dgemm_ncopy_L2_M4_20 -_L2_M4_40: +dgemm_ncopy_L2_M4_40: ands I, M , #3 - ble _L2_M4_END + ble dgemm_ncopy_L2_M4_END -_L2_M4_60: +dgemm_ncopy_L2_M4_60: COPY1x2 subs I , I , #1 - bne _L2_M4_60 + bne 
dgemm_ncopy_L2_M4_60 -_L2_M4_END: +dgemm_ncopy_L2_M4_END: /*********************************************************************************************/ -_L1_BEGIN: +dgemm_ncopy_L1_BEGIN: tst N, #1 - ble _L999 + ble dgemm_ncopy_L999 -_L1_M4_BEGIN: +dgemm_ncopy_L1_M4_BEGIN: mov AO1, A // AO1 = A ldr r4 , LDA add A , AO1, r4 // A = A + 1 * LDA asrs I, M, #2 // I = M / 4 - ble _L1_M4_40 + ble dgemm_ncopy_L1_M4_40 -_L1_M4_20: +dgemm_ncopy_L1_M4_20: COPY4x1 subs I , I , #1 - bne _L1_M4_20 + bne dgemm_ncopy_L1_M4_20 -_L1_M4_40: +dgemm_ncopy_L1_M4_40: ands I, M , #3 - ble _L1_M4_END + ble dgemm_ncopy_L1_M4_END -_L1_M4_60: +dgemm_ncopy_L1_M4_60: COPY1x1 subs I , I , #1 - bne _L1_M4_60 + bne dgemm_ncopy_L1_M4_60 -_L1_M4_END: +dgemm_ncopy_L1_M4_END: -_L999: +dgemm_ncopy_L999: sub r3, fp, #128 vldm r3, { d8 - d15} // restore floating point registers diff --git a/kernel/arm/zgemm_kernel_2x2_vfpv3.S b/kernel/arm/zgemm_kernel_2x2_vfpv3.S index 9c14aec10..2d35028a2 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/zgemm_kernel_2x2_vfpv3.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/11/02 Saar +* 2013/11/05 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -924,9 +924,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ldr J, N asrs J, J, #1 // J = J / 2 - ble _L1_BEGIN + ble zgemm_kernel_L1_BEGIN -_L2_BEGIN: +zgemm_kernel_L2_BEGIN: ldr CO1, C // CO1 = C ldr r4 , LDC @@ -940,19 +940,19 @@ _L2_BEGIN: -_L2_M2_BEGIN: +zgemm_kernel_L2_M2_BEGIN: ldr I, M asrs I, I, #1 // I = I / 2 - ble _L2_M1_BEGIN + ble zgemm_kernel_L2_M1_BEGIN -_L2_M2_20: +zgemm_kernel_L2_M2_20: mov BO, BC asrs L , K1, #3 // L = L / 8 cmp L , #3 - blt _L2_M2_30 + blt zgemm_kernel_L2_M2_30 .align 5 @@ -969,7 +969,7 @@ _L2_M2_20: sub L, L, #2 -_L2_M2_22: +zgemm_kernel_L2_M2_22: KERNEL2x2_M1 KERNEL2x2_M2 @@ -982,7 +982,7 @@ _L2_M2_22: KERNEL2x2_M2 subs L, L, #1 - bgt _L2_M2_22 + bgt zgemm_kernel_L2_M2_22 KERNEL2x2_M1 KERNEL2x2_M2 @@ -994,15 +994,15 @@ _L2_M2_22: KERNEL2x2_M1 KERNEL2x2_E - b _L2_M2_44 + b zgemm_kernel_L2_M2_44 -_L2_M2_30: +zgemm_kernel_L2_M2_30: tst L, #3 - ble _L2_M2_40 + ble zgemm_kernel_L2_M2_40 tst L, #2 - ble _L2_M2_32 + ble zgemm_kernel_L2_M2_32 KERNEL2x2_I KERNEL2x2_M2 @@ -1025,12 +1025,12 @@ _L2_M2_30: KERNEL2x2_M1 KERNEL2x2_E - b _L2_M2_44 + b zgemm_kernel_L2_M2_44 -_L2_M2_32: +zgemm_kernel_L2_M2_32: tst L, #1 - ble _L2_M2_40 + ble zgemm_kernel_L2_M2_40 KERNEL2x2_I KERNEL2x2_M2 @@ -1042,51 +1042,51 @@ _L2_M2_32: KERNEL2x2_M1 KERNEL2x2_E - b _L2_M2_44 + b zgemm_kernel_L2_M2_44 -_L2_M2_40: +zgemm_kernel_L2_M2_40: INIT2x2 -_L2_M2_44: +zgemm_kernel_L2_M2_44: ands L , K1, #7 // L = L % 8 - ble _L2_M2_100 + ble zgemm_kernel_L2_M2_100 -_L2_M2_46: +zgemm_kernel_L2_M2_46: KERNEL2x2_SUB subs L, L, #1 - bne _L2_M2_46 + bne zgemm_kernel_L2_M2_46 -_L2_M2_100: +zgemm_kernel_L2_M2_100: SAVE2x2 -_L2_M2_END: +zgemm_kernel_L2_M2_END: subs I, I, #1 - bne _L2_M2_20 + bne zgemm_kernel_L2_M2_20 -_L2_M1_BEGIN: +zgemm_kernel_L2_M1_BEGIN: ldr I, M tst I, #1 // I = I % 2 - ble _L2_END + ble zgemm_kernel_L2_END -_L2_M1_20: +zgemm_kernel_L2_M1_20: INIT1x2 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L2_M1_40 + ble zgemm_kernel_L2_M1_40 -_L2_M1_22: +zgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB @@ -1099,27 +1099,27 
@@ _L2_M1_22: KERNEL1x2_SUB subs L, L, #1 - bgt _L2_M1_22 + bgt zgemm_kernel_L2_M1_22 -_L2_M1_40: +zgemm_kernel_L2_M1_40: ands L , K1, #7 // L = L % 8 - ble _L2_M1_100 + ble zgemm_kernel_L2_M1_100 -_L2_M1_42: +zgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs L, L, #1 - bgt _L2_M1_42 + bgt zgemm_kernel_L2_M1_42 -_L2_M1_100: +zgemm_kernel_L2_M1_100: SAVE1x2 -_L2_END: +zgemm_kernel_L2_END: mov r3, BC mov r4, K1 @@ -1128,17 +1128,17 @@ _L2_END: mov BC, r3 subs J , #1 // j-- - bgt _L2_BEGIN + bgt zgemm_kernel_L2_BEGIN /*********************************************************************************************/ -_L1_BEGIN: +zgemm_kernel_L1_BEGIN: ldr J , N tst J , #1 - ble _L999 + ble zgemm_kernel_L999 ldr CO1, C // CO1 = C @@ -1148,19 +1148,19 @@ _L1_BEGIN: ldr AO, A // AO = A -_L1_M2_BEGIN: +zgemm_kernel_L1_M2_BEGIN: ldr I, M asrs I, I, #1 // I = I / 2 - ble _L1_M1_BEGIN + ble zgemm_kernel_L1_M1_BEGIN -_L1_M2_20: +zgemm_kernel_L1_M2_20: mov BO, BC asrs L , K1, #3 // L = L / 8 cmp L , #3 - blt _L1_M2_30 + blt zgemm_kernel_L1_M2_30 .align 5 @@ -1177,7 +1177,7 @@ _L1_M2_20: sub L, L, #2 -_L1_M2_22: +zgemm_kernel_L1_M2_22: KERNEL2x1_M1 KERNEL2x1_M2 @@ -1190,7 +1190,7 @@ _L1_M2_22: KERNEL2x1_M2 subs L, L, #1 - bgt _L1_M2_22 + bgt zgemm_kernel_L1_M2_22 KERNEL2x1_M1 KERNEL2x1_M2 @@ -1202,15 +1202,15 @@ _L1_M2_22: KERNEL2x1_M1 KERNEL2x1_E - b _L1_M2_44 + b zgemm_kernel_L1_M2_44 -_L1_M2_30: +zgemm_kernel_L1_M2_30: tst L, #3 - ble _L1_M2_40 + ble zgemm_kernel_L1_M2_40 tst L, #2 - ble _L1_M2_32 + ble zgemm_kernel_L1_M2_32 KERNEL2x1_I KERNEL2x1_M2 @@ -1233,12 +1233,12 @@ _L1_M2_30: KERNEL2x1_M1 KERNEL2x1_E - b _L1_M2_44 + b zgemm_kernel_L1_M2_44 -_L1_M2_32: +zgemm_kernel_L1_M2_32: tst L, #1 - ble _L1_M2_40 + ble zgemm_kernel_L1_M2_40 KERNEL2x1_I KERNEL2x1_M2 @@ -1250,51 +1250,51 @@ _L1_M2_32: KERNEL2x1_M1 KERNEL2x1_E - b _L1_M2_44 + b zgemm_kernel_L1_M2_44 -_L1_M2_40: +zgemm_kernel_L1_M2_40: INIT2x1 -_L1_M2_44: +zgemm_kernel_L1_M2_44: ands L , K1, #7 // L = L % 8 - ble _L1_M2_100 + ble 
zgemm_kernel_L1_M2_100 -_L1_M2_46: +zgemm_kernel_L1_M2_46: KERNEL2x1_SUB subs L, L, #1 - bne _L1_M2_46 + bne zgemm_kernel_L1_M2_46 -_L1_M2_100: +zgemm_kernel_L1_M2_100: SAVE2x1 -_L1_M2_END: +zgemm_kernel_L1_M2_END: subs I, I, #1 - bne _L1_M2_20 + bne zgemm_kernel_L1_M2_20 -_L1_M1_BEGIN: +zgemm_kernel_L1_M1_BEGIN: ldr I, M tst I, #1 // I = I % 2 - ble _L1_END + ble zgemm_kernel_L1_END -_L1_M1_20: +zgemm_kernel_L1_M1_20: INIT1x1 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L1_M1_40 + ble zgemm_kernel_L1_M1_40 -_L1_M1_22: +zgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB @@ -1307,31 +1307,31 @@ _L1_M1_22: KERNEL1x1_SUB subs L, L, #1 - bgt _L1_M1_22 + bgt zgemm_kernel_L1_M1_22 -_L1_M1_40: +zgemm_kernel_L1_M1_40: ands L , K1, #7 // L = L % 8 - ble _L1_M1_100 + ble zgemm_kernel_L1_M1_100 -_L1_M1_42: +zgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs L, L, #1 - bgt _L1_M1_42 + bgt zgemm_kernel_L1_M1_42 -_L1_M1_100: +zgemm_kernel_L1_M1_100: SAVE1x1 -_L1_END: +zgemm_kernel_L1_END: -_L999: +zgemm_kernel_L999: sub r3, fp, #128 vldm r3, { d8 - d15} // restore floating point registers diff --git a/kernel/arm/zgemm_ncopy_2_vfpv3.S b/kernel/arm/zgemm_ncopy_2_vfpv3.S new file mode 100644 index 000000000..5ff8ee299 --- /dev/null +++ b/kernel/arm/zgemm_ncopy_2_vfpv3.S @@ -0,0 +1,254 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/05 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define LDA [fp, #-260 ] + +#define B [fp, #4 ] + +#define M r0 +#define N r1 +#define A r2 + +#define BO r5 + +#define AO1 r6 +#define AO2 r7 + +#define I r3 +#define J r12 + +#define A_PRE 256 + +/************************************************************************************** +* Macro definitions 
+**************************************************************************************/ + +.macro COPY2x2 // packs 32 bytes from AO1 and 32 bytes from AO2 into 64 bytes at BO + + pld [ AO1, #A_PRE ] // prefetch hint A_PRE bytes past each source pointer + pld [ AO2, #A_PRE ] + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO1, #8 ] + fldd d4 , [ AO1, #16 ] // AO1 bytes 16..31 go to d4/d5 so that d2/d3 can take AO2 bytes 0..15 + fldd d5 , [ AO1, #24 ] + + fldd d2 , [ AO2, #0 ] + fldd d3 , [ AO2, #8 ] + add AO1, AO1, #32 + fldd d6 , [ AO2, #16 ] + fldd d7 , [ AO2, #24 ] + + fstmiad BO!, { d0 - d7 } // d0..d7 are stored in ascending order and BO moves on by 64 bytes + add AO2, AO2, #32 + +.endm + + +.macro COPY1x2 // packs 16 bytes from AO1 followed by 16 bytes from AO2 into BO + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO1, #8 ] + fldd d2 , [ AO2, #0 ] + fldd d3 , [ AO2, #8 ] + + add AO1, AO1, #16 + fstmiad BO!, { d0 - d3 } + add AO2, AO2, #16 + +.endm + +.macro COPY2x1 // copies 32 contiguous bytes from AO1 to BO + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO1, #8 ] + fldd d2 , [ AO1, #16 ] + fldd d3 , [ AO1, #24 ] + + fstmiad BO!, { d0 - d3 } + add AO1, AO1, #32 + +.endm + + +.macro COPY1x1 // copies 16 bytes from AO1 to BO + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO1, #8 ] + + fstmiad BO!, { d0 - d1 } + add AO1, AO1, #16 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} // seven words pushed, r4 lowest and fp highest + add fp, sp, #24 // fp addresses the saved fp word, so the entry stack pointer is fp plus 4 + sub sp, sp, #STACKSIZE // reserve stack + + + lsl r3, r3, #4 // lda = lda * 8 * 2 + str r3, LDA // the byte stride lives in its stack slot because r3 is reused as I + + sub r4, fp, #128 + vstm r4, { d8 - d15} // store floating point registers + + ldr BO, B // BO = output pointer read from the word at the entry stack pointer + +/*********************************************************************************************/ + +zgemm_ncopy_L2_BEGIN: + + asrs J, N, #1 // J = N / 2 + ble zgemm_ncopy_L1_BEGIN // NOTE(review): LE tests Z and N != V, and asrs does not write V, so this relies on V as left by the caller - confirm + +zgemm_ncopy_L2_M2_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add AO2, AO1, r4 // AO2 = AO1 advanced by one LDA byte stride + add A , AO2, r4 // A = A + 2 * LDA + + asrs I, M, #1 // I = M / 2 + ble zgemm_ncopy_L2_M2_40 + +zgemm_ncopy_L2_M2_20: + + COPY2x2 + + subs I , I , #1 + bne zgemm_ncopy_L2_M2_20 + + +zgemm_ncopy_L2_M2_40: + + ands I, M , #1 // I = M % 2 + ble zgemm_ncopy_L2_M2_END + +zgemm_ncopy_L2_M2_60: + + COPY1x2 + + subs I , I , #1 + bne
zgemm_ncopy_L2_M2_60 + + +zgemm_ncopy_L2_M2_END: + + subs J , J, #1 // j-- + bne zgemm_ncopy_L2_M2_BEGIN + + +/*********************************************************************************************/ + +zgemm_ncopy_L1_BEGIN: + + tst N, #1 // odd N leaves one more LDA stride of A to copy + ble zgemm_ncopy_L999 // NOTE(review): tst does not write V either; beq would test bit 0 alone - confirm + + +zgemm_ncopy_L1_M2_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add A , AO1, r4 // A = A + 1 * LDA + + asrs I, M, #1 // I = M / 2 + ble zgemm_ncopy_L1_M2_40 + +zgemm_ncopy_L1_M2_20: + + COPY2x1 + + subs I , I , #1 + bne zgemm_ncopy_L1_M2_20 + + +zgemm_ncopy_L1_M2_40: + + ands I, M , #1 // I = M % 2 + ble zgemm_ncopy_L1_M2_END + +zgemm_ncopy_L1_M2_60: + + COPY1x1 + + subs I , I , #1 + bne zgemm_ncopy_L1_M2_60 + + +zgemm_ncopy_L1_M2_END: + + + +zgemm_ncopy_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 // sp returns to the seven pushed words + pop {r4 - r9, fp} + bx lr + + EPILOGUE + From ac50bccbd250a2222e7aa0222c383187886267be Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 5 Nov 2013 20:21:35 +0100 Subject: [PATCH 25/81] added cgemm_ncopy_2_vfpv3.S and made assembler labels unique --- kernel/arm/KERNEL.ARMV7 | 2 +- kernel/arm/cgemm_kernel_2x2_vfpv3.S | 150 ++++++++-------- kernel/arm/cgemm_ncopy_2_vfpv3.S | 258 +++++++++++++++++++++++++++ kernel/arm/sgemm_kernel_4x4_vfpv3.S | 262 ++++++++++++++-------------- kernel/arm/sgemm_ncopy_4_vfpv3.S | 78 ++++----- 5 files changed, 504 insertions(+), 246 deletions(-) create mode 100644 kernel/arm/cgemm_ncopy_2_vfpv3.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index e30261698..cdf370725 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -109,7 +109,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMONCOPY = cgemm_ncopy_2_vfpv3.S CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o diff --git
a/kernel/arm/cgemm_kernel_2x2_vfpv3.S b/kernel/arm/cgemm_kernel_2x2_vfpv3.S index 4cebcab77..3aba68de8 100644 --- a/kernel/arm/cgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/cgemm_kernel_2x2_vfpv3.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/11/01 Saar +* 2013/11/05 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -888,9 +888,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr J, N asrs J, J, #1 // J = J / 2 - ble _L1_BEGIN + ble cgemm_kernel_L1_BEGIN -_L2_BEGIN: +cgemm_kernel_L2_BEGIN: ldr CO1, C // CO1 = C ldr r4 , LDC @@ -904,19 +904,19 @@ _L2_BEGIN: -_L2_M2_BEGIN: +cgemm_kernel_L2_M2_BEGIN: ldr I, M asrs I, I, #1 // I = I / 2 - ble _L2_M1_BEGIN + ble cgemm_kernel_L2_M1_BEGIN -_L2_M2_20: +cgemm_kernel_L2_M2_20: mov BO, BC asrs L , K1, #3 // L = L / 8 cmp L , #3 - blt _L2_M2_30 + blt cgemm_kernel_L2_M2_30 .align 5 @@ -933,7 +933,7 @@ _L2_M2_20: sub L, L, #2 -_L2_M2_22: +cgemm_kernel_L2_M2_22: KERNEL2x2_M1 KERNEL2x2_M2 @@ -946,7 +946,7 @@ _L2_M2_22: KERNEL2x2_M2 subs L, L, #1 - bgt _L2_M2_22 + bgt cgemm_kernel_L2_M2_22 KERNEL2x2_M1 KERNEL2x2_M2 @@ -958,15 +958,15 @@ _L2_M2_22: KERNEL2x2_M1 KERNEL2x2_E - b _L2_M2_44 + b cgemm_kernel_L2_M2_44 -_L2_M2_30: +cgemm_kernel_L2_M2_30: tst L, #3 - ble _L2_M2_40 + ble cgemm_kernel_L2_M2_40 tst L, #2 - ble _L2_M2_32 + ble cgemm_kernel_L2_M2_32 KERNEL2x2_I KERNEL2x2_M2 @@ -989,12 +989,12 @@ _L2_M2_30: KERNEL2x2_M1 KERNEL2x2_E - b _L2_M2_44 + b cgemm_kernel_L2_M2_44 -_L2_M2_32: +cgemm_kernel_L2_M2_32: tst L, #1 - ble _L2_M2_40 + ble cgemm_kernel_L2_M2_40 KERNEL2x2_I KERNEL2x2_M2 @@ -1006,51 +1006,51 @@ _L2_M2_32: KERNEL2x2_M1 KERNEL2x2_E - b _L2_M2_44 + b cgemm_kernel_L2_M2_44 -_L2_M2_40: +cgemm_kernel_L2_M2_40: INIT2x2 -_L2_M2_44: +cgemm_kernel_L2_M2_44: ands L , K1, #7 // L = L % 8 
- ble _L2_M2_100 + ble cgemm_kernel_L2_M2_100 -_L2_M2_46: +cgemm_kernel_L2_M2_46: KERNEL2x2_SUB subs L, L, #1 - bne _L2_M2_46 + bne cgemm_kernel_L2_M2_46 -_L2_M2_100: +cgemm_kernel_L2_M2_100: SAVE2x2 -_L2_M2_END: +cgemm_kernel_L2_M2_END: subs I, I, #1 - bne _L2_M2_20 + bne cgemm_kernel_L2_M2_20 -_L2_M1_BEGIN: +cgemm_kernel_L2_M1_BEGIN: ldr I, M tst I, #1 // I = I % 2 - ble _L2_END + ble cgemm_kernel_L2_END -_L2_M1_20: +cgemm_kernel_L2_M1_20: INIT1x2 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L2_M1_40 + ble cgemm_kernel_L2_M1_40 -_L2_M1_22: +cgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB @@ -1063,27 +1063,27 @@ _L2_M1_22: KERNEL1x2_SUB subs L, L, #1 - bgt _L2_M1_22 + bgt cgemm_kernel_L2_M1_22 -_L2_M1_40: +cgemm_kernel_L2_M1_40: ands L , K1, #7 // L = L % 8 - ble _L2_M1_100 + ble cgemm_kernel_L2_M1_100 -_L2_M1_42: +cgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs L, L, #1 - bgt _L2_M1_42 + bgt cgemm_kernel_L2_M1_42 -_L2_M1_100: +cgemm_kernel_L2_M1_100: SAVE1x2 -_L2_END: +cgemm_kernel_L2_END: mov r3, BC mov r4, K1 @@ -1092,17 +1092,17 @@ _L2_END: mov BC, r3 subs J , #1 // j-- - bgt _L2_BEGIN + bgt cgemm_kernel_L2_BEGIN /*********************************************************************************************/ -_L1_BEGIN: +cgemm_kernel_L1_BEGIN: ldr J , N tst J , #1 - ble _L999 + ble cgemm_kernel_L999 ldr CO1, C // CO1 = C @@ -1112,19 +1112,19 @@ _L1_BEGIN: ldr AO, A // AO = A -_L1_M2_BEGIN: +cgemm_kernel_L1_M2_BEGIN: ldr I, M asrs I, I, #1 // I = I / 2 - ble _L1_M1_BEGIN + ble cgemm_kernel_L1_M1_BEGIN -_L1_M2_20: +cgemm_kernel_L1_M2_20: mov BO, BC asrs L , K1, #3 // L = L / 8 cmp L , #3 - blt _L1_M2_30 + blt cgemm_kernel_L1_M2_30 .align 5 @@ -1141,7 +1141,7 @@ _L1_M2_20: sub L, L, #2 -_L1_M2_22: +cgemm_kernel_L1_M2_22: KERNEL2x1_M1 KERNEL2x1_M2 @@ -1154,7 +1154,7 @@ _L1_M2_22: KERNEL2x1_M2 subs L, L, #1 - bgt _L1_M2_22 + bgt cgemm_kernel_L1_M2_22 KERNEL2x1_M1 KERNEL2x1_M2 @@ -1166,15 +1166,15 @@ _L1_M2_22: KERNEL2x1_M1 KERNEL2x1_E - b _L1_M2_44 + b 
cgemm_kernel_L1_M2_44 -_L1_M2_30: +cgemm_kernel_L1_M2_30: tst L, #3 - ble _L1_M2_40 + ble cgemm_kernel_L1_M2_40 tst L, #2 - ble _L1_M2_32 + ble cgemm_kernel_L1_M2_32 KERNEL2x1_I KERNEL2x1_M2 @@ -1197,12 +1197,12 @@ _L1_M2_30: KERNEL2x1_M1 KERNEL2x1_E - b _L1_M2_44 + b cgemm_kernel_L1_M2_44 -_L1_M2_32: +cgemm_kernel_L1_M2_32: tst L, #1 - ble _L1_M2_40 + ble cgemm_kernel_L1_M2_40 KERNEL2x1_I KERNEL2x1_M2 @@ -1214,51 +1214,51 @@ _L1_M2_32: KERNEL2x1_M1 KERNEL2x1_E - b _L1_M2_44 + b cgemm_kernel_L1_M2_44 -_L1_M2_40: +cgemm_kernel_L1_M2_40: INIT2x1 -_L1_M2_44: +cgemm_kernel_L1_M2_44: ands L , K1, #7 // L = L % 8 - ble _L1_M2_100 + ble cgemm_kernel_L1_M2_100 -_L1_M2_46: +cgemm_kernel_L1_M2_46: KERNEL2x1_SUB subs L, L, #1 - bne _L1_M2_46 + bne cgemm_kernel_L1_M2_46 -_L1_M2_100: +cgemm_kernel_L1_M2_100: SAVE2x1 -_L1_M2_END: +cgemm_kernel_L1_M2_END: subs I, I, #1 - bne _L1_M2_20 + bne cgemm_kernel_L1_M2_20 -_L1_M1_BEGIN: +cgemm_kernel_L1_M1_BEGIN: ldr I, M tst I, #1 // I = I % 2 - ble _L1_END + ble cgemm_kernel_L1_END -_L1_M1_20: +cgemm_kernel_L1_M1_20: INIT1x1 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L1_M1_40 + ble cgemm_kernel_L1_M1_40 -_L1_M1_22: +cgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB @@ -1271,31 +1271,31 @@ _L1_M1_22: KERNEL1x1_SUB subs L, L, #1 - bgt _L1_M1_22 + bgt cgemm_kernel_L1_M1_22 -_L1_M1_40: +cgemm_kernel_L1_M1_40: ands L , K1, #7 // L = L % 8 - ble _L1_M1_100 + ble cgemm_kernel_L1_M1_100 -_L1_M1_42: +cgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs L, L, #1 - bgt _L1_M1_42 + bgt cgemm_kernel_L1_M1_42 -_L1_M1_100: +cgemm_kernel_L1_M1_100: SAVE1x1 -_L1_END: +cgemm_kernel_L1_END: -_L999: +cgemm_kernel_L999: sub r3, fp, #128 vldm r3, { s8 - s31} // restore floating point registers diff --git a/kernel/arm/cgemm_ncopy_2_vfpv3.S b/kernel/arm/cgemm_ncopy_2_vfpv3.S new file mode 100644 index 000000000..08fbd5501 --- /dev/null +++ b/kernel/arm/cgemm_ncopy_2_vfpv3.S @@ -0,0 +1,258 @@ 
+/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/05 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers; only s8 - s15 (32 bytes from [fp, #-128]) are stored +*******************************************************/ + +#define LDA [fp, #-260 ] // stack slot holding lda scaled to bytes + +#define B [fp, #4 ] // word at the entry stack pointer, loaded into BO + +#define M r0 +#define N r1 +#define A r2 + +#define BO r5 + +#define AO1 r6 +#define AO2 r7 + +#define I r3 // r3 is free for this once lda has been stored to LDA +#define J r12 + +#define A_PRE 256 // byte offset used by the pld prefetch hints + +/************************************************************************************** +* Macro definitions (each macro loads from AO1/AO2, stores to BO and advances every pointer it used) +**************************************************************************************/ + +.macro COPY2x2 // packs 16 bytes from AO1 and 16 bytes from AO2 into 32 bytes at BO + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO1, #4 ] + flds s4 , [ AO1, #8 ] // AO1 bytes 8..15 go to s4/s5 so that s2/s3 can take AO2 bytes 0..7 + flds s5 , [ AO1, #12 ] + + flds s2 , [ AO2, #0 ] + flds s3 , [ AO2, #4 ] + add AO1, AO1, #16 + flds s6 , [ AO2, #8 ] + flds s7 , [ AO2, #12 ] + + fstmias BO!, { s0 - s7 } // s0..s7 are stored in ascending order and BO moves on by 32 bytes + add AO2, AO2, #16 + +.endm + + +.macro COPY1x2 // packs 8 bytes from AO1 followed by 8 bytes from AO2 into BO + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO1, #4 ] + flds s2 , [ AO2, #0 ] + flds s3 , [ AO2, #4 ] + + add AO1, AO1, #8 + fstmias BO!, { s0 - s3 } + add AO2, AO2, #8 + +.endm + +.macro COPY2x1 // copies 16 contiguous bytes from AO1 to BO + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO1, #4 ] + flds s2 , [ AO1, #8 ] + flds s3 , [ AO1, #12 ] + + fstmias BO!, { s0 - s3 } + add AO1, AO1, #16 + +.endm + + +.macro COPY1x1 // copies 8 bytes from AO1 to BO + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO1, #4 ] + + fstmias BO!, { s0 - s1 } + add AO1, AO1, #8 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions
+**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} // seven words pushed, r4 lowest and fp highest + add fp, sp, #24 // fp addresses the saved fp word, so the entry stack pointer is fp plus 4 + sub sp, sp, #STACKSIZE // reserve stack + + + lsl r3, r3, #3 // lda = lda * 4 * 2 + str r3, LDA // the byte stride lives in its stack slot because r3 is reused as I + + sub r4, fp, #128 + vstm r4, { s8 - s15} // store floating point registers + + ldr BO, B // BO = output pointer read from the word at the entry stack pointer + +/*********************************************************************************************/ + +cgemm_ncopy_L2_BEGIN: + + asrs J, N, #1 // J = N / 2 + ble cgemm_ncopy_L1_BEGIN // NOTE(review): LE tests Z and N != V, and asrs does not write V, so this relies on V as left by the caller - confirm + +cgemm_ncopy_L2_M2_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add AO2, AO1, r4 // AO2 = AO1 advanced by one LDA byte stride + add A , AO2, r4 // A = A + 2 * LDA + + asrs I, M, #1 // I = M / 2 + ble cgemm_ncopy_L2_M2_40 + +cgemm_ncopy_L2_M2_20: + + pld [ AO1, #A_PRE ] // one prefetch hint per source pointer for each pass of the two-fold unrolled body + pld [ AO2, #A_PRE ] + + COPY2x2 + subs I , I , #1 + ble cgemm_ncopy_L2_M2_40 // I ran out between the two unrolled copies + + COPY2x2 + subs I , I , #1 + bne cgemm_ncopy_L2_M2_20 + + +cgemm_ncopy_L2_M2_40: + + ands I, M , #1 // I = M % 2 + ble cgemm_ncopy_L2_M2_END + +cgemm_ncopy_L2_M2_60: + + COPY1x2 + + subs I , I , #1 + bne cgemm_ncopy_L2_M2_60 + + +cgemm_ncopy_L2_M2_END: + + subs J , J, #1 // j-- + bne cgemm_ncopy_L2_M2_BEGIN + + +/*********************************************************************************************/ + +cgemm_ncopy_L1_BEGIN: + + tst N, #1 // odd N leaves one more LDA stride of A to copy + ble cgemm_ncopy_L999 // NOTE(review): tst does not write V either; beq would test bit 0 alone - confirm + + +cgemm_ncopy_L1_M2_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add A , AO1, r4 // A = A + 1 * LDA + + asrs I, M, #1 // I = M / 2 + ble cgemm_ncopy_L1_M2_40 + +cgemm_ncopy_L1_M2_20: + + COPY2x1 + + subs I , I , #1 + bne cgemm_ncopy_L1_M2_20 + + +cgemm_ncopy_L1_M2_40: + + ands I, M , #1 // I = M % 2 + ble cgemm_ncopy_L1_M2_END + +cgemm_ncopy_L1_M2_60: + + COPY1x1 + + subs I , I , #1 + bne cgemm_ncopy_L1_M2_60 + + +cgemm_ncopy_L1_M2_END: + + + +cgemm_ncopy_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 // sp returns to the seven pushed words + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/sgemm_kernel_4x4_vfpv3.S
b/kernel/arm/sgemm_kernel_4x4_vfpv3.S index 8bc3e5325..4031c28db 100644 --- a/kernel/arm/sgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/sgemm_kernel_4x4_vfpv3.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/11/02 Saar +* 2013/11/05 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -865,9 +865,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr J, N asrs J, J, #2 // J = J / 4 - ble _L2_BEGIN + ble sgemm_kernel_L2_BEGIN -_L4_BEGIN: +sgemm_kernel_L4_BEGIN: ldr CO1, C // CO1 = C ldr r4 , LDC @@ -881,19 +881,19 @@ _L4_BEGIN: -_L4_M4_BEGIN: +sgemm_kernel_L4_M4_BEGIN: ldr I, M asrs I, I, #2 // I = I / 4 - ble _L4_M2_BEGIN + ble sgemm_kernel_L4_M2_BEGIN -_L4_M4_20: +sgemm_kernel_L4_M4_20: mov BO, BC asrs L , K1, #1 // L = L / 8 cmp L , #2 - blt _L4_M4_32 + blt sgemm_kernel_L4_M4_32 @@ -901,81 +901,81 @@ _L4_M4_20: KERNEL4x4_M2 subs L, L, #2 - ble _L4_M4_22a + ble sgemm_kernel_L4_M4_22a .align 5 -_L4_M4_22: +sgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs L, L, #1 - bgt _L4_M4_22 + bgt sgemm_kernel_L4_M4_22 -_L4_M4_22a: +sgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b _L4_M4_44 + b sgemm_kernel_L4_M4_44 -_L4_M4_32: +sgemm_kernel_L4_M4_32: tst L, #1 - ble _L4_M4_40 + ble sgemm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b _L4_M4_44 + b sgemm_kernel_L4_M4_44 -_L4_M4_40: +sgemm_kernel_L4_M4_40: INIT4x4 -_L4_M4_44: +sgemm_kernel_L4_M4_44: ands L , K1, #1 // L = L % 8 - ble _L4_M4_100 + ble sgemm_kernel_L4_M4_100 -_L4_M4_46: +sgemm_kernel_L4_M4_46: KERNEL4x4_SUB subs L, L, #1 - bne _L4_M4_46 + bne sgemm_kernel_L4_M4_46 -_L4_M4_100: +sgemm_kernel_L4_M4_100: SAVE4x4 -_L4_M4_END: +sgemm_kernel_L4_M4_END: subs I, I, #1 - bne _L4_M4_20 + bne sgemm_kernel_L4_M4_20 -_L4_M2_BEGIN: +sgemm_kernel_L4_M2_BEGIN: ldr I, M tst I , #3 - ble 
_L4_END + ble sgemm_kernel_L4_END tst I, #2 // I = I / 2 - ble _L4_M1_BEGIN + ble sgemm_kernel_L4_M1_BEGIN -_L4_M2_20: +sgemm_kernel_L4_M2_20: INIT2x4 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L4_M2_40 + ble sgemm_kernel_L4_M2_40 -_L4_M2_22: +sgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -988,42 +988,42 @@ _L4_M2_22: KERNEL2x4_SUB subs L, L, #1 - bgt _L4_M2_22 + bgt sgemm_kernel_L4_M2_22 -_L4_M2_40: +sgemm_kernel_L4_M2_40: ands L , K1, #7 // L = L % 8 - ble _L4_M2_100 + ble sgemm_kernel_L4_M2_100 -_L4_M2_42: +sgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs L, L, #1 - bgt _L4_M2_42 + bgt sgemm_kernel_L4_M2_42 -_L4_M2_100: +sgemm_kernel_L4_M2_100: SAVE2x4 -_L4_M2_END: +sgemm_kernel_L4_M2_END: -_L4_M1_BEGIN: +sgemm_kernel_L4_M1_BEGIN: tst I, #1 // I = I % 2 - ble _L4_END + ble sgemm_kernel_L4_END -_L4_M1_20: +sgemm_kernel_L4_M1_20: INIT1x4 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L4_M1_40 + ble sgemm_kernel_L4_M1_40 -_L4_M1_22: +sgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1035,27 +1035,27 @@ _L4_M1_22: KERNEL1x4_SUB subs L, L, #1 - bgt _L4_M1_22 + bgt sgemm_kernel_L4_M1_22 -_L4_M1_40: +sgemm_kernel_L4_M1_40: ands L , K1, #7 // L = L % 8 - ble _L4_M1_100 + ble sgemm_kernel_L4_M1_100 -_L4_M1_42: +sgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs L, L, #1 - bgt _L4_M1_42 + bgt sgemm_kernel_L4_M1_42 -_L4_M1_100: +sgemm_kernel_L4_M1_100: SAVE1x4 -_L4_END: +sgemm_kernel_L4_END: mov r3, BC mov r4, K1 @@ -1064,20 +1064,20 @@ _L4_END: mov BC, r3 subs J , #1 // j-- - bgt _L4_BEGIN + bgt sgemm_kernel_L4_BEGIN /*********************************************************************************************/ -_L2_BEGIN: +sgemm_kernel_L2_BEGIN: ldr J , N tst J , #3 - ble _L999 + ble sgemm_kernel_L999 tst J , #2 - ble _L1_BEGIN + ble sgemm_kernel_L1_BEGIN ldr CO1, C // CO1 = C ldr r4 , LDC @@ -1092,22 +1092,22 @@ _L2_BEGIN: -_L2_M4_BEGIN: +sgemm_kernel_L2_M4_BEGIN: ldr I, M asrs I, I, #2 // I = I / 4 - ble _L2_M2_BEGIN + ble sgemm_kernel_L2_M2_BEGIN 
-_L2_M4_20: +sgemm_kernel_L2_M4_20: INIT4x2 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L2_M4_40 + ble sgemm_kernel_L2_M4_40 .align 5 -_L2_M4_22: +sgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1119,49 +1119,49 @@ _L2_M4_22: KERNEL4x2_SUB subs L, L, #1 - bgt _L2_M4_22 + bgt sgemm_kernel_L2_M4_22 -_L2_M4_40: +sgemm_kernel_L2_M4_40: ands L , K1, #7 // L = L % 8 - ble _L2_M4_100 + ble sgemm_kernel_L2_M4_100 -_L2_M4_42: +sgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs L, L, #1 - bgt _L2_M4_42 + bgt sgemm_kernel_L2_M4_42 -_L2_M4_100: +sgemm_kernel_L2_M4_100: SAVE4x2 -_L2_M4_END: +sgemm_kernel_L2_M4_END: subs I, I, #1 - bgt _L2_M4_20 + bgt sgemm_kernel_L2_M4_20 -_L2_M2_BEGIN: +sgemm_kernel_L2_M2_BEGIN: ldr I, M tst I , #3 - ble _L2_END + ble sgemm_kernel_L2_END tst I, #2 // I = I / 2 - ble _L2_M1_BEGIN + ble sgemm_kernel_L2_M1_BEGIN -_L2_M2_20: +sgemm_kernel_L2_M2_20: INIT2x2 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L2_M2_40 + ble sgemm_kernel_L2_M2_40 -_L2_M2_22: +sgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1174,42 +1174,42 @@ _L2_M2_22: KERNEL2x2_SUB subs L, L, #1 - bgt _L2_M2_22 + bgt sgemm_kernel_L2_M2_22 -_L2_M2_40: +sgemm_kernel_L2_M2_40: ands L , K1, #7 // L = L % 8 - ble _L2_M2_100 + ble sgemm_kernel_L2_M2_100 -_L2_M2_42: +sgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs L, L, #1 - bgt _L2_M2_42 + bgt sgemm_kernel_L2_M2_42 -_L2_M2_100: +sgemm_kernel_L2_M2_100: SAVE2x2 -_L2_M2_END: +sgemm_kernel_L2_M2_END: -_L2_M1_BEGIN: +sgemm_kernel_L2_M1_BEGIN: tst I, #1 // I = I % 2 - ble _L2_END + ble sgemm_kernel_L2_END -_L2_M1_20: +sgemm_kernel_L2_M1_20: INIT1x2 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L2_M1_40 + ble sgemm_kernel_L2_M1_40 -_L2_M1_22: +sgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1221,27 +1221,27 @@ _L2_M1_22: KERNEL1x2_SUB subs L, L, #1 - bgt _L2_M1_22 + bgt sgemm_kernel_L2_M1_22 -_L2_M1_40: +sgemm_kernel_L2_M1_40: ands L , K1, #7 // L = L % 8 - ble _L2_M1_100 + ble sgemm_kernel_L2_M1_100 
-_L2_M1_42: +sgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs L, L, #1 - bgt _L2_M1_42 + bgt sgemm_kernel_L2_M1_42 -_L2_M1_100: +sgemm_kernel_L2_M1_100: SAVE1x2 -_L2_END: +sgemm_kernel_L2_END: mov r3, BC mov r4, K1 @@ -1251,11 +1251,11 @@ _L2_END: /*********************************************************************************************/ -_L1_BEGIN: +sgemm_kernel_L1_BEGIN: ldr J , N tst J , #1 - ble _L999 + ble sgemm_kernel_L999 ldr CO1, C // CO1 = C @@ -1270,22 +1270,22 @@ _L1_BEGIN: -_L1_M4_BEGIN: +sgemm_kernel_L1_M4_BEGIN: ldr I, M asrs I, I, #2 // I = I / 4 - ble _L1_M2_BEGIN + ble sgemm_kernel_L1_M2_BEGIN -_L1_M4_20: +sgemm_kernel_L1_M4_20: INIT4x1 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L1_M4_40 + ble sgemm_kernel_L1_M4_40 .align 5 -_L1_M4_22: +sgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1297,49 +1297,49 @@ _L1_M4_22: KERNEL4x1_SUB subs L, L, #1 - bgt _L1_M4_22 + bgt sgemm_kernel_L1_M4_22 -_L1_M4_40: +sgemm_kernel_L1_M4_40: ands L , K1, #7 // L = L % 8 - ble _L1_M4_100 + ble sgemm_kernel_L1_M4_100 -_L1_M4_42: +sgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs L, L, #1 - bgt _L1_M4_42 + bgt sgemm_kernel_L1_M4_42 -_L1_M4_100: +sgemm_kernel_L1_M4_100: SAVE4x1 -_L1_M4_END: +sgemm_kernel_L1_M4_END: subs I, I, #1 - bgt _L1_M4_20 + bgt sgemm_kernel_L1_M4_20 -_L1_M2_BEGIN: +sgemm_kernel_L1_M2_BEGIN: ldr I, M tst I , #3 - ble _L1_END + ble sgemm_kernel_L1_END tst I, #2 // I = I / 2 - ble _L1_M1_BEGIN + ble sgemm_kernel_L1_M1_BEGIN -_L1_M2_20: +sgemm_kernel_L1_M2_20: INIT2x1 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L1_M2_40 + ble sgemm_kernel_L1_M2_40 -_L1_M2_22: +sgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1352,42 +1352,42 @@ _L1_M2_22: KERNEL2x1_SUB subs L, L, #1 - bgt _L1_M2_22 + bgt sgemm_kernel_L1_M2_22 -_L1_M2_40: +sgemm_kernel_L1_M2_40: ands L , K1, #7 // L = L % 8 - ble _L1_M2_100 + ble sgemm_kernel_L1_M2_100 -_L1_M2_42: +sgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs L, L, #1 - bgt _L1_M2_42 + bgt sgemm_kernel_L1_M2_42 
-_L1_M2_100: +sgemm_kernel_L1_M2_100: SAVE2x1 -_L1_M2_END: +sgemm_kernel_L1_M2_END: -_L1_M1_BEGIN: +sgemm_kernel_L1_M1_BEGIN: tst I, #1 // I = I % 2 - ble _L1_END + ble sgemm_kernel_L1_END -_L1_M1_20: +sgemm_kernel_L1_M1_20: INIT1x1 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L1_M1_40 + ble sgemm_kernel_L1_M1_40 -_L1_M1_22: +sgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1399,30 +1399,30 @@ _L1_M1_22: KERNEL1x1_SUB subs L, L, #1 - bgt _L1_M1_22 + bgt sgemm_kernel_L1_M1_22 -_L1_M1_40: +sgemm_kernel_L1_M1_40: ands L , K1, #7 // L = L % 8 - ble _L1_M1_100 + ble sgemm_kernel_L1_M1_100 -_L1_M1_42: +sgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs L, L, #1 - bgt _L1_M1_42 + bgt sgemm_kernel_L1_M1_42 -_L1_M1_100: +sgemm_kernel_L1_M1_100: SAVE1x1 -_L1_END: +sgemm_kernel_L1_END: -_L999: +sgemm_kernel_L999: sub r3, fp, #128 vldm r3, { s8 - s31} // restore floating point registers diff --git a/kernel/arm/sgemm_ncopy_4_vfpv3.S b/kernel/arm/sgemm_ncopy_4_vfpv3.S index 8af7ed8f2..2d8fa2e24 100644 --- a/kernel/arm/sgemm_ncopy_4_vfpv3.S +++ b/kernel/arm/sgemm_ncopy_4_vfpv3.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/11/02 Saar +* 2013/11/05 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -68,7 +68,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define I r3 #define J r12 -#define A_PRE 96 +#define A_PRE 192 /************************************************************************************** * Macro definitions @@ -199,12 +199,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ldr BO, B -_L4_BEGIN: +sgemm_ncopy_L4_BEGIN: asrs J, N, #2 // J = N / 4 - ble _L2_BEGIN + ble sgemm_ncopy_L2_BEGIN -_L4_M4_BEGIN: +sgemm_ncopy_L4_M4_BEGIN: mov AO1, A // AO1 = A ldr r4 , LDA @@ -214,9 +214,9 @@ _L4_M4_BEGIN: add A , AO4, r4 // A = A + 4 * LDA asrs I, M, #2 // I = M / 4 - ble _L4_M4_40 + ble sgemm_ncopy_L4_M4_40 -_L4_M4_20: +sgemm_ncopy_L4_M4_20: pld [ AO1, #A_PRE ] pld [ AO2, #A_PRE ] @@ -225,45 +225,45 @@ _L4_M4_20: COPY4x4 subs I , I , #1 - ble _L4_M4_40 + ble sgemm_ncopy_L4_M4_40 COPY4x4 subs I , I , #1 - bne _L4_M4_20 + bne sgemm_ncopy_L4_M4_20 -_L4_M4_40: +sgemm_ncopy_L4_M4_40: ands I, M , #3 - ble _L4_M4_END + ble sgemm_ncopy_L4_M4_END -_L4_M4_60: +sgemm_ncopy_L4_M4_60: COPY1x4 subs I , I , #1 - bne _L4_M4_60 + bne sgemm_ncopy_L4_M4_60 -_L4_M4_END: +sgemm_ncopy_L4_M4_END: subs J , J, #1 // j-- - bne _L4_M4_BEGIN + bne sgemm_ncopy_L4_M4_BEGIN /*********************************************************************************************/ -_L2_BEGIN: +sgemm_ncopy_L2_BEGIN: tst N, #3 - ble _L999 + ble sgemm_ncopy_L999 tst N, #2 - ble _L1_BEGIN + ble sgemm_ncopy_L1_BEGIN -_L2_M4_BEGIN: +sgemm_ncopy_L2_M4_BEGIN: mov AO1, A // AO1 = A ldr r4 , LDA @@ -271,75 +271,75 @@ _L2_M4_BEGIN: add A , AO2, r4 // A = A + 2 * LDA asrs I, M, #2 // I = M / 4 - ble _L2_M4_40 + ble sgemm_ncopy_L2_M4_40 -_L2_M4_20: +sgemm_ncopy_L2_M4_20: COPY4x2 subs I , I , #1 - bne _L2_M4_20 + bne sgemm_ncopy_L2_M4_20 -_L2_M4_40: +sgemm_ncopy_L2_M4_40: ands I, M , #3 - ble _L2_M4_END + ble sgemm_ncopy_L2_M4_END -_L2_M4_60: +sgemm_ncopy_L2_M4_60: COPY1x2 subs I , I , #1 - bne _L2_M4_60 + bne sgemm_ncopy_L2_M4_60 -_L2_M4_END: +sgemm_ncopy_L2_M4_END: /*********************************************************************************************/ -_L1_BEGIN: +sgemm_ncopy_L1_BEGIN: tst N, #1 - ble _L999 + ble sgemm_ncopy_L999 -_L1_M4_BEGIN: +sgemm_ncopy_L1_M4_BEGIN: mov AO1, A // AO1 = A ldr r4 , LDA add A , AO1, r4 // A = A + 1 * LDA asrs I, M, #2 // I = M / 4 - ble _L1_M4_40 + ble 
sgemm_ncopy_L1_M4_40 -_L1_M4_20: +sgemm_ncopy_L1_M4_20: COPY4x1 subs I , I , #1 - bne _L1_M4_20 + bne sgemm_ncopy_L1_M4_20 -_L1_M4_40: +sgemm_ncopy_L1_M4_40: ands I, M , #3 - ble _L1_M4_END + ble sgemm_ncopy_L1_M4_END -_L1_M4_60: +sgemm_ncopy_L1_M4_60: COPY1x1 subs I , I , #1 - bne _L1_M4_60 + bne sgemm_ncopy_L1_M4_60 -_L1_M4_END: +sgemm_ncopy_L1_M4_END: -_L999: +sgemm_ncopy_L999: sub r3, fp, #128 vldm r3, { s8 - s15} // restore floating point registers From 80a2e901b119c65b470deb9758798cb14aabcbba Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 6 Nov 2013 20:01:18 +0100 Subject: [PATCH 26/81] added dgemm_tcopy_4_vfpv3.S and sgemm_tcopy_4_vfpv3.S --- kernel/arm/KERNEL.ARMV7 | 6 +- kernel/arm/dgemm_kernel_4x4_vfpv3.S | 4 +- kernel/arm/dgemm_tcopy_4_vfpv3.S | 408 ++++++++++++++++++++++++++ kernel/arm/sgemm_tcopy_4_vfpv3.S | 430 ++++++++++++++++++++++++++++ 4 files changed, 842 insertions(+), 6 deletions(-) create mode 100644 kernel/arm/dgemm_tcopy_4_vfpv3.S create mode 100644 kernel/arm/sgemm_tcopy_4_vfpv3.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index cdf370725..10bc4620a 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -90,19 +90,17 @@ SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S SGEMMINCOPY = SGEMMITCOPY = SGEMMONCOPY = sgemm_ncopy_4_vfpv3.S -SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMOTCOPY = sgemm_tcopy_4_vfpv3.S SGEMMINCOPYOBJ = SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o -#DGEMMKERNEL = ../generic/gemmkernel_2x2.c -#DGEMMKERNEL = dgemm_kernel_4x2_vfpv2.S DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S DGEMMINCOPY = DGEMMITCOPY = DGEMMONCOPY = dgemm_ncopy_4_vfpv3.S -DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMOTCOPY = dgemm_tcopy_4_vfpv3.S DGEMMINCOPYOBJ = DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy.o diff --git a/kernel/arm/dgemm_kernel_4x4_vfpv3.S b/kernel/arm/dgemm_kernel_4x4_vfpv3.S index 7d83def94..ed7f611f1 100644 --- a/kernel/arm/dgemm_kernel_4x4_vfpv3.S +++ 
b/kernel/arm/dgemm_kernel_4x4_vfpv3.S @@ -77,7 +77,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define B [fp, #4 ] #define C [fp, #8 ] -#define OLDdgemm_kernel_LDC [fp, #12 ] +#define OLD_LDC [fp, #12 ] #define I r0 #define J r1 @@ -883,7 +883,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. sub r3, fp, #128 vstm r3, { d8 - d15} // store floating point registers - ldr r3, OLDdgemm_kernel_LDC + ldr r3, OLD_LDC lsl r3, r3, #3 // ldc = ldc * 8 str r3, LDC diff --git a/kernel/arm/dgemm_tcopy_4_vfpv3.S b/kernel/arm/dgemm_tcopy_4_vfpv3.S new file mode 100644 index 000000000..88a139ad8 --- /dev/null +++ b/kernel/arm/dgemm_tcopy_4_vfpv3.S @@ -0,0 +1,408 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/06 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* Copies A into buffer B four LDA-strided lines at a time (then two, then one): full 4-wide tiles go to BO1, the 2-wide and 1-wide remainders to BO2 and BO3 +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define B [fp, #4 ] +#define A [fp, #-248 ] + +#define M r0 +#define N r1 +#define M4 r2 + +#define LDA r5 + +#define AO1 r6 +#define BO1 r7 +#define BO2 r8 +#define BO3 r9 + +#define I r4 +#define J r12 + +#define A_PRE 256 + +/************************************************************************************** +* Macro definitions: COPYixj loads i doubles from each of j LDA-strided lines starting at AO1 and stores them contiguously (i = 4 -> BO1, 2 -> BO2, 1 -> BO3) +**************************************************************************************/ + +.macro COPY4x4 + + pld [ AO1, #A_PRE ] + fldmiad AO1, { d0 - d3 } + + add r3, AO1, LDA + pld [ r3, #A_PRE ] + fldmiad r3, { d4 - d7 } + + add r3, r3, LDA + pld [ r3, #A_PRE ] + fldmiad r3, { d8 - d11 } + + add r3, r3, LDA + pld [ r3, #A_PRE ] + fldmiad r3, { d12 - d15 } + + fstmiad BO1, { d0 - d15 } + add AO1, AO1, #32 + add BO1, BO1, M4 + +.endm + +.macro
COPY2x4 + + fldmiad AO1, { d0 - d1 } + + add r3, AO1, LDA + fldmiad r3, { d2 - d3 } + + add r3, r3, LDA + fldmiad r3, { d4 - d5 } + + add r3, r3, LDA + fldmiad r3, { d6 - d7 } + + fstmiad BO2, { d0 - d7 } + add AO1, AO1, #16 + add BO2, BO2, #64 + +.endm + +.macro COPY1x4 + + fldmiad AO1, { d0 } + + add r3, AO1, LDA + fldmiad r3, { d1 } + + add r3, r3, LDA + fldmiad r3, { d2 } + + add r3, r3, LDA + fldmiad r3, { d3 } + + fstmiad BO3, { d0 - d3 } + add AO1, AO1, #8 + add BO3, BO3, #32 + +.endm + +/*************************************************************************************************************************/ + +.macro COPY4x2 + + pld [ AO1, #A_PRE ] + fldmiad AO1, { d0 - d3 } + + add r3, AO1, LDA + pld [ r3, #A_PRE ] + fldmiad r3, { d4 - d7 } + + fstmiad BO1, { d0 - d7 } + add AO1, AO1, #32 + add BO1, BO1, M4 + +.endm + +.macro COPY2x2 + + fldmiad AO1, { d0 - d1 } + + add r3, AO1, LDA + fldmiad r3, { d2 - d3 } + + fstmiad BO2, { d0 - d3 } + add AO1, AO1, #16 + add BO2, BO2, #32 + +.endm + +.macro COPY1x2 + + fldmiad AO1, { d0 } + + add r3, AO1, LDA + fldmiad r3, { d1 } + + fstmiad BO3, { d0 - d1 } + add AO1, AO1, #8 + add BO3, BO3, #16 + +.endm + +/*************************************************************************************************************************/ + +.macro COPY4x1 + + pld [ AO1, #A_PRE ] + fldmiad AO1, { d0 - d3 } + + fstmiad BO1, { d0 - d3 } + add AO1, AO1, #32 + add BO1, BO1, M4 + +.endm + +.macro COPY2x1 + + fldmiad AO1, { d0 - d1 } + + fstmiad BO2, { d0 - d1 } + add AO1, AO1, #16 + add BO2, BO2, #16 + +.endm + +.macro COPY1x1 + + fldmiad AO1, { d0 } + + fstmiad BO3, { d0 } + add AO1, AO1, #8 + add BO3, BO3, #8 + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve
stack + + str OLD_A, A // store A + + lsl LDA, OLD_LDA, #3 // lda = lda * SIZE + + sub r4, fp, #128 + vstm r4, { d8 - d15} // store floating point registers + + lsl r4 , M, #3 // M * SIZE + + ldr r3, B + + and BO2 , N , #-4 + and BO3 , N , #-2 + + mul BO2, BO2, r4 + mul BO3, BO3, r4 + + add BO2 , BO2, r3 // BO2 = B + (N & -4) * M * SIZE + add BO3 , BO3, r3 // BO3 = B + (N & -2) * M * SIZE + + lsl M4, M, #5 // M4 = M * 4 * SIZE + +dgemm_tcopy_L4_BEGIN: + + asrs J, M, #2 // J = M / 4 + ble dgemm_tcopy_L2_BEGIN + +dgemm_tcopy_L4_M4_BEGIN: + + ldr AO1, A // AO1 = A + lsl r3, LDA, #2 // r3 = 4 * LDA + add r3, r3 , AO1 // A = A + 4 * LDA + str r3, A // store A + + ldr BO1, B + add r3, BO1, #128 // B = B + 16 * SIZE + str r3, B + + asrs I, N, #2 // I = N / 4 + ble dgemm_tcopy_L4_M4_40 + +dgemm_tcopy_L4_M4_20: + + COPY4x4 + + subs I , I , #1 + bne dgemm_tcopy_L4_M4_20 + + +dgemm_tcopy_L4_M4_40: + + tst N , #2 // N & 2: 2-wide remainder + ble dgemm_tcopy_L4_M4_60 + + COPY2x4 + + +dgemm_tcopy_L4_M4_60: + + tst N, #1 // N & 1: 1-wide remainder + ble dgemm_tcopy_L4_M4_END + + COPY1x4 + + +dgemm_tcopy_L4_M4_END: + + subs J , J, #1 // j-- + bne dgemm_tcopy_L4_M4_BEGIN + + + +/*********************************************************************************************/ + +dgemm_tcopy_L2_BEGIN: + + tst M, #3 // nothing left when M % 4 == 0 + ble dgemm_tcopy_L999 + + tst M, #2 + ble dgemm_tcopy_L1_BEGIN + +dgemm_tcopy_L2_M4_BEGIN: + + ldr AO1, A // AO1 = A + lsl r3, LDA, #1 // r3 = 2 * LDA + add r3, r3 , AO1 // A = A + 2 * LDA + str r3, A // store A + + ldr BO1, B + add r3, BO1, #64 // B = B + 8 * SIZE + str r3, B + + asrs I, N, #2 // I = N / 4 + ble dgemm_tcopy_L2_M4_40 + +dgemm_tcopy_L2_M4_20: + + COPY4x2 + + subs I , I , #1 + bne dgemm_tcopy_L2_M4_20 + + +dgemm_tcopy_L2_M4_40: + + tst N , #2 + ble dgemm_tcopy_L2_M4_60 + + COPY2x2 + +dgemm_tcopy_L2_M4_60: + + tst N , #1 + ble dgemm_tcopy_L2_M4_END + + COPY1x2 + + +dgemm_tcopy_L2_M4_END: + + +/*********************************************************************************************/ + +dgemm_tcopy_L1_BEGIN: + + tst M, #1 + ble dgemm_tcopy_L999 + + +dgemm_tcopy_L1_M4_BEGIN: +
+ ldr AO1, A // AO1 = A + add r3, LDA , AO1 // A = A + 1 * LDA + str r3, A // store A + + ldr BO1, B + add r3, BO1, #32 // B = B + 4 * SIZE + str r3, B + + asrs I, N, #2 // I = N / 4 + ble dgemm_tcopy_L1_M4_40 + +dgemm_tcopy_L1_M4_20: + + COPY4x1 + + subs I , I , #1 + bne dgemm_tcopy_L1_M4_20 + + +dgemm_tcopy_L1_M4_40: + + tst N , #2 + ble dgemm_tcopy_L1_M4_60 + + COPY2x1 + +dgemm_tcopy_L1_M4_60: + + tst N , #1 + ble dgemm_tcopy_L1_M4_END + + COPY1x1 + + +dgemm_tcopy_L1_M4_END: + + + +dgemm_tcopy_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + mov r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/sgemm_tcopy_4_vfpv3.S b/kernel/arm/sgemm_tcopy_4_vfpv3.S new file mode 100644 index 000000000..b0a3278ff --- /dev/null +++ b/kernel/arm/sgemm_tcopy_4_vfpv3.S @@ -0,0 +1,430 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/06 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* Copies A into buffer B four LDA-strided lines at a time (then two, then one): full 4-wide tiles go to BO1, the 2-wide and 1-wide remainders to BO2 and BO3 +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers (this file saves s8 - s15 at [fp, #-128]) +*******************************************************/ + +#define B [fp, #4 ] +#define A [fp, #-248 ] + +#define M r0 +#define N r1 +#define M4 r2 + +#define LDA r5 + +#define AO1 r6 +#define BO1 r7 +#define BO2 r8 +#define BO3 r9 + +#define I r4 +#define J r12 + +#define A_PRE 256 + +/************************************************************************************** +* Macro definitions: COPYixj loads i floats from each of j LDA-strided lines starting at AO1 and stores them contiguously (i = 4 -> BO1, 2 -> BO2, 1 -> BO3); COPY4x4_1 also prefetches, COPY4x4_2 does not +**************************************************************************************/ + +.macro COPY4x4_1 + + pld [ AO1, #A_PRE ] + fldmias AO1, { s0 - s3 } + + add r3, AO1, LDA + pld [ r3, #A_PRE ] + fldmias r3, { s4 - s7 } + + add r3, r3, LDA + pld [ r3, #A_PRE ] + fldmias r3, { s8 - s11 } + + add r3, r3, LDA + pld [ r3, #A_PRE ] + fldmias r3, { s12 - s15 } + + fstmias BO1, { s0 - s15 } + add AO1, AO1, #16 + add BO1, BO1, M4 + +.endm + +.macro
COPY4x4_2 + + fldmias AO1, { s0 - s3 } + + add r3, AO1, LDA + fldmias r3, { s4 - s7 } + + add r3, r3, LDA + fldmias r3, { s8 - s11 } + + add r3, r3, LDA + fldmias r3, { s12 - s15 } + + fstmias BO1, { s0 - s15 } + add AO1, AO1, #16 + add BO1, BO1, M4 + +.endm + + +.macro COPY2x4 + + fldmias AO1, { s0 - s1 } + + add r3, AO1, LDA + fldmias r3, { s2 - s3 } + + add r3, r3, LDA + fldmias r3, { s4 - s5 } + + add r3, r3, LDA + fldmias r3, { s6 - s7 } + + fstmias BO2, { s0 - s7 } + add AO1, AO1, #8 + add BO2, BO2, #32 + +.endm + +.macro COPY1x4 + + fldmias AO1, { s0 } + + add r3, AO1, LDA + fldmias r3, { s1 } + + add r3, r3, LDA + fldmias r3, { s2 } + + add r3, r3, LDA + fldmias r3, { s3 } + + fstmias BO3, { s0 - s3 } + add AO1, AO1, #4 + add BO3, BO3, #16 + +.endm + +/*************************************************************************************************************************/ + +.macro COPY4x2 + + fldmias AO1, { s0 - s3 } + + add r3, AO1, LDA + fldmias r3, { s4 - s7 } + + fstmias BO1, { s0 - s7 } + add AO1, AO1, #16 + add BO1, BO1, M4 + +.endm + +.macro COPY2x2 + + fldmias AO1, { s0 - s1 } + + add r3, AO1, LDA + fldmias r3, { s2 - s3 } + + fstmias BO2, { s0 - s3 } + add AO1, AO1, #8 + add BO2, BO2, #16 + +.endm + +.macro COPY1x2 + + fldmias AO1, { s0 } + + add r3, AO1, LDA + fldmias r3, { s1 } + + fstmias BO3, { s0 - s1 } + add AO1, AO1, #4 + add BO3, BO3, #8 + +.endm + +/*************************************************************************************************************************/ + +.macro COPY4x1 + + fldmias AO1, { s0 - s3 } + + fstmias BO1, { s0 - s3 } + add AO1, AO1, #16 + add BO1, BO1, M4 + +.endm + +.macro COPY2x1 + + fldmias AO1, { s0 - s1 } + + fstmias BO2, { s0 - s1 } + add AO1, AO1, #8 + add BO2, BO2, #8 + +.endm + +.macro COPY1x1 + + fldmias AO1, { s0 } + + fstmias BO3, { s0 } + add AO1, AO1, #4 + add BO3, BO3, #4 + +.endm + + + +/************************************************************************************** +* End of macro
definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_A, A // store A + + lsl LDA, OLD_LDA, #2 // lda = lda * SIZE + + sub r4, fp, #128 + vstm r4, { s8 - s15} // store floating point registers + + lsl r4 , M, #2 // M * SIZE + + ldr r3, B + + and BO2 , N , #-4 + and BO3 , N , #-2 + + mul BO2, BO2, r4 + mul BO3, BO3, r4 + + add BO2 , BO2, r3 // BO2 = B + (N & -4) * M * SIZE + add BO3 , BO3, r3 // BO3 = B + (N & -2) * M * SIZE + + lsl M4, M, #4 // M4 = M * 4 * SIZE + +sgemm_tcopy_L4_BEGIN: + + asrs J, M, #2 // J = M / 4 + ble sgemm_tcopy_L2_BEGIN + +sgemm_tcopy_L4_M4_BEGIN: + + ldr AO1, A // AO1 = A + lsl r3, LDA, #2 // r3 = 4 * LDA + add r3, r3 , AO1 // A = A + 4 * LDA + str r3, A // store A + + ldr BO1, B + add r3, BO1, #64 // B = B + 16 * SIZE + str r3, B + + asrs I, N, #2 // I = N / 4 + ble sgemm_tcopy_L4_M4_40 + +sgemm_tcopy_L4_M4_20: + + COPY4x4_1 + + subs I , I , #1 + ble sgemm_tcopy_L4_M4_40 + + COPY4x4_2 + + subs I , I , #1 + bne sgemm_tcopy_L4_M4_20 + + +sgemm_tcopy_L4_M4_40: + + tst N , #2 // N & 2: 2-wide remainder + ble sgemm_tcopy_L4_M4_60 + + COPY2x4 + + +sgemm_tcopy_L4_M4_60: + + tst N, #1 // N & 1: 1-wide remainder + ble sgemm_tcopy_L4_M4_END + + COPY1x4 + + +sgemm_tcopy_L4_M4_END: + + subs J , J, #1 // j-- + bne sgemm_tcopy_L4_M4_BEGIN + + + +/*********************************************************************************************/ + +sgemm_tcopy_L2_BEGIN: + + tst M, #3 // nothing left when M % 4 == 0 + ble sgemm_tcopy_L999 + + tst M, #2 + ble sgemm_tcopy_L1_BEGIN + +sgemm_tcopy_L2_M4_BEGIN: + + ldr AO1, A // AO1 = A + lsl r3, LDA, #1 // r3 = 2 * LDA + add r3, r3 , AO1 // A = A + 2 * LDA + str r3, A // store A + + ldr BO1, B + add r3, BO1, #32 // B = B + 8 * SIZE + str r3, B + + asrs I, N, #2 // I = N / 4 + ble sgemm_tcopy_L2_M4_40 + +sgemm_tcopy_L2_M4_20: + + COPY4x2 + + subs I , I , #1 + bne sgemm_tcopy_L2_M4_20 + + +sgemm_tcopy_L2_M4_40: + + tst N , #2 + ble sgemm_tcopy_L2_M4_60 + + COPY2x2 + +sgemm_tcopy_L2_M4_60: + + tst N ,
#1 + ble sgemm_tcopy_L2_M4_END + + COPY1x2 + + +sgemm_tcopy_L2_M4_END: + + +/*********************************************************************************************/ + +sgemm_tcopy_L1_BEGIN: + + tst M, #1 + ble sgemm_tcopy_L999 + + +sgemm_tcopy_L1_M4_BEGIN: + + ldr AO1, A // AO1 = A + add r3, LDA , AO1 // A = A + 1 * LDA + str r3, A // store A + + ldr BO1, B + add r3, BO1, #16 // B = B + 4 * SIZE + str r3, B + + asrs I, N, #2 // I = N / 4 + ble sgemm_tcopy_L1_M4_40 + +sgemm_tcopy_L1_M4_20: + + COPY4x1 + + subs I , I , #1 + bne sgemm_tcopy_L1_M4_20 + + +sgemm_tcopy_L1_M4_40: + + tst N , #2 + ble sgemm_tcopy_L1_M4_60 + + COPY2x1 + +sgemm_tcopy_L1_M4_60: + + tst N , #1 + ble sgemm_tcopy_L1_M4_END + + COPY1x1 + + +sgemm_tcopy_L1_M4_END: + + + +sgemm_tcopy_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + mov r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + From 1e8128f41c54a449de9c36a6fbd8c86d351f5215 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 7 Nov 2013 17:15:50 +0100 Subject: [PATCH 27/81] added cgemm_tcopy_2_vfpv3.S and zgemm_tcopy_2_vfpv3.S --- kernel/arm/cgemm_tcopy_2_vfpv3.S | 243 ++++++++++++++++++++++++++++++ kernel/arm/zgemm_tcopy_2_vfpv3.S | 245 +++++++++++++++++++++++++++++++ 2 files changed, 488 insertions(+) create mode 100644 kernel/arm/cgemm_tcopy_2_vfpv3.S create mode 100644 kernel/arm/zgemm_tcopy_2_vfpv3.S diff --git a/kernel/arm/cgemm_tcopy_2_vfpv3.S b/kernel/arm/cgemm_tcopy_2_vfpv3.S new file mode 100644 index 000000000..9036b994d --- /dev/null +++ b/kernel/arm/cgemm_tcopy_2_vfpv3.S @@ -0,0 +1,243 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1.
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/07 Saar
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+*
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define OLD_M r0
+#define OLD_N r1
+#define OLD_A r2
+#define OLD_LDA r3
+// Incoming arguments: r0 = M, r1 = N, r2 = A, r3 = LDA (in complex elements); B is the first stack argument.
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* registers
+*******************************************************/
+
+#define B [fp, #4 ]
+#define A [fp, #-248 ]
+
+#define M r0
+#define N r1
+#define M4 r2
+
+#define LDA r5
+
+#define AO1 r6
+#define BO1 r7
+#define BO2 r8
+
+#define I r4
+#define J r12
+
+#define A_PRE 256
+// A_PRE is not referenced by the macros in this file.
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+.macro COPY2x2
+// Load 4 floats from AO1 and 4 from AO1 + LDA, store all 8 contiguously at BO1; AO1 += 16, BO1 += M4.
+        fldmias AO1, { s0 - s3 }
+
+        add     r3, AO1, LDA
+        fldmias r3, { s4 - s7 }
+
+        fstmias BO1, { s0 - s7 }
+        add     AO1, AO1, #16
+        add     BO1, BO1, M4
+
+.endm
+
+.macro COPY1x2
+// Load 2 floats from AO1 and 2 from AO1 + LDA, store the 4 at BO2; AO1 += 8, BO2 += 16.
+        fldmias AO1, { s0 -s1 }
+
+        add     r3, AO1, LDA
+        fldmias r3, { s2 - s3 }
+
+        fstmias BO2, { s0 - s3 }
+        add     AO1, AO1, #8
+        add     BO2, BO2, #16
+
+.endm
+
+/*************************************************************************************************************************/
+.macro COPY2x1
+// Copy 4 floats from AO1 to BO1; AO1 += 16, BO1 += M4.
+        fldmias AO1, { s0 - s3 }
+
+        fstmias BO1, { s0 - s3 }
+        add     AO1, AO1, #16
+        add     BO1, BO1, M4
+
+.endm
+
+.macro COPY1x1
+// Copy 2 floats from AO1 to BO2; AO1 += 8, BO2 += 8.
+        fldmias AO1, { s0 - s1 }
+
+        fstmias BO2, { s0 - s1 }
+        add     AO1, AO1, #8
+        add     BO2, BO2, #8
+
+.endm
+// Kernel body: walks A two LDA-strided vectors at a time (J = M / 2), then one leftover vector when M is odd.
+// Per vector (pair), N / 2 two-element groups are written through BO1 (stride M4 = M * 16 bytes) and an odd
+// last element through BO2, which starts at B + (N & -2) * M * 8 bytes. r0 is set to 0 on return.
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+        PROLOGUE
+
+        .align 5
+
+        push    {r4 - r9, fp}
+        add     fp, sp, #24
+        sub     sp, sp, #STACKSIZE                      // reserve stack
+
+        str     OLD_A, A                                // store A
+
+        lsl     LDA, OLD_LDA, #3                        // lda = lda * SIZE * 2
+
+        sub     r4, fp, #128
+        vstm    r4, { s8 - s15}                         // store floating point registers
+
+        lsl     r4 , M, #3                              // M * SIZE * 2
+
+        ldr     r3, B
+
+        and     BO2 , N , #-2
+
+        mul     BO2, BO2, r4
+
+        add     BO2 , BO2, r3
+
+        lsl     M4, M, #4                               // M4 = M * 2 * SIZE * 2
+
+cgemm_tcopy_L2_BEGIN:
+        asr     J, M, #1                                // J = M / 2
+        cmp     J, #0                                   // nothing above writes V; cmp defines N, Z and V for the ble
+        ble     cgemm_tcopy_L1_BEGIN
+
+cgemm_tcopy_L2_M2_BEGIN:
+
+        ldr     AO1, A                                  // AO1 = A
+        lsl     r3, LDA, #1                             // r3 = 2 * LDA
+        add     r3, r3 , AO1                            // A = A + 2 * LDA
+        str     r3, A                                   // store A
+
+        ldr     BO1, B
+        add     r3, BO1, #32                            // B = B + 4 * SIZE *2
+        str     r3, B
+
+        asrs    I, N, #1                                // I = N / 2
+        ble     cgemm_tcopy_L2_M2_60
+
+cgemm_tcopy_L2_M2_40:
+
+        COPY2x2
+        subs    I, I, #1
+        bne     cgemm_tcopy_L2_M2_40
+
+cgemm_tcopy_L2_M2_60:
+
+        tst     N , #1
+        beq     cgemm_tcopy_L2_M2_END                   // bit 0 clear: no odd element
+
+        COPY1x2
+
+
+cgemm_tcopy_L2_M2_END:
+
+        subs    J , J, #1                               // j--
+        bne     cgemm_tcopy_L2_M2_BEGIN
+
+/*********************************************************************************************/
+
+cgemm_tcopy_L1_BEGIN:
+
+        tst     M, #1
+        beq     cgemm_tcopy_L999                        // M even: no leftover vector
+
+
+cgemm_tcopy_L1_M2_BEGIN:
+
+        ldr     AO1, A                                  // AO1 = A
+        add     r3, LDA , AO1                           // A = A + 1 * LDA
+        str     r3, A                                   // store A
+
+        ldr     BO1, B
+        add     r3, BO1, #16                            // B = B + 2 * SIZE *2
+        str     r3, B
+
+        asrs    I, N, #1                                // I = N / 2
+        ble     cgemm_tcopy_L1_M2_60
+
+
+cgemm_tcopy_L1_M2_40:
+
+        COPY2x1
+        subs    I, I, #1
+        bne     cgemm_tcopy_L1_M2_40
+
+cgemm_tcopy_L1_M2_60:
+
+        tst     N , #1
+        beq     cgemm_tcopy_L1_M2_END                   // bit 0 clear: no odd element
+
+        COPY1x1
+
+
+cgemm_tcopy_L1_M2_END:
+
+
+
+cgemm_tcopy_L999:
+
+        sub     r3, fp, #128
+        vldm    r3, { s8 - s15}                         // restore floating point registers
+
+        mov     r0, #0                                  // set return value
+        sub     sp, fp, #24
+        pop     {r4 - r9, fp}
+        bx      lr
+
+        EPILOGUE
+
diff --git a/kernel/arm/zgemm_tcopy_2_vfpv3.S b/kernel/arm/zgemm_tcopy_2_vfpv3.S new file
mode 100644 index 000000000..7e27ca6a6 --- /dev/null +++ b/kernel/arm/zgemm_tcopy_2_vfpv3.S @@ -0,0 +1,245 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/07 Saar
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+*
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define OLD_M r0
+#define OLD_N r1
+#define OLD_A r2
+#define OLD_LDA r3
+// Incoming arguments: r0 = M, r1 = N, r2 = A, r3 = LDA (in complex elements); B is the first stack argument.
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* registers
+*******************************************************/
+
+#define B [fp, #4 ]
+#define A [fp, #-248 ]
+
+#define M r0
+#define N r1
+#define M4 r2
+
+#define LDA r5
+
+#define AO1 r6
+#define BO1 r7
+#define BO2 r8
+
+#define I r4
+#define J r12
+
+#define A_PRE 256
+// A_PRE: byte offset ahead of the read pointer that COPY2x2 prefetches with pld.
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+.macro COPY2x2
+// Load 4 doubles from AO1 and 4 from AO1 + LDA, store all 8 contiguously at BO1; AO1 += 32, BO1 += M4.
+        pld     [ AO1, #A_PRE ]
+        fldmiad AO1, { d0 - d3 }
+
+        add     r3, AO1, LDA
+        pld     [ r3, #A_PRE ]
+        fldmiad r3, { d4 - d7 }
+
+        fstmiad BO1, { d0 - d7 }
+        add     AO1, AO1, #32
+        add     BO1, BO1, M4
+
+.endm
+
+.macro COPY1x2
+// Load 2 doubles from AO1 and 2 from AO1 + LDA, store the 4 at BO2; AO1 += 16, BO2 += 32.
+        fldmiad AO1, { d0 -d1 }
+
+        add     r3, AO1, LDA
+        fldmiad r3, { d2 - d3 }
+
+        fstmiad BO2, { d0 - d3 }
+        add     AO1, AO1, #16
+        add     BO2, BO2, #32
+
+.endm
+
+/*************************************************************************************************************************/
+.macro COPY2x1
+// Copy 4 doubles from AO1 to BO1; AO1 += 32, BO1 += M4.
+        fldmiad AO1, { d0 - d3 }
+
+        fstmiad BO1, { d0 - d3 }
+        add     AO1, AO1, #32
+        add     BO1, BO1, M4
+
+.endm
+
+.macro COPY1x1
+// Copy 2 doubles from AO1 to BO2; AO1 += 16, BO2 += 16.
+        fldmiad AO1, { d0 - d1 }
+
+        fstmiad BO2, { d0 - d1 }
+        add     AO1, AO1, #16
+        add     BO2, BO2, #16
+
+.endm
+// Kernel body: walks A two LDA-strided vectors at a time (J = M / 2), then one leftover vector when M is odd.
+// Per vector (pair), N / 2 two-element groups are written through BO1 (stride M4 = M * 32 bytes) and an odd
+// last element through BO2, which starts at B + (N & -2) * M * 16 bytes. r0 is set to 0 on return.
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+        PROLOGUE
+
+        .align 5
+
+        push    {r4 - r9, fp}
+        add     fp, sp, #24
+        sub     sp, sp, #STACKSIZE                      // reserve stack
+
+        str     OLD_A, A                                // store A
+
+        lsl     LDA, OLD_LDA, #4                        // lda = lda * SIZE * 2
+
+        sub     r4, fp, #128
+        vstm    r4, { d8 - d15}                         // store floating point registers
+
+        lsl     r4 , M, #4                              // M * SIZE * 2
+
+        ldr     r3, B
+
+        and     BO2 , N , #-2
+
+        mul     BO2, BO2, r4
+
+        add     BO2 , BO2, r3
+
+        lsl     M4, M, #5                               // M4 = M * 2 * SIZE * 2
+
+zgemm_tcopy_L2_BEGIN:
+        asr     J, M, #1                                // J = M / 2
+        cmp     J, #0                                   // nothing above writes V; cmp defines N, Z and V for the ble
+        ble     zgemm_tcopy_L1_BEGIN
+
+zgemm_tcopy_L2_M2_BEGIN:
+
+        ldr     AO1, A                                  // AO1 = A
+        lsl     r3, LDA, #1                             // r3 = 2 * LDA
+        add     r3, r3 , AO1                            // A = A + 2 * LDA
+        str     r3, A                                   // store A
+
+        ldr     BO1, B
+        add     r3, BO1, #64                            // B = B + 4 * SIZE *2
+        str     r3, B
+
+        asrs    I, N, #1                                // I = N / 2
+        ble     zgemm_tcopy_L2_M2_60
+
+zgemm_tcopy_L2_M2_40:
+
+        COPY2x2
+        subs    I, I, #1
+        bne     zgemm_tcopy_L2_M2_40
+
+zgemm_tcopy_L2_M2_60:
+
+        tst     N , #1
+        beq     zgemm_tcopy_L2_M2_END                   // bit 0 clear: no odd element
+
+        COPY1x2
+
+
+zgemm_tcopy_L2_M2_END:
+
+        subs    J , J, #1                               // j--
+        bne     zgemm_tcopy_L2_M2_BEGIN
+
+/*********************************************************************************************/
+
+zgemm_tcopy_L1_BEGIN:
+
+        tst     M, #1
+        beq     zgemm_tcopy_L999                        // M even: no leftover vector
+
+
+zgemm_tcopy_L1_M2_BEGIN:
+
+        ldr     AO1, A                                  // AO1 = A
+        add     r3, LDA , AO1                           // A = A + 1 * LDA
+        str     r3, A                                   // store A
+
+        ldr     BO1, B
+        add     r3, BO1, #32                            // B = B + 2 * SIZE *2
+        str     r3, B
+
+        asrs    I, N, #1                                // I = N / 2
+        ble     zgemm_tcopy_L1_M2_60
+
+
+zgemm_tcopy_L1_M2_40:
+
+        COPY2x1
+        subs    I, I, #1
+        bne     zgemm_tcopy_L1_M2_40
+
+zgemm_tcopy_L1_M2_60:
+
+        tst     N , #1
+        beq     zgemm_tcopy_L1_M2_END                   // bit 0 clear: no odd element
+
+        COPY1x1
+
+
+zgemm_tcopy_L1_M2_END:
+
+
+
+zgemm_tcopy_L999:
+
+        sub     r3, fp, #128
+        vldm    r3, { d8 - d15}                         // restore floating point registers
+
+        mov     r0, #0                                  // set return value
+        sub     sp, fp, #24
+        pop     {r4 - r9, fp}
+        bx      lr
+
+        EPILOGUE
+
From 8fa93be06ec13bccb1d2e710615bb42e3b694c92 Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Thu, 7 Nov 2013 17:18:56 +0100 Subject: [PATCH 28/81] added optimized blas level1 copy kernels --- kernel/arm/ccopy_vfpv3.S | 222 ++++++++++++++++++++++++++++++++++++++ kernel/arm/dcopy_vfpv3.S | 222 ++++++++++++++++++++++++++++++++++++++ kernel/arm/scopy_vfpv3.S | 224 +++++++++++++++++++++++++++++++++++++++ kernel/arm/zcopy_vfpv3.S | 223 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 891 insertions(+) create mode 100644 kernel/arm/ccopy_vfpv3.S create mode 100644 kernel/arm/dcopy_vfpv3.S create mode 100644 kernel/arm/scopy_vfpv3.S create mode 100644 kernel/arm/zcopy_vfpv3.S diff --git a/kernel/arm/ccopy_vfpv3.S b/kernel/arm/ccopy_vfpv3.S new file mode 100644 index 000000000..aaba7825e --- /dev/null +++ b/kernel/arm/ccopy_vfpv3.S @@ -0,0 +1,222 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/07 Saar
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+*
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define N r0
+#define X r1
+#define INC_X r2
+#define OLD_Y r3
+// Incoming arguments: r0 = N, r1 = X, r2 = INC_X, r3 = Y; INC_Y is the first stack argument.
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* registers
+*******************************************************/
+
+#define OLD_INC_Y [fp, #4 ]
+
+#define I r5
+#define Y r6
+#define INC_Y r7
+
+#define X_PRE 256
+// X_PRE: byte offset ahead of X that COPY_F4 prefetches with pld.
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+.macro COPY_F4
+// Unit-stride step: copy 8 floats (4 complex elements) from X to Y, post-incrementing both pointers.
+        pld     [ X, #X_PRE ]
+        fldmias X!, { s0 - s7 }
+        fstmias Y!, { s0 - s7 }
+
+.endm
+
+.macro COPY_F1
+// Unit-stride step: copy 2 floats (1 complex element), post-incrementing both pointers.
+        fldmias X!, { s0 - s1 }
+        fstmias Y!, { s0 - s1 }
+
+.endm
+
+
+/*************************************************************************************************************************/
+
+.macro COPY_S4
+// Strided step: copy 4 complex elements one at a time, advancing X by INC_X and Y by INC_Y (byte strides).
+        nop
+        fldmias X, { s0 - s1 }
+        fstmias Y, { s0 - s1 }
+        add     X, X, INC_X
+        add     Y, Y, INC_Y
+
+        fldmias X, { s2 - s3 }
+        fstmias Y, { s2 - s3 }
+        add     X, X, INC_X
+        add     Y, Y, INC_Y
+
+        fldmias X, { s0 - s1 }
+        fstmias Y, { s0 - s1 }
+        add     X, X, INC_X
+        add     Y, Y, INC_Y
+
+        fldmias X, { s2 - s3 }
+        fstmias Y, { s2 - s3 }
+        add     X, X, INC_X
+        add     Y, Y, INC_Y
+
+.endm
+
+
+.macro COPY_S1
+// Strided step: copy 1 complex element, then advance X by INC_X and Y by INC_Y.
+        fldmias X, { s0 - s1 }
+        fstmias Y, { s0 - s1 }
+        add     X, X, INC_X
+        add     Y, Y, INC_Y
+
+.endm
+// Kernel body: returns immediately when N <= 0 or either increment is 0. When both increments are 1 it uses
+// the F (unit-stride) loops; otherwise the increments are scaled to bytes (<< 3) and the S (strided) loops run.
+// r0 is set to 0 on return.
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+        PROLOGUE
+
+        .align 5
+
+        push    {r4 - r9, fp}
+        add     fp, sp, #24
+        sub     sp, sp, #STACKSIZE                      // reserve stack
+
+        sub     r4, fp, #128
+        vstm    r4, { s8 - s15}                         // store floating point registers
+
+        mov     Y, OLD_Y
+        ldr     INC_Y, OLD_INC_Y
+
+        cmp     N, #0
+        ble     ccopy_kernel_L999
+
+        cmp     INC_X, #0
+        beq     ccopy_kernel_L999
+
+        cmp     INC_Y, #0
+        beq     ccopy_kernel_L999
+
+        cmp     INC_X, #1
+        bne     ccopy_kernel_S_BEGIN
+
+        cmp     INC_Y, #1
+        bne     ccopy_kernel_S_BEGIN
+
+ccopy_kernel_F_BEGIN:
+
+        asrs    I, N, #2                                // I = N / 4
+        ble     ccopy_kernel_F1
+
+ccopy_kernel_F4:
+
+        COPY_F4
+
+        subs    I, I, #1
+        bne     ccopy_kernel_F4
+
+ccopy_kernel_F1:
+// NOTE(review): ands does not write V, so this ble relies on V being clear from the preceding cmp/subs.
+        ands    I, N, #3                                // I = N % 4
+        ble     ccopy_kernel_L999
+
+ccopy_kernel_F10:
+
+        COPY_F1
+
+        subs    I, I, #1
+        bne     ccopy_kernel_F10
+
+        b       ccopy_kernel_L999
+
+ccopy_kernel_S_BEGIN:
+
+        lsl     INC_X, INC_X, #3                        // INC_X * SIZE * 2
+        lsl     INC_Y, INC_Y, #3                        // INC_Y * SIZE * 2
+
+        asrs    I, N, #2                                // I = N / 4
+        ble     ccopy_kernel_S1
+
+ccopy_kernel_S4:
+
+        COPY_S4
+
+        subs    I, I, #1
+        bne     ccopy_kernel_S4
+
+ccopy_kernel_S1:
+
+        ands    I, N, #3                                // I = N % 4
+        ble     ccopy_kernel_L999
+
+ccopy_kernel_S10:
+
+        COPY_S1
+
+        subs    I, I, #1
+        bne     ccopy_kernel_S10
+
+
+
+
+
+
+ccopy_kernel_L999:
+
+        sub     r3, fp, #128
+        vldm    r3, { s8 - s15}                         // restore floating point registers
+
+        mov     r0, #0                                  // set return value
+        sub     sp, fp, #24
+        pop     {r4 - r9, fp}
+        bx      lr
+
+        EPILOGUE
+
diff --git a/kernel/arm/dcopy_vfpv3.S b/kernel/arm/dcopy_vfpv3.S
new file mode 100644
index 000000000..0fad3c4a6
--- /dev/null
+++
b/kernel/arm/dcopy_vfpv3.S @@ -0,0 +1,222 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/07 Saar
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+*
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define N r0
+#define X r1
+#define INC_X r2
+#define OLD_Y r3
+// Incoming arguments: r0 = N, r1 = X, r2 = INC_X, r3 = Y; INC_Y is the first stack argument.
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* registers
+*******************************************************/
+
+#define OLD_INC_Y [fp, #4 ]
+
+#define I r5
+#define Y r6
+#define INC_Y r7
+
+#define X_PRE 256
+// X_PRE: byte offset ahead of X that COPY_F4 prefetches with pld.
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+.macro COPY_F4
+// Unit-stride step: copy 4 doubles from X to Y, post-incrementing both pointers.
+        pld     [ X, #X_PRE ]
+        fldmiad X!, { d0 - d3 }
+        fstmiad Y!, { d0 - d3 }
+
+.endm
+
+.macro COPY_F1
+// Unit-stride step: copy 1 double, post-incrementing both pointers.
+        fldmiad X!, { d0 }
+        fstmiad Y!, { d0 }
+
+.endm
+
+
+/*************************************************************************************************************************/
+
+.macro COPY_S4
+// Strided step: copy 4 doubles one at a time, advancing X by INC_X and Y by INC_Y (byte strides).
+        nop
+        fldmiad X, { d0 }
+        fstmiad Y, { d0 }
+        add     X, X, INC_X
+        add     Y, Y, INC_Y
+
+        fldmiad X, { d1 }
+        fstmiad Y, { d1 }
+        add     X, X, INC_X
+        add     Y, Y, INC_Y
+
+        fldmiad X, { d0 }
+        fstmiad Y, { d0 }
+        add     X, X, INC_X
+        add     Y, Y, INC_Y
+
+        fldmiad X, { d1 }
+        fstmiad Y, { d1 }
+        add     X, X, INC_X
+        add     Y, Y, INC_Y
+
+.endm
+
+
+.macro COPY_S1
+// Strided step: copy 1 double, then advance X by INC_X and Y by INC_Y.
+        fldmiad X, { d0 }
+        fstmiad Y, { d0 }
+        add     X, X, INC_X
+        add     Y, Y, INC_Y
+
+.endm
+// Kernel body: returns immediately when N <= 0 or either increment is 0. When both increments are 1 it uses
+// the F (unit-stride) loops; otherwise the increments are scaled to bytes (<< 3) and the S (strided) loops run.
+// r0 is set to 0 on return.
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+        PROLOGUE
+
+        .align 5
+
+        push    {r4 - r9, fp}
+        add     fp, sp, #24
+        sub     sp, sp, #STACKSIZE                      // reserve stack
+
+        sub     r4, fp, #128
+        vstm    r4, { d8 - d15}                         // store floating point registers
+
+        mov     Y, OLD_Y
+        ldr     INC_Y, OLD_INC_Y
+
+        cmp     N, #0
+        ble     dcopy_kernel_L999
+
+        cmp     INC_X, #0
+        beq     dcopy_kernel_L999
+
+        cmp     INC_Y, #0
+        beq     dcopy_kernel_L999
+
+        cmp     INC_X, #1
+        bne     dcopy_kernel_S_BEGIN
+
+        cmp     INC_Y, #1
+        bne     dcopy_kernel_S_BEGIN
+
+dcopy_kernel_F_BEGIN:
+
+        asrs    I, N, #2                                // I = N / 4
+        ble     dcopy_kernel_F1
+
+dcopy_kernel_F4:
+
+        COPY_F4
+
+        subs    I, I, #1
+        bne     dcopy_kernel_F4
+
+dcopy_kernel_F1:
+// NOTE(review): ands does not write V, so this ble relies on V being clear from the preceding cmp/subs.
+        ands    I, N, #3                                // I = N % 4
+        ble     dcopy_kernel_L999
+
+dcopy_kernel_F10:
+
+        COPY_F1
+
+        subs    I, I, #1
+        bne     dcopy_kernel_F10
+
+        b       dcopy_kernel_L999
+
+dcopy_kernel_S_BEGIN:
+
+        lsl     INC_X, INC_X, #3                        // INC_X * SIZE
+        lsl     INC_Y, INC_Y, #3                        // INC_Y * SIZE
+
+        asrs    I, N, #2                                // I = N / 4
+        ble     dcopy_kernel_S1
+
+dcopy_kernel_S4:
+
+        COPY_S4
+
+        subs    I, I, #1
+        bne     dcopy_kernel_S4
+
+dcopy_kernel_S1:
+
+        ands    I, N, #3                                // I = N % 4
+        ble     dcopy_kernel_L999
+
+dcopy_kernel_S10:
+
+        COPY_S1
+
+        subs    I, I, #1
+        bne     dcopy_kernel_S10
+
+
+
+
+
+
+dcopy_kernel_L999:
+
+        sub     r3, fp, #128
+        vldm    r3, { d8 - d15}                         // restore floating point registers
+
+        mov     r0, #0                                  // set return value
+        sub     sp, fp, #24
+        pop     {r4 - r9, fp}
+        bx      lr
+
+        EPILOGUE
+
diff --git a/kernel/arm/scopy_vfpv3.S b/kernel/arm/scopy_vfpv3.S
new file mode 100644
index 000000000..e6ceaf2fb
--- /dev/null
+++ b/kernel/arm/scopy_vfpv3.S
@@ -0,0 +1,224 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2.
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/07 Saar
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+*
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define N r0
+#define X r1
+#define INC_X r2
+#define OLD_Y r3
+// Incoming arguments: r0 = N, r1 = X, r2 = INC_X, r3 = Y; INC_Y is the first stack argument.
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* registers
+*******************************************************/
+
+#define OLD_INC_Y [fp, #4 ]
+
+#define I r5
+#define Y r6
+#define INC_Y r7
+
+#define X_PRE 256
+// X_PRE: byte offset ahead of X that COPY_F8 prefetches with pld.
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+.macro COPY_F8
+// Unit-stride step: copy 8 floats from X to Y in two 4-register groups, post-incrementing both pointers.
+        pld     [ X, #X_PRE ]
+        fldmias X!, { s0 - s3 }
+        fldmias X!, { s4 - s7 }
+        fstmias Y!, { s0 - s3 }
+        fstmias Y!, { s4 - s7 }
+
+.endm
+
+.macro COPY_F1
+// Unit-stride step: copy 1 float, post-incrementing both pointers.
+        fldmias X!, { s0 }
+        fstmias Y!, { s0 }
+
+.endm
+
+
+/*************************************************************************************************************************/
+
+.macro COPY_S4
+// Strided step: copy 4 floats one at a time, advancing X by INC_X and Y by INC_Y (byte strides).
+        nop
+        fldmias X, { s0 }
+        fstmias Y, { s0 }
+        add     X, X, INC_X
+        add     Y, Y, INC_Y
+
+        fldmias X, { s1 }
+        fstmias Y, { s1 }
+        add     X, X, INC_X
+        add     Y, Y, INC_Y
+
+        fldmias X, { s0 }
+        fstmias Y, { s0 }
+        add     X, X, INC_X
+        add     Y, Y, INC_Y
+
+        fldmias X, { s1 }
+        fstmias Y, { s1 }
+        add     X, X, INC_X
+        add     Y, Y, INC_Y
+
+.endm
+
+
+.macro COPY_S1
+// Strided step: copy 1 float, then advance X by INC_X and Y by INC_Y.
+        fldmias X, { s0 }
+        fstmias Y, { s0 }
+        add     X, X, INC_X
+        add     Y, Y, INC_Y
+
+.endm
+// Kernel body: returns immediately when N <= 0 or either increment is 0. When both increments are 1 it uses
+// the F (unit-stride) loops; otherwise the increments are scaled to bytes (<< 2) and the S (strided) loops run.
+// r0 is set to 0 on return.
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+        PROLOGUE
+
+        .align 5
+
+        push    {r4 - r9, fp}
+        add     fp, sp, #24
+        sub     sp, sp, #STACKSIZE                      // reserve stack
+
+        sub     r4, fp, #128
+        vstm    r4, { s8 - s15}                         // store floating point registers
+
+        mov     Y, OLD_Y
+        ldr     INC_Y, OLD_INC_Y
+
+        cmp     N, #0
+        ble     scopy_kernel_L999
+
+        cmp     INC_X, #0
+        beq     scopy_kernel_L999
+
+        cmp     INC_Y, #0
+        beq     scopy_kernel_L999
+
+        cmp     INC_X, #1
+        bne     scopy_kernel_S_BEGIN
+
+        cmp     INC_Y, #1
+        bne     scopy_kernel_S_BEGIN
+
+scopy_kernel_F_BEGIN:
+
+        asrs    I, N, #3                                // I = N / 8
+        ble     scopy_kernel_F1
+
+scopy_kernel_F8:
+
+        COPY_F8
+
+        subs    I, I, #1
+        bne     scopy_kernel_F8
+
+scopy_kernel_F1:
+// NOTE(review): ands does not write V, so this ble relies on V being clear from the preceding cmp/subs.
+        ands    I, N, #7                                // I = N % 8
+        ble     scopy_kernel_L999
+
+scopy_kernel_F10:
+
+        COPY_F1
+
+        subs    I, I, #1
+        bne     scopy_kernel_F10
+
+        b       scopy_kernel_L999
+
+scopy_kernel_S_BEGIN:
+
+        lsl     INC_X, INC_X, #2                        // INC_X * SIZE
+        lsl     INC_Y, INC_Y, #2                        // INC_Y * SIZE
+
+        asrs    I, N, #2                                // I = N / 4
+        ble     scopy_kernel_S1
+
+scopy_kernel_S4:
+
+        COPY_S4
+
+        subs    I, I, #1
+        bne     scopy_kernel_S4
+
+scopy_kernel_S1:
+
+        ands    I, N, #3                                // I = N % 4
+        ble     scopy_kernel_L999
+
+scopy_kernel_S10:
+
+        COPY_S1
+
+        subs    I, I, #1
+        bne     scopy_kernel_S10
+
+
+
+
+
+
+scopy_kernel_L999:
+
+        sub     r3, fp, #128
+        vldm    r3, { s8 - s15}                         // restore floating point registers
+
+        mov     r0, #0                                  // set return value
+        sub     sp, fp, #24
+        pop     {r4 - r9, fp}
+        bx      lr
+
+        EPILOGUE
+
diff --git a/kernel/arm/zcopy_vfpv3.S b/kernel/arm/zcopy_vfpv3.S
new file mode 100644
index 000000000..06f892446
--- /dev/null
+++ b/kernel/arm/zcopy_vfpv3.S
@@ -0,0 +1,223 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2.
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/07 Saar
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+*
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define N r0
+#define X r1
+#define INC_X r2
+#define OLD_Y r3
+// Incoming arguments: r0 = N, r1 = X, r2 = INC_X, r3 = Y; INC_Y is the first stack argument.
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* registers
+*******************************************************/
+
+#define OLD_INC_Y [fp, #4 ]
+
+#define I r5
+#define Y r6
+#define INC_Y r7
+
+#define X_PRE 256
+// X_PRE: byte offset ahead of X that COPY_F4 prefetches with pld (at X_PRE and X_PRE+32).
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+.macro COPY_F4
+// Unit-stride step: copy 8 doubles (4 complex elements) from X to Y, post-incrementing both pointers.
+        pld     [ X, #X_PRE ]
+        pld     [ X, #X_PRE+32 ]
+        fldmiad X!, { d0 - d7 }
+        fstmiad Y!, { d0 - d7 }
+
+.endm
+
+.macro COPY_F1
+// Unit-stride step: copy 2 doubles (1 complex element), post-incrementing both pointers.
+        fldmiad X!, { d0 - d1 }
+        fstmiad Y!, { d0 - d1 }
+
+.endm
+
+
+/*************************************************************************************************************************/
+
+.macro COPY_S4
+// Strided step: copy 4 complex elements one at a time, advancing X by INC_X and Y by INC_Y (byte strides).
+        nop
+        fldmiad X, { d0 - d1 }
+        fstmiad Y, { d0 - d1 }
+        add     X, X, INC_X
+        add     Y, Y, INC_Y
+
+        fldmiad X, { d2 - d3 }
+        fstmiad Y, { d2 - d3 }
+        add     X, X, INC_X
+        add     Y, Y, INC_Y
+
+        fldmiad X, { d0 - d1 }
+        fstmiad Y, { d0 - d1 }
+        add     X, X, INC_X
+        add     Y, Y, INC_Y
+
+        fldmiad X, { d2 - d3 }
+        fstmiad Y, { d2 - d3 }
+        add     X, X, INC_X
+        add     Y, Y, INC_Y
+
+.endm
+
+
+.macro COPY_S1
+// Strided step: copy 1 complex element, then advance X by INC_X and Y by INC_Y.
+        fldmiad X, { d0 - d1 }
+        fstmiad Y, { d0 - d1 }
+        add     X, X, INC_X
+        add     Y, Y, INC_Y
+
+.endm
+// Kernel body: returns immediately when N <= 0 or either increment is 0. When both increments are 1 it uses
+// the F (unit-stride) loops; otherwise the increments are scaled to bytes (<< 4) and the S (strided) loops run.
+// r0 is set to 0 on return.
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+        PROLOGUE
+
+        .align 5
+
+        push    {r4 - r9, fp}
+        add     fp, sp, #24
+        sub     sp, sp, #STACKSIZE                      // reserve stack
+
+        sub     r4, fp, #128
+        vstm    r4, { d8 - d15}                         // store floating point registers
+
+        mov     Y, OLD_Y
+        ldr     INC_Y, OLD_INC_Y
+
+        cmp     N, #0
+        ble     zcopy_kernel_L999
+
+        cmp     INC_X, #0
+        beq     zcopy_kernel_L999
+
+        cmp     INC_Y, #0
+        beq     zcopy_kernel_L999
+
+        cmp     INC_X, #1
+        bne     zcopy_kernel_S_BEGIN
+
+        cmp     INC_Y, #1
+        bne     zcopy_kernel_S_BEGIN
+
+zcopy_kernel_F_BEGIN:
+
+        asrs    I, N, #2                                // I = N / 4
+        ble     zcopy_kernel_F1
+
+zcopy_kernel_F4:
+
+        COPY_F4
+
+        subs    I, I, #1
+        bne     zcopy_kernel_F4
+
+zcopy_kernel_F1:
+// NOTE(review): ands does not write V, so this ble relies on V being clear from the preceding cmp/subs.
+        ands    I, N, #3                                // I = N % 4
+        ble     zcopy_kernel_L999
+
+zcopy_kernel_F10:
+
+        COPY_F1
+
+        subs    I, I, #1
+        bne     zcopy_kernel_F10
+
+        b       zcopy_kernel_L999
+
+zcopy_kernel_S_BEGIN:
+
+        lsl     INC_X, INC_X, #4                        // INC_X * SIZE * 2
+        lsl     INC_Y, INC_Y, #4                        // INC_Y * SIZE * 2
+
+        asrs    I, N, #2                                // I = N / 4
+        ble     zcopy_kernel_S1
+
+zcopy_kernel_S4:
+
+        COPY_S4
+
+        subs    I, I, #1
+        bne     zcopy_kernel_S4
+
+zcopy_kernel_S1:
+
+        ands    I, N, #3                                // I = N % 4
+        ble     zcopy_kernel_L999
+
+zcopy_kernel_S10:
+
+        COPY_S1
+
+        subs    I, I, #1
+        bne     zcopy_kernel_S10
+
+
+
+
+
+
+zcopy_kernel_L999:
+
+        sub     r3, fp, #128
+        vldm    r3, { d8 - d15}                         // restore floating point registers
+
+        mov     r0, #0                                  // set return value
+        sub     sp, fp, #24
+        pop     {r4 - r9, fp}
+        bx      lr
+
+        EPILOGUE
+
From c8f1aeb1549c723ac33124b8cd66522881300e47 Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Thu, 7 Nov 2013 17:22:03 +0100
Subject: [PATCH 29/81] added optimized blas level1 dot kernels for single and double precision
---
 kernel/arm/KERNEL.ARMV7 | 29 ++--
 kernel/arm/ddot_vfpv3.S | 238 +++++++++++++++++++++++++++
 kernel/arm/sdot_vfpv3.S | 347 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 597 insertions(+), 17 deletions(-)
 create mode 100644 kernel/arm/ddot_vfpv3.S
 create mode 100644 kernel/arm/sdot_vfpv3.S

diff --git a/kernel/arm/KERNEL.ARMV7
b/kernel/arm/KERNEL.ARMV7 index 10bc4620a..7387be1a2 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -45,13 +45,13 @@ DAXPYKERNEL = ../arm/axpy.c CAXPYKERNEL = ../arm/zaxpy.c ZAXPYKERNEL = ../arm/zaxpy.c -SCOPYKERNEL = ../arm/copy.c -DCOPYKERNEL = ../arm/copy.c -CCOPYKERNEL = ../arm/zcopy.c -ZCOPYKERNEL = ../arm/zcopy.c +SCOPYKERNEL = scopy_vfpv3.S +DCOPYKERNEL = dcopy_vfpv3.S +CCOPYKERNEL = ccopy_vfpv3.S +ZCOPYKERNEL = zcopy_vfpv3.S -SDOTKERNEL = ../arm/dot.c -DDOTKERNEL = ../arm/dot.c +SDOTKERNEL = sdot_vfpv3.S +DDOTKERNEL = ddot_vfpv3.S CDOTKERNEL = ../arm/zdot.c ZDOTKERNEL = ../arm/zdot.c @@ -108,15 +108,15 @@ DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S CGEMMONCOPY = cgemm_ncopy_2_vfpv3.S -CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMOTCOPY = cgemm_tcopy_2_vfpv3.S +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S ZGEMMONCOPY = zgemm_ncopy_2_vfpv3.S -ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMOTCOPY = zgemm_tcopy_2_vfpv3.S +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c @@ -138,9 +138,4 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -CGEMM3MKERNEL = zgemm3m_kernel_8x4_core2.S -ZGEMM3MKERNEL = zgemm3m_kernel_4x4_core2.S - - - diff --git a/kernel/arm/ddot_vfpv3.S b/kernel/arm/ddot_vfpv3.S new file mode 100644 index 000000000..12d9e218b --- /dev/null +++ b/kernel/arm/ddot_vfpv3.S @@ -0,0 +1,238 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/07 Saar
+* 	 BLASTEST 		: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+* ddot kernel: returns sum( X[i] * Y[i] ), i = 0 .. N-1, as a double in d0.
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define	N	r0		/* number of elements */
+#define	X	r1		/* pointer to the first vector */
+#define	INC_X	r2		/* stride of X in elements; scaled to bytes on the strided path */
+#define	OLD_Y	r3		/* pointer to the second vector as passed in */
+
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* registers
+*******************************************************/
+
+#define OLD_INC_Y	[fp, #4 ]	/* first word above the seven pushed registers */
+
+#define I	r5		/* loop counter */
+#define Y	r6		/* working pointer into the second vector */
+#define INC_Y	r7		/* stride of Y, loaded from OLD_INC_Y */
+
+#define X_PRE	256		/* prefetch distance in bytes */
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+.macro KERNEL_F4
+// Contiguous path: four products, alternating between the accumulators d0 and d1.
+	pld	[ X, #X_PRE  ]
+	pld	[ Y, #X_PRE  ]
+	fldmiad	X!, { d8 - d9 }
+	fldmiad	Y!, { d4 - d5}
+	fmacd   d0  , d4,  d8
+	fldmiad	X!, { d10 - d11 }
+	fmacd   d1  , d5,  d9
+	fldmiad	Y!, { d6 - d7 }
+	fmacd   d0  , d6,  d10
+	fmacd   d1  , d7,  d11
+
+.endm
+
+.macro KERNEL_F1
+// Contiguous path: one product accumulated into d0.
+	fldmiad	X!, { d4 }
+	fldmiad	Y!, { d8 }
+	fmacd   d0  , d4,  d8
+
+.endm
+
+
+/*************************************************************************************************************************/
+
+.macro KERNEL_S4
+// Strided path: four products, stepping X and Y by their byte strides after each load pair.
+	nop
+	fldmiad	X,  { d4 }
+	fldmiad	Y,  { d8 }
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+	fmacd   d0  , d4,  d8
+
+	fldmiad	X,  { d5 }
+	fldmiad	Y,  { d9 }
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+	fmacd   d1  , d5,  d9
+
+	fldmiad	X,  { d6 }
+	fldmiad	Y,  { d10 }
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+	fmacd   d0  , d6,  d10
+
+	fldmiad	X,  { d7 }
+	fldmiad	Y,  { d11 }
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+	fmacd   d1  , d7,  d11
+
+.endm
+
+
+.macro KERNEL_S1
+// Strided path: one product accumulated into d0.
+	fldmiad	X,  { d4 }
+	fldmiad	Y,  { d8 }
+	add	X, X, INC_X
+	fmacd   d0  , d4,  d8
+	add	Y, Y, INC_Y
+
+.endm
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+	PROLOGUE
+
+	.align 5
+
+	push	{r4 - r9, fp}
+	add	fp, sp, #24					// fp -> saved fp, the highest of the 7 pushed words
+	sub	sp, sp, #STACKSIZE				// reserve stack
+
+	sub	r4, fp, #128
+	vstm	r4, { d8 - d15} 				// store floating point registers
+
+	mov	Y, OLD_Y
+	ldr    INC_Y, OLD_INC_Y
+	mov	r4, #0						// r4 is free again once the vstm above is done
+	vmov	d0, r4, r4					// d0 = +0.0; d0 - d0 would be NaN if d0 held NaN or Inf
+	vmov	d1, r4, r4					// d1 = +0.0
+
+	cmp	N, #0
+	ble	ddot_kernel_L999				// N <= 0: result is 0.0
+
+	cmp	INC_X, #0
+	beq	ddot_kernel_L999				// zero stride on X: result is 0.0
+
+	cmp	INC_Y, #0
+	beq	ddot_kernel_L999				// zero stride on Y: result is 0.0
+
+	cmp	INC_X, #1
+	bne	ddot_kernel_S_BEGIN				// non-unit stride on X: strided path
+
+	cmp	INC_Y, #1
+	bne	ddot_kernel_S_BEGIN				// non-unit stride on Y: strided path
+
+ddot_kernel_F_BEGIN:
+
+	asrs	I, N, #2					// I = N / 4
+	ble	ddot_kernel_F1
+
+ddot_kernel_F4:
+
+	KERNEL_F4
+
+	subs	I, I, #1
+	bne	ddot_kernel_F4
+
+ddot_kernel_F1:
+
+	ands	I, N, #3					// I = N % 4, the leftover elements
+	ble	ddot_kernel_L999
+
+ddot_kernel_F10:
+
+	KERNEL_F1
+
+	subs    I, I, #1
+	bne     ddot_kernel_F10
+
+	b	ddot_kernel_L999
+
+ddot_kernel_S_BEGIN:
+
+	lsl	INC_X, INC_X, #3				// INC_X * SIZE
+	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE
+
+	asrs	I, N, #2					// I = N / 4
+	ble	ddot_kernel_S1
+
+ddot_kernel_S4:
+
+	KERNEL_S4
+
+	subs	I, I, #1
+	bne	ddot_kernel_S4
+
+ddot_kernel_S1:
+
+	ands	I, N, #3					// I = N % 4, the leftover elements
+	ble	ddot_kernel_L999
+
+ddot_kernel_S10:
+
+	KERNEL_S1
+
+	subs    I, I, #1
+	bne     ddot_kernel_S10
+
+
+
+
+
+
+ddot_kernel_L999:
+
+	sub	r3, fp, #128
+	vldm	r3, { d8 - d15}					// restore floating point registers
+
+	vadd.f64	d0 , d0, d1				// set return value
+	sub	sp, fp, #24					// drop the reserved stack area
+	pop	{r4 - r9, fp}
+	bx	lr
+
+	EPILOGUE
+
diff --git a/kernel/arm/sdot_vfpv3.S b/kernel/arm/sdot_vfpv3.S
new file mode 100644
index 000000000..164387482
--- /dev/null
+++ b/kernel/arm/sdot_vfpv3.S
@@ -0,0 +1,347 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/07 Saar
+* 	 BLASTEST 		: OK
+* 	 CTEST			: OK (no test for dsdot)
+* 	 TEST			: OK (no test for dsdot)
+* sdot kernel: sum( X[i] * Y[i] ) of floats, in s0; with DSDOT defined, summed in double, in d0.
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define	N	r0		/* number of elements */
+#define	X	r1		/* pointer to the first vector */
+#define	INC_X	r2		/* stride of X in elements; scaled to bytes on the strided path */
+#define	OLD_Y	r3		/* pointer to the second vector as passed in */
+
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* registers
+*******************************************************/
+
+#define OLD_INC_Y	[fp, #4 ]	/* first word above the seven pushed registers */
+
+#define I	r5		/* loop counter */
+#define Y	r6		/* working pointer into the second vector */
+#define INC_Y	r7		/* stride of Y, loaded from OLD_INC_Y */
+
+#define X_PRE	256		/* prefetch distance in bytes (not referenced by the macros below) */
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+#if defined(DSDOT)
+
+.macro KERNEL_F4
+// DSDOT, contiguous: four products; both operands are widened before the multiply.
+	fldmias	X!, { s14 }
+	fldmias	Y!, { s15 }
+	vcvt.f64.f32	d4, s14				// widen x
+	vcvt.f64.f32	d5, s15				// widen y
+	fmacd	d0 , d4, d5				// d0 += x * y in double precision
+
+	fldmias	X!, { s14 }
+	fldmias	Y!, { s15 }
+	vcvt.f64.f32	d4, s14
+	vcvt.f64.f32	d5, s15
+	fmacd	d0 , d4, d5
+
+	fldmias	X!, { s14 }
+	fldmias	Y!, { s15 }
+	vcvt.f64.f32	d4, s14
+	vcvt.f64.f32	d5, s15
+	fmacd	d0 , d4, d5
+
+	fldmias	X!, { s14 }
+	fldmias	Y!, { s15 }
+	vcvt.f64.f32	d4, s14
+	vcvt.f64.f32	d5, s15
+	fmacd	d0 , d4, d5
+
+.endm
+
+.macro KERNEL_F1
+// DSDOT, contiguous: one product accumulated in double precision.
+	fldmias	X!, { s14 }
+	fldmias	Y!, { s15 }
+	vcvt.f64.f32	d4, s14
+	vcvt.f64.f32	d5, s15
+	fmacd	d0 , d4, d5
+
+.endm
+
+
+.macro KERNEL_S4
+// DSDOT, strided: four products, stepping X and Y by their byte strides after each one.
+	nop
+
+	fldmias	X,  { s14 }
+	fldmias	Y,  { s15 }
+	vcvt.f64.f32	d4, s14
+	vcvt.f64.f32	d5, s15
+	fmacd	d0 , d4, d5
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+	fldmias	X,  { s14 }
+	fldmias	Y,  { s15 }
+	vcvt.f64.f32	d4, s14
+	vcvt.f64.f32	d5, s15
+	fmacd	d0 , d4, d5
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+	fldmias	X,  { s14 }
+	fldmias	Y,  { s15 }
+	vcvt.f64.f32	d4, s14
+	vcvt.f64.f32	d5, s15
+	fmacd	d0 , d4, d5
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+	fldmias	X,  { s14 }
+	fldmias	Y,  { s15 }
+	vcvt.f64.f32	d4, s14
+	vcvt.f64.f32	d5, s15
+	fmacd	d0 , d4, d5
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+.endm
+
+
+.macro KERNEL_S1
+// DSDOT, strided: one product accumulated in double precision.
+	fldmias	X,  { s14 }
+	fldmias	Y,  { s15 }
+	vcvt.f64.f32	d4, s14
+	vcvt.f64.f32	d5, s15
+	fmacd	d0 , d4, d5
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+.endm
+
+
+
+#else
+
+.macro KERNEL_F4
+// Contiguous path: four products, alternating between the accumulators s0 and s1.
+	fldmias	X!, { s8 - s9 }
+	fldmias	Y!, { s4 - s5}
+	fmacs   s0  , s4,  s8
+	fldmias	X!, { s10 - s11 }
+	fmacs   s1  , s5,  s9
+	fldmias	Y!, { s6 - s7 }
+	fmacs   s0  , s6,  s10
+	fmacs   s1  , s7,  s11
+
+.endm
+
+.macro KERNEL_F1
+// Contiguous path: one product accumulated into s0.
+	fldmias	X!, { s4 }
+	fldmias	Y!, { s8 }
+	fmacs   s0  , s4,  s8
+
+.endm
+
+
+.macro KERNEL_S4
+// Strided path: four products, stepping X and Y by their byte strides after each load pair.
+	nop
+	fldmias	X,  { s4 }
+	fldmias	Y,  { s8 }
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+	fmacs   s0  , s4,  s8
+
+	fldmias	X,  { s5 }
+	fldmias	Y,  { s9 }
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+	fmacs   s1  , s5,  s9
+
+	fldmias	X,  { s6 }
+	fldmias	Y,  { s10 }
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+	fmacs   s0  , s6,  s10
+
+	fldmias	X,  { s7 }
+	fldmias	Y,  { s11 }
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+	fmacs   s1  , s7,  s11
+
+.endm
+
+
+.macro KERNEL_S1
+// Strided path: one product accumulated into s0.
+	fldmias	X,  { s4 }
+	fldmias	Y,  { s8 }
+	add	X, X, INC_X
+	fmacs   s0  , s4,  s8
+	add	Y, Y, INC_Y
+
+.endm
+
+#endif
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+	PROLOGUE
+
+	.align 5
+
+	push	{r4 - r9, fp}
+	add	fp, sp, #24					// fp -> saved fp, the highest of the 7 pushed words
+	sub	sp, sp, #STACKSIZE				// reserve stack
+
+	sub	r4, fp, #128
+	vstm	r4, { s8 - s15 } 				// store floating point registers
+
+	mov	Y, OLD_Y
+	ldr    INC_Y, OLD_INC_Y
+	mov	r4, #0						// zero source; x - x would be NaN for a stale NaN or Inf
+#if defined(DSDOT)
+
+	vmov	d0, r4, r4					// d0 = +0.0
+	vmov	d1, r4, r4					// d1 = +0.0
+
+#else
+
+	vmov	s0, r4						// s0 = +0.0
+	vmov	s1, r4						// s1 = +0.0
+
+#endif
+
+	cmp	N, #0
+	ble	sdot_kernel_L999				// N <= 0: result is 0.0
+
+	cmp	INC_X, #0
+	beq	sdot_kernel_L999				// zero stride on X: result is 0.0
+
+	cmp	INC_Y, #0
+	beq	sdot_kernel_L999				// zero stride on Y: result is 0.0
+
+	cmp	INC_X, #1
+	bne	sdot_kernel_S_BEGIN				// non-unit stride on X: strided path
+
+	cmp	INC_Y, #1
+	bne	sdot_kernel_S_BEGIN				// non-unit stride on Y: strided path
+
+sdot_kernel_F_BEGIN:
+
+	asrs	I, N, #2					// I = N / 4
+	ble	sdot_kernel_F1
+
+sdot_kernel_F4:
+
+	KERNEL_F4
+
+	subs	I, I, #1
+	bne	sdot_kernel_F4
+
+sdot_kernel_F1:
+
+	ands	I, N, #3					// I = N % 4, the leftover elements
+	ble	sdot_kernel_L999
+
+sdot_kernel_F10:
+
+	KERNEL_F1
+
+	subs    I, I, #1
+	bne     sdot_kernel_F10
+
+	b	sdot_kernel_L999
+
+sdot_kernel_S_BEGIN:
+
+	lsl	INC_X, INC_X, #2				// INC_X * SIZE
+	lsl	INC_Y, INC_Y, #2				// INC_Y * SIZE
+
+	asrs	I, N, #2					// I = N / 4
+	ble	sdot_kernel_S1
+
+sdot_kernel_S4:
+
+	KERNEL_S4
+
+	subs	I, I, #1
+	bne	sdot_kernel_S4
+
+sdot_kernel_S1:
+
+	ands	I, N, #3					// I = N % 4, the leftover elements
+	ble	sdot_kernel_L999
+
+sdot_kernel_S10:
+
+	KERNEL_S1
+
+	subs    I, I, #1
+	bne     sdot_kernel_S10
+
+
+
+
+
+
+sdot_kernel_L999:
+
+	sub	r3, fp, #128
+	vldm	r3, { s8 - s15}					// restore floating point registers
+
+#if defined(DSDOT)
+
+	vadd.f64	d0 , d0, d1				// set return value
+
+#else
+
+	vadd.f32	s0 , s0, s1				// set return value
+
+#endif
+	sub	sp, fp, #24					// drop the reserved stack area
+	pop	{r4 - r9, fp}
+	bx	lr
+
+	EPILOGUE
+
From 5b36cc0f4725e4cff6f818dd7de4690a9a28c15a Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Fri, 8 Nov 2013 09:08:11 +0100
Subject: [PATCH 30/81] added blas level1 dot kernels for complex and double complex

---
 kernel/arm/KERNEL.ARMV7 | 4 +-
 kernel/arm/cdot_vfpv3.S | 284 +++++++++++++++++++++++++++++++++++++++
 kernel/arm/zdot_vfpv3.S | 286 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 572 insertions(+), 2 deletions(-)
 create mode 100644 kernel/arm/cdot_vfpv3.S
 create mode 100644 kernel/arm/zdot_vfpv3.S

diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7
index 7387be1a2..a6b1d67d2 100644
--- a/kernel/arm/KERNEL.ARMV7
+++ b/kernel/arm/KERNEL.ARMV7
@@ -52,8 +52,8 @@ ZCOPYKERNEL = zcopy_vfpv3.S
 
 SDOTKERNEL = sdot_vfpv3.S
 DDOTKERNEL = ddot_vfpv3.S
-CDOTKERNEL = ../arm/zdot.c
-ZDOTKERNEL =
../arm/zdot.c +CDOTKERNEL = cdot_vfpv3.S +ZDOTKERNEL = zdot_vfpv3.S SNRM2KERNEL = ../arm/nrm2.c DNRM2KERNEL = ../arm/nrm2.c diff --git a/kernel/arm/cdot_vfpv3.S b/kernel/arm/cdot_vfpv3.S new file mode 100644 index 000000000..261808916 --- /dev/null +++ b/kernel/arm/cdot_vfpv3.S @@ -0,0 +1,284 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/08 Saar
+* 	 BLASTEST 		: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+* cdot kernel: complex float dot product, real part in s0, imaginary part in s1.
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define	N	r0		/* number of complex elements */
+#define	X	r1		/* pointer to the first vector */
+#define	INC_X	r2		/* stride of X in elements; scaled to bytes on the strided path */
+#define	OLD_Y	r3		/* pointer to the second vector as passed in */
+
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* registers
+*******************************************************/
+
+#define OLD_INC_Y	[fp, #4 ]	/* first word above the seven pushed registers */
+
+#define I	r5		/* loop counter */
+#define Y	r6		/* working pointer into the second vector */
+#define INC_Y	r7		/* stride of Y, loaded from OLD_INC_Y */
+
+#define X_PRE	256		/* prefetch distance in bytes */
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+.macro KERNEL_F4
+// Contiguous path, four elements: s0 += xr*yr, s1 += xr*yi, s2 += xi*yi, s3 += xi*yr.
+	pld	[ X, #X_PRE  ]
+	pld	[ Y, #X_PRE  ]
+
+	fldmias	X!, { s4 - s5 }
+	fldmias	Y!, { s8 - s9 }
+	fmacs   s0  , s4,  s8
+	fmacs   s1  , s4,  s9
+	fldmias	X!, { s6 - s7 }
+	fmacs   s2  , s5,  s9
+	fmacs   s3  , s5,  s8
+
+	fldmias	Y!, { s10 - s11 }
+	fmacs   s0  , s6,  s10
+	fmacs   s1  , s6,  s11
+	fmacs   s2  , s7,  s11
+	fmacs   s3  , s7,  s10
+
+
+	fldmias	X!, { s4 - s5 }
+	fldmias	Y!, { s8 - s9 }
+	fmacs   s0  , s4,  s8
+	fmacs   s1  , s4,  s9
+	fldmias	X!, { s6 - s7 }
+	fmacs   s2  , s5,  s9
+	fmacs   s3  , s5,  s8
+
+	fldmias	Y!, { s10 - s11 }
+	fmacs   s0  , s6,  s10
+	fmacs   s1  , s6,  s11
+	fmacs   s2  , s7,  s11
+	fmacs   s3  , s7,  s10
+
+.endm
+
+.macro KERNEL_F1
+// Contiguous path, one element: the same four partial products as KERNEL_F4.
+	fldmias	X!, { s4 - s5 }
+	fldmias	Y!, { s8 - s9 }
+	fmacs   s0  , s4,  s8
+	fmacs   s1  , s4,  s9
+	fmacs   s2  , s5,  s9
+	fmacs   s3  , s5,  s8
+
+.endm
+
+
+/*************************************************************************************************************************/
+
+.macro KERNEL_S4
+// Strided path, four elements, stepping X and Y by their byte strides after each one.
+	nop
+
+	fldmias	X,  { s4 - s5 }
+	fldmias	Y,  { s8 - s9 }
+	fmacs   s0  , s4,  s8
+	fmacs   s1  , s4,  s9
+	fmacs   s2  , s5,  s9
+	fmacs   s3  , s5,  s8
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+	fldmias	X,  { s4 - s5 }
+	fldmias	Y,  { s8 - s9 }
+	fmacs   s0  , s4,  s8
+	fmacs   s1  , s4,  s9
+	fmacs   s2  , s5,  s9
+	fmacs   s3  , s5,  s8
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+	fldmias	X,  { s4 - s5 }
+	fldmias	Y,  { s8 - s9 }
+	fmacs   s0  , s4,  s8
+	fmacs   s1  , s4,  s9
+	fmacs   s2  , s5,  s9
+	fmacs   s3  , s5,  s8
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+	fldmias	X,  { s4 - s5 }
+	fldmias	Y,  { s8 - s9 }
+	fmacs   s0  , s4,  s8
+	fmacs   s1  , s4,  s9
+	fmacs   s2  , s5,  s9
+	fmacs   s3  , s5,  s8
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+.endm
+
+
+.macro KERNEL_S1
+// Strided path, one element.
+	fldmias	X,  { s4 - s5 }
+	fldmias	Y,  { s8 - s9 }
+	fmacs   s0  , s4,  s8
+	fmacs   s1  , s4,  s9
+	fmacs   s2  , s5,  s9
+	fmacs   s3  , s5,  s8
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+.endm
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+	PROLOGUE
+
+	.align 5
+
+	push	{r4 - r9, fp}
+	add	fp, sp, #24					// fp -> saved fp, the highest of the 7 pushed words
+	sub	sp, sp, #STACKSIZE				// reserve stack
+
+	sub	r4, fp, #128
+	vstm	r4, { s8 - s15} 				// store floating point registers
+
+	mov	Y, OLD_Y
+	ldr    INC_Y, OLD_INC_Y
+	mov	r4, #0						// zero source; x - x would be NaN for a stale NaN or Inf
+	vmov	s0, r4						// s0 = +0.0
+	vmov	s1, r4						// s1 = +0.0
+	vmov	s2, r4						// s2 = +0.0
+	vmov	s3, r4						// s3 = +0.0
+
+	cmp	N, #0
+	ble	cdot_kernel_L999				// N <= 0: result is 0
+
+	cmp	INC_X, #0
+	beq	cdot_kernel_L999				// zero stride on X: result is 0
+
+	cmp	INC_Y, #0
+	beq	cdot_kernel_L999				// zero stride on Y: result is 0
+
+	cmp	INC_X, #1
+	bne	cdot_kernel_S_BEGIN				// non-unit stride on X: strided path
+
+	cmp	INC_Y, #1
+	bne	cdot_kernel_S_BEGIN				// non-unit stride on Y: strided path
+
+cdot_kernel_F_BEGIN:
+
+	asrs	I, N, #2					// I = N / 4
+	ble	cdot_kernel_F1
+
+cdot_kernel_F4:
+
+	KERNEL_F4
+
+	subs	I, I, #1
+	bne	cdot_kernel_F4
+
+cdot_kernel_F1:
+
+	ands	I, N, #3					// I = N % 4, the leftover elements
+	ble	cdot_kernel_L999
+
+cdot_kernel_F10:
+
+	KERNEL_F1
+
+	subs    I, I, #1
+	bne     cdot_kernel_F10
+
+	b	cdot_kernel_L999
+
+cdot_kernel_S_BEGIN:
+
+	lsl	INC_X, INC_X, #3				// INC_X * SIZE * 2
+	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE * 2
+
+	asrs	I, N, #2					// I = N / 4
+	ble	cdot_kernel_S1
+
+cdot_kernel_S4:
+
+	KERNEL_S4
+
+	subs	I, I, #1
+	bne	cdot_kernel_S4
+
+cdot_kernel_S1:
+
+	ands	I, N, #3					// I = N % 4, the leftover elements
+	ble	cdot_kernel_L999
+
+cdot_kernel_S10:
+
+	KERNEL_S1
+
+	subs    I, I, #1
+	bne     cdot_kernel_S10
+
+
+
+cdot_kernel_L999:
+
+	sub	r3, fp, #128
+	vldm	r3, { s8 - s15}					// restore floating point registers
+
+#if !defined(CONJ)
+	vsub.f32	s0 , s0, s2				// real = sum(xr*yr) - sum(xi*yi)
+	vadd.f32	s1 , s1, s3				// imag = sum(xr*yi) + sum(xi*yr)
+#else
+	vadd.f32	s0 , s0, s2				// real = sum(xr*yr) + sum(xi*yi)
+	vsub.f32	s1 , s1, s3				// imag = sum(xr*yi) - sum(xi*yr)
+#endif
+
+	sub	sp, fp, #24					// drop the reserved stack area
+	pop	{r4 - r9, fp}
+	bx	lr
+
+	EPILOGUE
+
diff --git a/kernel/arm/zdot_vfpv3.S b/kernel/arm/zdot_vfpv3.S
new file mode 100644
index 000000000..2aa9171b8
--- /dev/null
+++ b/kernel/arm/zdot_vfpv3.S
@@ -0,0 +1,286 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/08 Saar
+* 	 BLASTEST 		: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+* zdot kernel: complex double dot product, real part in d0, imaginary part in d1.
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define	N	r0		/* number of complex elements */
+#define	X	r1		/* pointer to the first vector */
+#define	INC_X	r2		/* stride of X in elements; scaled to bytes on the strided path */
+#define	OLD_Y	r3		/* pointer to the second vector as passed in */
+
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* registers
+*******************************************************/
+
+#define OLD_INC_Y	[fp, #4 ]	/* first word above the seven pushed registers */
+
+#define I	r5		/* loop counter */
+#define Y	r6		/* working pointer into the second vector */
+#define INC_Y	r7		/* stride of Y, loaded from OLD_INC_Y */
+
+#define X_PRE	256		/* prefetch distance in bytes */
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+.macro KERNEL_F4
+// Contiguous path, four elements: d0 += xr*yr, d1 += xr*yi, d2 += xi*yi, d3 += xi*yr.
+	pld	[ X, #X_PRE  ]
+	pld	[ Y, #X_PRE  ]
+
+	fldmiad	X!, { d4 - d5 }
+	fldmiad	Y!, { d8 - d9 }
+	fmacd   d0  , d4,  d8
+	fmacd   d1  , d4,  d9
+	fldmiad	X!, { d6 - d7 }
+	fmacd   d2  , d5,  d9
+	fmacd   d3  , d5,  d8
+
+	fldmiad	Y!, { d10 - d11 }
+	fmacd   d0  , d6,  d10
+	fmacd   d1  , d6,  d11
+	pld	[ X, #X_PRE  ]
+	fmacd   d2  , d7,  d11
+	fmacd   d3  , d7,  d10
+
+	pld	[ Y, #X_PRE  ]
+
+	fldmiad	X!, { d4 - d5 }
+	fldmiad	Y!, { d8 - d9 }
+	fmacd   d0  , d4,  d8
+	fmacd   d1  , d4,  d9
+	fldmiad	X!, { d6 - d7 }
+	fmacd   d2  , d5,  d9
+	fmacd   d3  , d5,  d8
+
+	fldmiad	Y!, { d10 - d11 }
+	fmacd   d0  , d6,  d10
+	fmacd   d1  , d6,  d11
+	fmacd   d2  , d7,  d11
+	fmacd   d3  , d7,  d10
+
+.endm
+
+.macro KERNEL_F1
+// Contiguous path, one element: the same four partial products as KERNEL_F4.
+	fldmiad	X!, { d4 - d5 }
+	fldmiad	Y!, { d8 - d9 }
+	fmacd   d0  , d4,  d8
+	fmacd   d1  , d4,  d9
+	fmacd   d2  , d5,  d9
+	fmacd   d3  , d5,  d8
+
+.endm
+
+
+/*************************************************************************************************************************/
+
+.macro KERNEL_S4
+// Strided path, four elements, stepping X and Y by their byte strides after each one.
+	nop
+
+	fldmiad	X,  { d4 - d5 }
+	fldmiad	Y,  { d8 - d9 }
+	fmacd   d0  , d4,  d8
+	fmacd   d1  , d4,  d9
+	fmacd   d2  , d5,  d9
+	fmacd   d3  , d5,  d8
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+	fldmiad	X,  { d4 - d5 }
+	fldmiad	Y,  { d8 - d9 }
+	fmacd   d0  , d4,  d8
+	fmacd   d1  , d4,  d9
+	fmacd   d2  , d5,  d9
+	fmacd   d3  , d5,  d8
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+	fldmiad	X,  { d4 - d5 }
+	fldmiad	Y,  { d8 - d9 }
+	fmacd   d0  , d4,  d8
+	fmacd   d1  , d4,  d9
+	fmacd   d2  , d5,  d9
+	fmacd   d3  , d5,  d8
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+	fldmiad	X,  { d4 - d5 }
+	fldmiad	Y,  { d8 - d9 }
+	fmacd   d0  , d4,  d8
+	fmacd   d1  , d4,  d9
+	fmacd   d2  , d5,  d9
+	fmacd   d3  , d5,  d8
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+.endm
+
+
+.macro KERNEL_S1
+// Strided path, one element.
+	fldmiad	X,  { d4 - d5 }
+	fldmiad	Y,  { d8 - d9 }
+	fmacd   d0  , d4,  d8
+	fmacd   d1  , d4,  d9
+	fmacd   d2  , d5,  d9
+	fmacd   d3  , d5,  d8
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+.endm
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+	PROLOGUE
+
+	.align 5
+
+	push	{r4 - r9, fp}
+	add	fp, sp, #24					// fp -> saved fp, the highest of the 7 pushed words
+	sub	sp, sp, #STACKSIZE				// reserve stack
+
+	sub	r4, fp, #128
+	vstm	r4, { d8 - d15} 				// store floating point registers
+
+	mov	Y, OLD_Y
+	ldr    INC_Y, OLD_INC_Y
+	mov	r4, #0						// zero source; x - x would be NaN for a stale NaN or Inf
+	vmov	d0, r4, r4					// d0 = +0.0
+	vmov	d1, r4, r4					// d1 = +0.0
+	vmov	d2, r4, r4					// d2 = +0.0
+	vmov	d3, r4, r4					// d3 = +0.0
+
+	cmp	N, #0
+	ble	zdot_kernel_L999				// N <= 0: result is 0
+
+	cmp	INC_X, #0
+	beq	zdot_kernel_L999				// zero stride on X: result is 0
+
+	cmp	INC_Y, #0
+	beq	zdot_kernel_L999				// zero stride on Y: result is 0
+
+	cmp	INC_X, #1
+	bne	zdot_kernel_S_BEGIN				// non-unit stride on X: strided path
+
+	cmp	INC_Y, #1
+	bne	zdot_kernel_S_BEGIN				// non-unit stride on Y: strided path
+
+zdot_kernel_F_BEGIN:
+
+	asrs	I, N, #2					// I = N / 4
+	ble	zdot_kernel_F1
+
+zdot_kernel_F4:
+
+	KERNEL_F4
+
+	subs	I, I, #1
+	bne	zdot_kernel_F4
+
+zdot_kernel_F1:
+
+	ands	I, N, #3					// I = N % 4, the leftover elements
+	ble	zdot_kernel_L999
+
+zdot_kernel_F10:
+
+	KERNEL_F1
+
+	subs    I, I, #1
+	bne     zdot_kernel_F10
+
+	b	zdot_kernel_L999
+
+zdot_kernel_S_BEGIN:
+
+	lsl	INC_X, INC_X, #4				// INC_X * SIZE * 2
+	lsl	INC_Y, INC_Y, #4				// INC_Y * SIZE * 2
+
+	asrs	I, N, #2					// I = N / 4
+	ble	zdot_kernel_S1
+
+zdot_kernel_S4:
+
+	KERNEL_S4
+
+	subs	I, I, #1
+	bne	zdot_kernel_S4
+
+zdot_kernel_S1:
+
+	ands	I, N, #3					// I = N % 4, the leftover elements
+	ble	zdot_kernel_L999
+
+zdot_kernel_S10:
+
+	KERNEL_S1
+
+	subs    I, I, #1
+	bne     zdot_kernel_S10
+
+
+
+zdot_kernel_L999:
+
+	sub	r3, fp, #128
+	vldm	r3, { d8 - d15}					// restore floating point registers
+
+#if !defined(CONJ)
+	vsub.f64	d0 , d0, d2				// real = sum(xr*yr) - sum(xi*yi)
+	vadd.f64	d1 , d1, d3				// imag = sum(xr*yi) + sum(xi*yr)
+#else
+	vadd.f64	d0 , d0, d2				// real = sum(xr*yr) + sum(xi*yi)
+	vsub.f64	d1 , d1, d3				// imag = sum(xr*yi) - sum(xi*yr)
+#endif
+
+	sub	sp, fp, #24					// drop the reserved stack area
+	pop	{r4 - r9, fp}
+	bx	lr
+
+	EPILOGUE
+
From 00f33c0134da349d9809ff8d9ad334d43381b91e Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Mon, 11 Nov 2013 14:20:59 +0100
Subject: [PATCH 31/81] added asum_kernel for all precisions and complex

---
 kernel/arm/KERNEL.ARMV7 | 8 +-
 kernel/arm/asum_vfpv3.S | 481 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 485 insertions(+), 4 deletions(-)
 create mode 100644 kernel/arm/asum_vfpv3.S

diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7
index a6b1d67d2..10db665b2 100644
--- a/kernel/arm/KERNEL.ARMV7
+++ b/kernel/arm/KERNEL.ARMV7
@@ -35,10 +35,10 @@ DSWAPKERNEL = ../arm/swap.c
 CSWAPKERNEL = ../arm/zswap.c
 ZSWAPKERNEL = ../arm/zswap.c
 
-SASUMKERNEL = ../arm/asum.c
-DASUMKERNEL = ../arm/asum.c
-CASUMKERNEL = ../arm/zasum.c
-ZASUMKERNEL = ../arm/zasum.c
+SASUMKERNEL = asum_vfpv3.S
+DASUMKERNEL = asum_vfpv3.S
+CASUMKERNEL = asum_vfpv3.S
+ZASUMKERNEL = asum_vfpv3.S
 
 SAXPYKERNEL = ../arm/axpy.c
DAXPYKERNEL = ../arm/axpy.c diff --git a/kernel/arm/asum_vfpv3.S b/kernel/arm/asum_vfpv3.S new file mode 100644 index 000000000..2b6ceb191 --- /dev/null +++ b/kernel/arm/asum_vfpv3.S @@ -0,0 +1,481 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/11 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 + + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + fldmiad X!, { d4 - d5 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + vabs.f64 d5, d5 + fldmiad X!, { d6 - d7 } + vabs.f64 d6, d6 + vadd.f64 d1 , d1, d5 + vabs.f64 d7, d7 + vadd.f64 d0 , d0, d6 + vadd.f64 d1 , d1, d7 + +.endm + +.macro KERNEL_F1 + + fldmiad X!, { d4 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + +.endm + + +.macro KERNEL_S4 + + fldmiad X, { d4 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + + fldmiad X, { d4 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + + fldmiad X, { d4 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + + fldmiad X, { d4 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + fldmiad X, { d4 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + +.endm + +#else + +.macro KERNEL_F4 + + fldmias X!, { s4 - s5 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + vabs.f32 s5, s5 + fldmias X!, { s6 - s7 } + vabs.f32 s6, s6 + vadd.f32 s1 , s1, s5 + vabs.f32 s7, s7 + vadd.f32 s0 , s0, s6 + vadd.f32 s1 , s1, s7 + +.endm + +.macro KERNEL_F1 + + fldmias X!, { s4 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + +.endm + + +.macro KERNEL_S4 + + fldmias X, { s4 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + add X, 
X, INC_X + + fldmias X, { s4 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + add X, X, INC_X + + fldmias X, { s4 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + add X, X, INC_X + + fldmias X, { s4 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + fldmias X, { s4 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + add X, X, INC_X + +.endm + + +#endif + +#else + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + fldmiad X!, { d4 - d5 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + vabs.f64 d5, d5 + fldmiad X!, { d6 - d7 } + vabs.f64 d6, d6 + vadd.f64 d1 , d1, d5 + vabs.f64 d7, d7 + vadd.f64 d0 , d0, d6 + vadd.f64 d1 , d1, d7 + + pld [ X, #X_PRE ] + fldmiad X!, { d4 - d5 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + vabs.f64 d5, d5 + fldmiad X!, { d6 - d7 } + vabs.f64 d6, d6 + vadd.f64 d1 , d1, d5 + vabs.f64 d7, d7 + vadd.f64 d0 , d0, d6 + vadd.f64 d1 , d1, d7 + + +.endm + +.macro KERNEL_F1 + + fldmiad X!, { d4 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + + fldmiad X!, { d4 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + + +.endm + + +.macro KERNEL_S4 + + fldmiad X, { d4 -d5 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + vabs.f64 d5, d5 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + + fldmiad X, { d4 -d5 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + vabs.f64 d5, d5 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + + fldmiad X, { d4 -d5 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + vabs.f64 d5, d5 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + + fldmiad X, { d4 -d5 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + vabs.f64 d5, d5 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + fldmiad X, { d4 -d5 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + vabs.f64 d5, d5 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + +.endm + +#else + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + fldmias X!, { s4 - s5 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + vabs.f32 s5, s5 + fldmias X!, { s6 - s7 } + vabs.f32 s6, s6 + vadd.f32 s1 , s1, s5 + vabs.f32 s7, s7 + 
vadd.f32 s0 , s0, s6 + vadd.f32 s1 , s1, s7 + + fldmias X!, { s4 - s5 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + vabs.f32 s5, s5 + fldmias X!, { s6 - s7 } + vabs.f32 s6, s6 + vadd.f32 s1 , s1, s5 + vabs.f32 s7, s7 + vadd.f32 s0 , s0, s6 + vadd.f32 s1 , s1, s7 + + +.endm + +.macro KERNEL_F1 + + fldmias X!, { s4 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + + fldmias X!, { s4 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + +.endm + + +.macro KERNEL_S4 + + fldmias X, { s4 -s5 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + vabs.f32 s5, s5 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + + fldmias X, { s4 -s5 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + vabs.f32 s5, s5 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + + fldmias X, { s4 -s5 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + vabs.f32 s5, s5 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + + fldmias X, { s4 -s5 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + vabs.f32 s5, s5 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + fldmias X, { s4 -s5 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + vabs.f32 s5, s5 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + +.endm + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + +#if defined(DOUBLE) + vsub.f64 d0 , d0 , d0 + vsub.f64 d1 , d1 , d1 +#else + vsub.f32 s0 , s0 , s0 + vsub.f32 s1 , s1 , s1 +#endif + + cmp N, #0 + ble asum_kernel_L999 + + cmp INC_X, #0 + beq asum_kernel_L999 + + cmp INC_X, #1 + bne asum_kernel_S_BEGIN + + +asum_kernel_F_BEGIN: + + asrs I, N, #2 // I = N / 4 + ble asum_kernel_F1 + + .align 5 + +asum_kernel_F4: + +#if !defined(DOUBLE) && !defined(COMPLEX) + pld [ X, #X_PRE ] +#endif + KERNEL_F4 + + subs I, I, #1 + ble asum_kernel_F1 + + KERNEL_F4 + + subs I, I, #1 + bne asum_kernel_F4 + +asum_kernel_F1: + + ands I, N, #3 + ble asum_kernel_L999 + +asum_kernel_F10: 
+ + KERNEL_F1 + + subs I, I, #1 + bne asum_kernel_F10 + + b asum_kernel_L999 + +asum_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE +#endif + +#endif + + asrs I, N, #2 // I = N / 4 + ble asum_kernel_S1 + + .align 5 + +asum_kernel_S4: + + KERNEL_S4 + + subs I, I, #1 + bne asum_kernel_S4 + +asum_kernel_S1: + + ands I, N, #3 + ble asum_kernel_L999 + +asum_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne asum_kernel_S10 + + +asum_kernel_L999: + + +#if defined(DOUBLE) + vadd.f64 d0 , d0, d1 // set return value +#else + vadd.f32 s0 , s0, s1 // set return value +#endif + + bx lr + + EPILOGUE + From f750103336dff5ddad6cb70177f69277b24a29a6 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 11 Nov 2013 15:47:56 +0100 Subject: [PATCH 32/81] small optimizations on dot-kernels --- kernel/arm/cdot_vfpv3.S | 4 ++-- kernel/arm/ddot_vfpv3.S | 22 ++++++++++++++++------ kernel/arm/sdot_vfpv3.S | 4 ++-- kernel/arm/zdot_vfpv3.S | 4 ++-- 4 files changed, 22 insertions(+), 12 deletions(-) diff --git a/kernel/arm/cdot_vfpv3.S b/kernel/arm/cdot_vfpv3.S index 261808916..b653888df 100644 --- a/kernel/arm/cdot_vfpv3.S +++ b/kernel/arm/cdot_vfpv3.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/11/08 Saar +* 2013/11/11 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -56,7 +56,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define Y r6 #define INC_Y r7 -#define X_PRE 256 +#define X_PRE 512 /************************************************************************************** * Macro definitions diff --git a/kernel/arm/ddot_vfpv3.S b/kernel/arm/ddot_vfpv3.S index 12d9e218b..ab819ec98 100644 --- a/kernel/arm/ddot_vfpv3.S +++ b/kernel/arm/ddot_vfpv3.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/11/07 Saar +* 2013/11/11 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -56,7 +56,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define Y r6 #define INC_Y r7 -#define X_PRE 256 +#define X_PRE 512 /************************************************************************************** * Macro definitions @@ -65,14 +65,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 pld [ X, #X_PRE ] + fldmiad X!, { d8 } pld [ Y, #X_PRE ] - fldmiad X!, { d8 - d9 } - fldmiad Y!, { d4 - d5} + fldmiad Y!, { d4 } + fldmiad Y!, { d5 } fmacd d0 , d4, d8 - fldmiad X!, { d10 - d11 } + fldmiad X!, { d9 } + fldmiad Y!, { d6 } fmacd d1 , d5, d9 - fldmiad Y!, { d6 - d7 } + fldmiad X!, { d10 } + fldmiad X!, { d11 } fmacd d0 , d6, d10 + fldmiad Y!, { d7 } fmacd d1 , d7, d11 .endm @@ -173,6 +177,12 @@ ddot_kernel_F_BEGIN: ddot_kernel_F4: + KERNEL_F4 + + subs I, I, #1 + ble ddot_kernel_F1 + + KERNEL_F4 subs I, I, #1 diff --git a/kernel/arm/sdot_vfpv3.S b/kernel/arm/sdot_vfpv3.S index 164387482..794e07317 100644 --- a/kernel/arm/sdot_vfpv3.S +++ b/kernel/arm/sdot_vfpv3.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** -* 2013/11/07 Saar +* 2013/11/11 Saar * BLASTEST : OK * CTEST : OK (no test for dsdot) * TEST : OK (no test for dsdot) @@ -56,7 +56,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define Y r6 #define INC_Y r7 -#define X_PRE 256 +#define X_PRE 512 /************************************************************************************** * Macro definitions diff --git a/kernel/arm/zdot_vfpv3.S b/kernel/arm/zdot_vfpv3.S index 2aa9171b8..1a78b5aec 100644 --- a/kernel/arm/zdot_vfpv3.S +++ b/kernel/arm/zdot_vfpv3.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/11/08 Saar +* 2013/11/11 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -56,7 +56,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define Y r6 #define INC_Y r7 -#define X_PRE 256 +#define X_PRE 512 /************************************************************************************** * Macro definitions From 6f4a0ebe38c1772e946d874e00a1b3ce3f83cb6f Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 14 Nov 2013 13:52:47 +0100 Subject: [PATCH 33/81] added max- und min-kernels for all precisions --- kernel/arm/KERNEL.ARMV7 | 48 ++-- kernel/arm/iamax_vfpv3.S | 478 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 502 insertions(+), 24 deletions(-) create mode 100644 kernel/arm/iamax_vfpv3.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 10db665b2..1a8c6a3e5 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -1,34 +1,34 @@ -SAMAXKERNEL = ../arm/amax.c -DAMAXKERNEL = ../arm/amax.c -CAMAXKERNEL = ../arm/zamax.c -ZAMAXKERNEL = ../arm/zamax.c +SAMAXKERNEL = iamax_vfpv3.S +DAMAXKERNEL = iamax_vfpv3.S +CAMAXKERNEL = iamax_vfpv3.S +ZAMAXKERNEL = iamax_vfpv3.S -SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c -CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c +SAMINKERNEL = iamax_vfpv3.S +DAMINKERNEL = iamax_vfpv3.S +CAMINKERNEL = iamax_vfpv3.S +ZAMINKERNEL = iamax_vfpv3.S -SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c +SMAXKERNEL = iamax_vfpv3.S +DMAXKERNEL = iamax_vfpv3.S -SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c +SMINKERNEL = iamax_vfpv3.S +DMINKERNEL = iamax_vfpv3.S -ISAMAXKERNEL = ../arm/iamax.c -IDAMAXKERNEL = ../arm/iamax.c -ICAMAXKERNEL = ../arm/izamax.c -IZAMAXKERNEL = ../arm/izamax.c +ISAMAXKERNEL = iamax_vfpv3.S +IDAMAXKERNEL = iamax_vfpv3.S +ICAMAXKERNEL = iamax_vfpv3.S +IZAMAXKERNEL = iamax_vfpv3.S -ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = ../arm/iamin.c -ICAMINKERNEL = ../arm/izamin.c -IZAMINKERNEL = ../arm/izamin.c +ISAMINKERNEL = iamax_vfpv3.S +IDAMINKERNEL = iamax_vfpv3.S +ICAMINKERNEL = iamax_vfpv3.S +IZAMINKERNEL = iamax_vfpv3.S -ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c +ISMAXKERNEL 
= iamax_vfpv3.S +IDMAXKERNEL = iamax_vfpv3.S -ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c +ISMINKERNEL = iamax_vfpv3.S +IDMINKERNEL = iamax_vfpv3.S SSWAPKERNEL = ../arm/swap.c DSWAPKERNEL = ../arm/swap.c diff --git a/kernel/arm/iamax_vfpv3.S b/kernel/arm/iamax_vfpv3.S new file mode 100644 index 000000000..1d7344898 --- /dev/null +++ b/kernel/arm/iamax_vfpv3.S @@ -0,0 +1,478 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/14 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 +#define INDEX r3 +#define Z r4 + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +#if defined(USE_ABS) + +#if defined(DOUBLE) + +#define VABS(x0,x1) vabs.f64 x0, x1 + +#else + +#define VABS(x0,x1) vabs.f32 x0, x1 + +#endif + +#else + +#define VABS(x0,x1) nop + +#endif + +/*****************************************************************************************/ + +#if defined(USE_MIN) + +#define MOVCOND movlt + +#if defined(DOUBLE) + +#define VMOVCOND vmovlt.f64 + +#else + +#define VMOVCOND vmovlt.f32 + +#endif + +#else + +#define MOVCOND movgt + +#if defined(DOUBLE) + +#define VMOVCOND vmovgt.f64 + +#else + +#define VMOVCOND vmovgt.f32 + +#endif + + +#endif + + +/*****************************************************************************************/ + + + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + +.macro INIT_F + + fldmiad X!, { d0 } + VABS( d0, d0 ) + mov Z, #1 + mov INDEX, Z + +.endm + +.macro KERNEL_F1 + + fldmiad X!, { d4 } + add Z, Z, #1 + VABS( d4, d4 ) + vcmpe.f64 d4, d0 + vmrs APSR_nzcv, fpscr + VMOVCOND d0, d4 + MOVCOND INDEX, Z + +.endm + +.macro INIT_S + + fldmiad X, { d0 } + VABS( d0, d0 ) + mov Z, #1 + mov INDEX, Z + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + fldmiad X, { d4 } + add Z, Z, #1 + VABS( d4, d4 ) + vcmpe.f64 d4, d0 + vmrs APSR_nzcv, fpscr + VMOVCOND d0, d4 + MOVCOND INDEX, Z + add X, X, INC_X + +.endm 
+ +#else + +.macro INIT_F + + fldmias X!, { s0 } + VABS( s0, s0 ) + mov Z, #1 + mov INDEX, Z + +.endm + +.macro KERNEL_F1 + + fldmias X!, { s4 } + add Z, Z, #1 + VABS( s4, s4 ) + vcmpe.f32 s4, s0 + vmrs APSR_nzcv, fpscr + VMOVCOND s0, s4 + MOVCOND INDEX, Z + +.endm + +.macro INIT_S + + fldmias X, { s0 } + VABS( s0, s0 ) + mov Z, #1 + mov INDEX, Z + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + fldmias X, { s4 } + add Z, Z, #1 + VABS( s4, s4 ) + vcmpe.f32 s4, s0 + vmrs APSR_nzcv, fpscr + VMOVCOND s0, s4 + MOVCOND INDEX, Z + add X, X, INC_X + +.endm + + + + +#endif + +#else + +#if defined(DOUBLE) + +.macro INIT_F + + fldmiad X!, { d0 -d1 } + vabs.f64 d0, d0 + vabs.f64 d1, d1 + vadd.f64 d0 , d0, d1 + mov Z, #1 + mov INDEX, Z + +.endm + + +.macro KERNEL_F1 + + fldmiad X!, { d4 - d5 } + add Z, Z, #1 + vabs.f64 d4, d4 + vabs.f64 d5, d5 + vadd.f64 d4 , d4, d5 + vcmpe.f64 d4, d0 + vmrs APSR_nzcv, fpscr + VMOVCOND d0, d4 + MOVCOND INDEX, Z + +.endm + +.macro INIT_S + + fldmiad X, { d0 -d1 } + vabs.f64 d0, d0 + vabs.f64 d1, d1 + vadd.f64 d0 , d0, d1 + mov Z, #1 + mov INDEX, Z + add X, X, INC_X + +.endm + + + +.macro KERNEL_S1 + + fldmiad X, { d4 - d5 } + add Z, Z, #1 + vabs.f64 d4, d4 + vabs.f64 d5, d5 + vadd.f64 d4 , d4, d5 + vcmpe.f64 d4, d0 + vmrs APSR_nzcv, fpscr + VMOVCOND d0, d4 + MOVCOND INDEX, Z + add X, X, INC_X + +.endm + +#else + +.macro INIT_F + + fldmias X!, { s0 -s1 } + vabs.f32 s0, s0 + vabs.f32 s1, s1 + vadd.f32 s0 , s0, s1 + mov Z, #1 + mov INDEX, Z + +.endm + + +.macro KERNEL_F1 + + fldmias X!, { s4 - s5 } + add Z, Z, #1 + vabs.f32 s4, s4 + vabs.f32 s5, s5 + vadd.f32 s4 , s4, s5 + vcmpe.f32 s4, s0 + vmrs APSR_nzcv, fpscr + VMOVCOND s0, s4 + MOVCOND INDEX, Z + +.endm + +.macro INIT_S + + fldmias X, { s0 -s1 } + vabs.f32 s0, s0 + vabs.f32 s1, s1 + vadd.f32 s0 , s0, s1 + mov Z, #1 + mov INDEX, Z + add X, X, INC_X + +.endm + + + +.macro KERNEL_S1 + + fldmias X, { s4 - s5 } + add Z, Z, #1 + vabs.f32 s4, s4 + vabs.f32 s5, s5 + vadd.f32 s4 , s4, s5 + 
vcmpe.f32 s4, s0 + vmrs APSR_nzcv, fpscr + VMOVCOND s0, s4 + MOVCOND INDEX, Z + add X, X, INC_X + +.endm + + + + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4} + +#if defined(DOUBLE) + vsub.f64 d0 , d0 , d0 +#else + vsub.f32 s0 , s0 , s0 +#endif + mov INDEX, #0 + + cmp N, #0 + ble iamax_kernel_L999 + + cmp INC_X, #0 + beq iamax_kernel_L999 + + + cmp INC_X, #1 + bne iamax_kernel_S_BEGIN + + +iamax_kernel_F_BEGIN: + + INIT_F + + subs N, N , #1 + ble iamax_kernel_L999 + + asrs I, N, #2 // I = N / 4 + ble iamax_kernel_F1 + + .align 5 + +iamax_kernel_F4: + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 +#if defined(COMPLEX) && defined(DOUBLE) + pld [ X, #X_PRE ] +#endif + KERNEL_F1 + KERNEL_F1 + + subs I, I, #1 + ble iamax_kernel_F1 + + +#if defined(COMPLEX) || defined(DOUBLE) + pld [ X, #X_PRE ] +#endif + KERNEL_F1 + KERNEL_F1 +#if defined(COMPLEX) && defined(DOUBLE) + pld [ X, #X_PRE ] +#endif + KERNEL_F1 + KERNEL_F1 + + subs I, I, #1 + bne iamax_kernel_F4 + +iamax_kernel_F1: + + ands I, N, #3 + ble iamax_kernel_L999 + +iamax_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne iamax_kernel_F10 + + b iamax_kernel_L999 + +iamax_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE +#endif + +#endif + + INIT_S + + subs N, N , #1 + ble iamax_kernel_L999 + + asrs I, N, #2 // I = N / 4 + ble iamax_kernel_S1 + + .align 5 + +iamax_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne iamax_kernel_S4 + +iamax_kernel_S1: + + ands I, N, #3 + ble iamax_kernel_L999 + +iamax_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + 
bne iamax_kernel_S10 + + +iamax_kernel_L999: + + mov r0, INDEX // set return value + + pop {r4} + bx lr + + EPILOGUE + From 3dabd7e6e60161635e25985ad62d595e57d4ce00 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 14 Nov 2013 19:06:19 +0100 Subject: [PATCH 34/81] added swap-kernel for all precisions --- kernel/arm/KERNEL.ARMV7 | 8 +- kernel/arm/swap_vfpv3.S | 354 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 358 insertions(+), 4 deletions(-) create mode 100644 kernel/arm/swap_vfpv3.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 1a8c6a3e5..8118c5330 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -30,10 +30,10 @@ IDMAXKERNEL = iamax_vfpv3.S ISMINKERNEL = iamax_vfpv3.S IDMINKERNEL = iamax_vfpv3.S -SSWAPKERNEL = ../arm/swap.c -DSWAPKERNEL = ../arm/swap.c -CSWAPKERNEL = ../arm/zswap.c -ZSWAPKERNEL = ../arm/zswap.c +SSWAPKERNEL = swap_vfpv3.S +DSWAPKERNEL = swap_vfpv3.S +CSWAPKERNEL = swap_vfpv3.S +ZSWAPKERNEL = swap_vfpv3.S SASUMKERNEL = asum_vfpv3.S DASUMKERNEL = asum_vfpv3.S diff --git a/kernel/arm/swap_vfpv3.S b/kernel/arm/swap_vfpv3.S new file mode 100644 index 000000000..352875188 --- /dev/null +++ b/kernel/arm/swap_vfpv3.S @@ -0,0 +1,354 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/14 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_INC_X [fp, #0 ] +#define OLD_Y [fp, #4 ] +#define OLD_INC_Y [fp, #8 ] + + +#define N r0 +#define Y r1 +#define INC_X r2 +#define X r3 +#define INC_Y r4 + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +/*****************************************************************************************/ + + + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + fldmiad X, { d0 - d3 } + fldmiad 
Y, { d4 - d7 } + fstmiad Y!, { d0 - d3 } + fstmiad X!, { d4 - d7} + +.endm + + +.macro KERNEL_F1 + + fldmiad X, { d0 } + fldmiad Y, { d4 } + fstmiad Y!, { d0 } + fstmiad X!, { d4 } + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d0 } + fldmiad Y, { d4 } + fstmiad Y, { d0 } + fstmiad X, { d4 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + +#else + +.macro KERNEL_F4 + + fldmias X, { s0 - s3 } + fldmias Y, { s4 - s7 } + fstmias Y!, { s0 - s3 } + fstmias X!, { s4 - s7} + +.endm + + +.macro KERNEL_F1 + + fldmias X, { s0 } + fldmias Y, { s4 } + fstmias Y!, { s0 } + fstmias X!, { s4 } + +.endm + +.macro KERNEL_S1 + + fldmias X, { s0 } + fldmias Y, { s4 } + fstmias Y, { s0 } + fstmias X, { s4 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +#endif + +#else + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + fldmiad X, { d0 - d3 } + fldmiad Y, { d4 - d7 } + fstmiad Y!, { d0 - d3 } + fstmiad X!, { d4 - d7} + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + fldmiad X, { d0 - d3 } + fldmiad Y, { d4 - d7 } + fstmiad Y!, { d0 - d3 } + fstmiad X!, { d4 - d7} + +.endm + +.macro KERNEL_F1 + + fldmiad X, { d0 - d1 } + fldmiad Y, { d4 - d5 } + fstmiad Y!, { d0 - d1 } + fstmiad X!, { d4 - d5 } + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d0 - d1 } + fldmiad Y, { d4 - d5 } + fstmiad Y, { d0 - d1 } + fstmiad X, { d4 - d5 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +#else + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + fldmias X, { s0 - s3 } + fldmias Y, { s4 - s7 } + fstmias Y!, { s0 - s3 } + fstmias X!, { s4 - s7} + + fldmias X, { s0 - s3 } + fldmias Y, { s4 - s7 } + fstmias Y!, { s0 - s3 } + fstmias X!, { s4 - s7} + +.endm + +.macro KERNEL_F1 + + fldmias X, { s0 - s1 } + fldmias Y, { s4 - s5 } + fstmias Y!, { s0 - s1 } + fstmias X!, { s4 - s5 } + +.endm + +.macro KERNEL_S1 + + fldmias X, { s0 - s1 } + fldmias Y, { s4 - s5 } + fstmias Y, { s0 - s1 } + fstmias X, { s4 - s5 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +#endif 
+ +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4 , fp} + add fp, sp, #8 + + ldr INC_X , OLD_INC_X + ldr Y, OLD_Y + ldr INC_Y , OLD_INC_Y + + + cmp N, #0 + ble swap_kernel_L999 + + cmp INC_X, #0 + beq swap_kernel_L999 + + cmp INC_Y, #0 + beq swap_kernel_L999 + + cmp INC_X, #1 + bne swap_kernel_S_BEGIN + + cmp INC_Y, #1 + bne swap_kernel_S_BEGIN + + +swap_kernel_F_BEGIN: + + + asrs I, N, #2 // I = N / 4 + ble swap_kernel_F1 + + .align 5 + +swap_kernel_F4: + +#if !defined(COMPLEX) && !defined(DOUBLE) + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] +#endif + + KERNEL_F4 + + subs I, I, #1 + ble swap_kernel_F1 + + KERNEL_F4 + + subs I, I, #1 + bne swap_kernel_F4 + +swap_kernel_F1: + + ands I, N, #3 + ble swap_kernel_L999 + +swap_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne swap_kernel_F10 + + b swap_kernel_L999 + +swap_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE + lsl INC_Y, INC_Y, #2 // INC_Y * SIZE +#endif + +#endif + + + asrs I, N, #2 // I = N / 4 + ble swap_kernel_S1 + + .align 5 + +swap_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne swap_kernel_S4 + +swap_kernel_S1: + + ands I, N, #3 + ble swap_kernel_L999 + +swap_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne swap_kernel_S10 + + +swap_kernel_L999: + + mov r0, #0 // set return value + + sub sp, fp, #8 + pop {r4,fp} + bx lr + + EPILOGUE + From f1b452e160c9954d40b50e00962db311bccace3c Mon Sep 17 00:00:00 2001 From: wernsaar 
Date: Fri, 15 Nov 2013 11:56:43 +0100 Subject: [PATCH 35/81] added scal kernel for all precisions --- kernel/arm/KERNEL.ARMV7 | 8 +- kernel/arm/scal_vfpv3.S | 376 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 380 insertions(+), 4 deletions(-) create mode 100644 kernel/arm/scal_vfpv3.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 8118c5330..4e1939574 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -65,10 +65,10 @@ DROTKERNEL = ../arm/rot.c CROTKERNEL = ../arm/zrot.c ZROTKERNEL = ../arm/zrot.c -SSCALKERNEL = ../arm/scal.c -DSCALKERNEL = ../arm/scal.c -CSCALKERNEL = ../arm/zscal.c -ZSCALKERNEL = ../arm/zscal.c +SSCALKERNEL = scal_vfpv3.S +DSCALKERNEL = scal_vfpv3.S +CSCALKERNEL = scal_vfpv3.S +ZSCALKERNEL = scal_vfpv3.S SGEMVNKERNEL = gemv_n.c DGEMVNKERNEL = gemv_n.c diff --git a/kernel/arm/scal_vfpv3.S b/kernel/arm/scal_vfpv3.S new file mode 100644 index 000000000..a04b7241e --- /dev/null +++ b/kernel/arm/scal_vfpv3.S @@ -0,0 +1,376 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/15 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_INC_X [sp, #0 ] + /* Register roles as used below: N = element count, X = pointer every kernel loads from and stores to, INC_X = stride (read from the stack slot OLD_INC_X at entry), I = loop counter. The multiplier is read from d0 (d0 and d1 when COMPLEX), or s0 and s1 in single precision. NOTE(review): presumably these are the caller's scalar argument under a hard-float calling convention - confirm against the C prototype. */ + +#define N r0 +#define INC_X r1 +#define X r3 + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +/*****************************************************************************************/ + + + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + /* Real double precision. KERNEL_F4: multiply four contiguous doubles at X by d0 in place and advance X past them. */ +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + fldmiad X, { d4 - d7 } + vmul.f64 d4, d4, d0 + vmul.f64 d5, d5, d0 + vmul.f64 d6, d6, d0 + fstmiad X!, { d4 - d5 } + vmul.f64 d7, d7, d0 + fstmiad X!, { d6 - d7 } + +.endm + /* KERNEL_F1: the same for a single contiguous element. */ + +.macro KERNEL_F1 + + fldmiad X, { d4 } + vmul.f64 d4, d4, d0 + fstmiad X!, { d4 } + +.endm + /* KERNEL_S1: strided form - scale one element in place, then add INC_X (converted to bytes at scal_kernel_S_BEGIN) to X. */ +.macro KERNEL_S1 + + fldmiad X, { d4 } + vmul.f64 d4, d4,
d0 + fstmiad X, { d4 } + add X, X, INC_X + +.endm + +#else + /* Real single precision: the same three kernels on s-registers with the multiplier in s0. This KERNEL_F4 has no pld of its own; the loop body issues it instead. */ +.macro KERNEL_F4 + + fldmias X, { s4 - s7 } + vmul.f32 s4, s4, s0 + vmul.f32 s5, s5, s0 + vmul.f32 s6, s6, s0 + fstmias X!, { s4 - s5 } + vmul.f32 s7, s7, s0 + fstmias X!, { s6 - s7 } + +.endm + + +.macro KERNEL_F1 + + fldmias X, { s4 } + vmul.f32 s4, s4, s0 + fstmias X!, { s4 } + +.endm + +.macro KERNEL_S1 + + fldmias X, { s4 } + vmul.f32 s4, s4, s0 + fstmias X, { s4 } + add X, X, INC_X + +.endm + + + +#endif + +#else + /* Complex kernels: an element is the pair (re, im) loaded into d4/d5 (s4/s5). The stored pair is d2 = d0*re minus d1*im and d3 = d0*im plus d1*re, i.e. a complex multiply by (d0, d1). KERNEL_F4 handles four elements, KERNEL_F1 one, KERNEL_S1 one followed by a strided advance. */ +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + + fldmiad X, { d4 - d5 } + vmul.f64 d2, d0, d4 + fnmacd d2, d1, d5 + vmul.f64 d3, d0, d5 + fmacd d3, d1, d4 + fstmiad X!, { d2 - d3 } + + fldmiad X, { d4 - d5 } + vmul.f64 d2, d0, d4 + fnmacd d2, d1, d5 + vmul.f64 d3, d0, d5 + fmacd d3, d1, d4 + fstmiad X!, { d2 - d3 } + + pld [ X, #X_PRE ] + + fldmiad X, { d4 - d5 } + vmul.f64 d2, d0, d4 + fnmacd d2, d1, d5 + vmul.f64 d3, d0, d5 + fmacd d3, d1, d4 + fstmiad X!, { d2 - d3 } + + fldmiad X, { d4 - d5 } + vmul.f64 d2, d0, d4 + fnmacd d2, d1, d5 + vmul.f64 d3, d0, d5 + fmacd d3, d1, d4 + fstmiad X!, { d2 - d3 } + +.endm + + +.macro KERNEL_F1 + + fldmiad X, { d4 - d5 } + vmul.f64 d2, d0, d4 + fnmacd d2, d1, d5 + vmul.f64 d3, d0, d5 + fmacd d3, d1, d4 + fstmiad X!, { d2 - d3 } + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d4 - d5 } + vmul.f64 d2, d0, d4 + fnmacd d2, d1, d5 + vmul.f64 d3, d0, d5 + fmacd d3, d1, d4 + fstmiad X, { d2 - d3 } + add X, X, INC_X + +.endm + + +#else + /* Complex single precision: the same arithmetic on s-registers. */ +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + + fldmias X, { s4 - s5 } + vmul.f32 s2, s0, s4 + fnmacs s2, s1, s5 + vmul.f32 s3, s0, s5 + fmacs s3, s1, s4 + fstmias X!, { s2 - s3 } + + fldmias X, { s4 - s5 } + vmul.f32 s2, s0, s4 + fnmacs s2, s1, s5 + vmul.f32 s3, s0, s5 + fmacs s3, s1, s4 + fstmias X!, { s2 - s3 } + + fldmias X, { s4 - s5 } + vmul.f32 s2, s0, s4 + fnmacs s2, s1, s5 + vmul.f32 s3, s0, s5 + fmacs s3, s1, s4 + fstmias X!, { s2 - s3 } + + fldmias X, { s4 - s5 } + vmul.f32 s2, s0, s4 + fnmacs s2,
s1, s5 + vmul.f32 s3, s0, s5 + fmacs s3, s1, s4 + fstmias X!, { s2 - s3 } + +.endm + + +.macro KERNEL_F1 + + fldmias X, { s4 - s5 } + vmul.f32 s2, s0, s4 + fnmacs s2, s1, s5 + vmul.f32 s3, s0, s5 + fmacs s3, s1, s4 + fstmias X!, { s2 - s3 } + +.endm + +.macro KERNEL_S1 + + fldmias X, { s4 - s5 } + vmul.f32 s2, s0, s4 + fnmacs s2, s1, s5 + vmul.f32 s3, s0, s5 + fmacs s3, s1, s4 + fstmias X, { s2 - s3 } + add X, X, INC_X + +.endm + + + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + /* Entry: load INC_X from the stack, then branch straight to the exit (r0 = 0) when N <= 0 or INC_X <= 0. INC_X == 1 falls through to the contiguous path; any other stride goes to scal_kernel_S_BEGIN. */ + PROLOGUE + + .align 5 + + ldr INC_X , OLD_INC_X + + cmp N, #0 + ble scal_kernel_L999 + + cmp INC_X, #0 + ble scal_kernel_L999 + + cmp INC_X, #1 + bne scal_kernel_S_BEGIN + + +scal_kernel_F_BEGIN: + + + asrs I, N, #2 // I = N / 4 + ble scal_kernel_F1 + + .align 5 + /* Contiguous loop: I counts groups of four elements; the body is unrolled to two KERNEL_F4 expansions with an early exit between them. */ +scal_kernel_F4: + +#if !defined(COMPLEX) && !defined(DOUBLE) + pld [ X, #X_PRE ] +#endif + + KERNEL_F4 + + subs I, I, #1 + ble scal_kernel_F1 + + KERNEL_F4 + + subs I, I, #1 + bne scal_kernel_F4 + /* Remainder of the contiguous path: the N & 3 leftover elements, one KERNEL_F1 each. */ +scal_kernel_F1: + + ands I, N, #3 + ble scal_kernel_L999 + +scal_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne scal_kernel_F10 + + b scal_kernel_L999 + /* Strided path: convert INC_X from elements to bytes (shift depends on COMPLEX and DOUBLE), run four KERNEL_S1 per iteration, then the N & 3 remainder. */ +scal_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE +#endif + +#endif + + + asrs I, N, #2 // I = N / 4 + ble scal_kernel_S1 + + .align 5 + +scal_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne scal_kernel_S4 + +scal_kernel_S1: + + ands I, N, #3 + ble scal_kernel_L999 + +scal_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne scal_kernel_S10 + + +scal_kernel_L999: + + mov r0, #0 // set return value + + bx lr + + EPILOGUE + From
23dd474cd080f5e1beda6925f184eda9ed133764 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 15 Nov 2013 14:08:57 +0100 Subject: [PATCH 36/81] added rot kernel for all precisions --- kernel/arm/KERNEL.ARMV7 | 8 +- kernel/arm/rot_vfpv3.S | 584 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 588 insertions(+), 4 deletions(-) create mode 100644 kernel/arm/rot_vfpv3.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 4e1939574..d349e2aa5 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -60,10 +60,10 @@ DNRM2KERNEL = ../arm/nrm2.c CNRM2KERNEL = ../arm/znrm2.c ZNRM2KERNEL = ../arm/znrm2.c -SROTKERNEL = ../arm/rot.c -DROTKERNEL = ../arm/rot.c -CROTKERNEL = ../arm/zrot.c -ZROTKERNEL = ../arm/zrot.c +SROTKERNEL = rot_vfpv3.S +DROTKERNEL = rot_vfpv3.S +CROTKERNEL = rot_vfpv3.S +ZROTKERNEL = rot_vfpv3.S SSCALKERNEL = scal_vfpv3.S DSCALKERNEL = scal_vfpv3.S diff --git a/kernel/arm/rot_vfpv3.S b/kernel/arm/rot_vfpv3.S new file mode 100644 index 000000000..663ecdf81 --- /dev/null +++ b/kernel/arm/rot_vfpv3.S @@ -0,0 +1,584 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/15 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_INC_Y [fp, #0 ] + /* Register roles as used below: N = element count, X and Y = the two vector pointers, INC_X and INC_Y = their strides (INC_Y is read from the stack slot OLD_INC_Y once r4 and fp have been pushed), I = loop counter. d0 and d1 (s0 and s1 in single precision) are the two coefficients every kernel multiplies by; the comments below call them c and s. */ + +#define N r0 +#define X r1 +#define INC_X r2 +#define Y r3 +#define INC_Y r4 + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +/*****************************************************************************************/ + + + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + /* Real double precision. With x in d4 and y in d5, each step stores c*x plus s*y back to X and c*y minus s*x back to Y. KERNEL_F4 does four contiguous elements, post-incrementing both pointers. */ +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + fldmiad X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + fldmiad X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5
+ fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + fldmiad X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + fldmiad X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + +.endm + /* KERNEL_F1: one contiguous element. */ + +.macro KERNEL_F1 + + fldmiad X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + +.endm + /* KERNEL_S1: one element stored in place, then X and Y advance by the byte strides INC_X and INC_Y. */ +.macro KERNEL_S1 + + fldmiad X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + fstmiad X, { d2 } + fstmiad Y, { d3 } + + add X, X, INC_X + add Y, Y, INC_Y + +.endm + +#else + /* Real single precision: the same three kernels on s-registers. This KERNEL_F4 has no pld of its own; the loop body issues it instead. */ +.macro KERNEL_F4 + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + +.endm + + +.macro KERNEL_F1 + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + +.endm + +.macro KERNEL_S1 + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X, { s2 } + fstmias Y, { s3 } + + add X, X, INC_X + add Y, Y,
INC_Y + +.endm + + + +#endif + +#else + /* Complex kernels: each element is a pair, x in d4/d5 and y in d6/d7 (s4..s7 in single precision). The same real-coefficient update is applied first to the pair (d4, d6) and then to the pair (d5, d7). */ +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + vmul.f64 d2 , d0, d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + vmul.f64 d2 , d0, d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + vmul.f64 d2 , d0, d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + vmul.f64 d2 , d0, d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + +.endm + + +.macro KERNEL_F1 + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + vmul.f64 d2 , d0, d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + +.endm + /* Strided complex kernel: results are written with vstr at fixed offsets 0 and 8, so X and Y move only by the byte strides added at the end. */ +.macro KERNEL_S1 + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + fnmacd d3 , d1, d4 + vstr d2 , [ X, #0 ] + vstr d3 , [ Y, #0 ] + vmul.f64 d2 , d0, d5 + fmacd d2 , d1,
d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + vstr d2 , [ X, #8 ] + vstr d3 , [ Y, #8 ] + + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +#else + /* Complex single precision: same arithmetic on s-registers; the strided kernel stores at offsets 0 and 4. */ +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + fldmias X, { s4 - s5 } + fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + fldmias X, { s4 - s5 } + fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + fldmias X, { s4 - s5 } + fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + fldmias X, { s4 - s5 } + fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + fstmias X!, { s2 } + fstmias Y!, { s3 } + +.endm + + +.macro KERNEL_F1 + + fldmias X, { s4 - s5 } + fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + +.endm + +.macro KERNEL_S1 + + fldmias X, { s4 - s5 } + fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 ,
s1, s4 + vstr s2 , [ X, #0 ] + vstr s3 , [ Y, #0 ] + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + vstr s2 , [ X, #4 ] + vstr s3 , [ Y, #4 ] + + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + /* Entry: push r4 and fp, set fp to the stack pointer value from before the push, and load INC_Y from there. Nothing is processed when N <= 0 or either stride is zero. The contiguous path is used only when both strides are 1; every other case goes to rot_kernel_S_BEGIN. */ + PROLOGUE + + .align 5 + push {r4 , fp} + add fp, sp, #8 + + ldr INC_Y , OLD_INC_Y + + + cmp N, #0 + ble rot_kernel_L999 + + cmp INC_X, #0 + beq rot_kernel_L999 + + cmp INC_Y, #0 + beq rot_kernel_L999 + + cmp INC_X, #1 + bne rot_kernel_S_BEGIN + + cmp INC_Y, #1 + bne rot_kernel_S_BEGIN + + +rot_kernel_F_BEGIN: + + + asrs I, N, #2 // I = N / 4 + ble rot_kernel_F1 + + .align 5 + /* Contiguous loop: I counts groups of four elements; the body is unrolled to two KERNEL_F4 expansions with an early exit between them. */ +rot_kernel_F4: + +#if !defined(COMPLEX) && !defined(DOUBLE) + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] +#endif + + KERNEL_F4 + + subs I, I, #1 + ble rot_kernel_F1 + + KERNEL_F4 + + subs I, I, #1 + bne rot_kernel_F4 + /* Remainder of the contiguous path: the N & 3 leftover elements, one KERNEL_F1 each. */ +rot_kernel_F1: + + ands I, N, #3 + ble rot_kernel_L999 + +rot_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne rot_kernel_F10 + + b rot_kernel_L999 + /* Strided path: convert both strides from elements to bytes (shift depends on COMPLEX and DOUBLE), run four KERNEL_S1 per iteration, then the N & 3 remainder. */ +rot_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE + lsl INC_Y, INC_Y, #2 // INC_Y * SIZE +#endif + +#endif + + + asrs I, N, #2 // I = N / 4 + ble rot_kernel_S1 + + .align 5 + +rot_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne rot_kernel_S4 + +rot_kernel_S1: + + ands I, N, #3 + ble rot_kernel_L999 + +rot_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne rot_kernel_S10 + +
+rot_kernel_L999: + mov r0, #0 // set return value + /* Common exit: undo the entry sequence (sp back to fp minus 8, then pop the saved r4 and fp) before returning. */ + sub sp, fp, #8 + pop {r4,fp} + bx lr + + EPILOGUE + From f27cabfd082d2f4a98d2a555ea27863b33ae8999 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 16 Nov 2013 16:17:17 +0100 Subject: [PATCH 37/81] added nrm2 kernel for all precisions --- kernel/arm/KERNEL.ARMV7 | 8 +- kernel/arm/nrm2_vfpv3.S | 508 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 512 insertions(+), 4 deletions(-) create mode 100644 kernel/arm/nrm2_vfpv3.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index d349e2aa5..c5aeef560 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -55,10 +55,10 @@ DDOTKERNEL = ddot_vfpv3.S CDOTKERNEL = cdot_vfpv3.S ZDOTKERNEL = zdot_vfpv3.S -SNRM2KERNEL = ../arm/nrm2.c -DNRM2KERNEL = ../arm/nrm2.c -CNRM2KERNEL = ../arm/znrm2.c -ZNRM2KERNEL = ../arm/znrm2.c +SNRM2KERNEL = nrm2_vfpv3.S +DNRM2KERNEL = nrm2_vfpv3.S +CNRM2KERNEL = nrm2_vfpv3.S +ZNRM2KERNEL = nrm2_vfpv3.S SROTKERNEL = rot_vfpv3.S DROTKERNEL = rot_vfpv3.S diff --git a/kernel/arm/nrm2_vfpv3.S b/kernel/arm/nrm2_vfpv3.S new file mode 100644 index 000000000..b56f8b038 --- /dev/null +++ b/kernel/arm/nrm2_vfpv3.S @@ -0,0 +1,508 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3.
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/16 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + /* All kernels maintain a scaled sum of squares. State (set up at entry): d0 = scale, d1 = ssq, d6 = 0.0, d7 = 1.0 (s0, s1, s6, s7 in single precision). Per value x: a zero is skipped; otherwise x is replaced by its absolute value and, if scale >= x, ssq grows by (x/scale) squared, else ssq becomes 1 plus ssq*(scale/x) squared and scale becomes x. The exit code returns scale * sqrt(ssq). */ +#if !defined(COMPLEX) + +#if defined(DOUBLE) + /* Real double precision. KERNEL_F1 consumes one contiguous value (X post-increments). Its label carries the per-expansion counter because the macro is expanded many times. */ + +.macro KERNEL_F1 + + fldmiad X!, { d4 } + vcmpe.f64 d4, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_NEXT_\@ + vabs.f64 d4, d4 + vcmpe.f64 d0, d4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d4, d0 // scale >= x ?
x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_NEXT_\@ + vdiv.f64 d2 , d0, d4 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d4 // scale = x + +KERNEL_F1_NEXT_\@: + +.endm + /* KERNEL_F8: eight KERNEL_F1 steps with prefetches. */ +.macro KERNEL_F8 + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + +.endm + /* KERNEL_S1: strided form of KERNEL_F1; X advances by the byte stride INC_X. The label now carries the per-expansion counter, like KERNEL_F1 and the complex KERNEL_S1, so a second expansion cannot produce a duplicate symbol. */ +.macro KERNEL_S1 + + fldmiad X, { d4 } + vcmpe.f64 d4, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_NEXT_\@ + vabs.f64 d4, d4 + vcmpe.f64 d0, d4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_NEXT_\@ + vdiv.f64 d2 , d0, d4 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d4 // scale = x + +KERNEL_S1_NEXT_\@: + + add X, X, INC_X + +.endm + +#else + /* Real single precision: the same kernels on s-registers. */ +.macro KERNEL_F1 + + fldmias X!, { s4 } + vcmpe.f32 s4, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_NEXT_\@ + vabs.f32 s4, s4 + vcmpe.f32 s0, s4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s4, s0 // scale >= x ?
x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_NEXT_\@ + vdiv.f32 s2 , s0, s4 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s4 // scale = x + +KERNEL_F1_NEXT_\@: + +.endm + +.macro KERNEL_F8 + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + +.endm + /* KERNEL_S1 (single precision): label made per-expansion, same change as in the double-precision variant. */ +.macro KERNEL_S1 + + fldmias X, { s4 } + vcmpe.f32 s4, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_NEXT_\@ + vabs.f32 s4, s4 + vcmpe.f32 s0, s4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_NEXT_\@ + vdiv.f32 s2 , s0, s4 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s4 // scale = x + +KERNEL_S1_NEXT_\@: + + add X, X, INC_X + +.endm + + + + +#endif + +#else + /* Complex kernels: each element is the pair loaded into d4/d5 (s4/s5); both components are folded into scale and ssq one after the other with the same update as in the real case. */ +#if defined(DOUBLE) + +.macro KERNEL_F1 + + fldmiad X!, { d4 - d5 } + + vcmpe.f64 d4, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_NEXT_\@ + vabs.f64 d4, d4 + vcmpe.f64 d0, d4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d4, d0 // scale >= x ?
x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_NEXT_\@ + vdiv.f64 d2 , d0, d4 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d4 // scale = x + +KERNEL_F1_NEXT_\@: + + vcmpe.f64 d5, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_END_\@ + vabs.f64 d5, d5 + vcmpe.f64 d0, d5 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_END_\@ + vdiv.f64 d2 , d0, d5 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d5 // scale = x + +KERNEL_F1_END_\@: + + +.endm + +.macro KERNEL_F8 + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d4 - d5 } + + vcmpe.f64 d4, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_NEXT_\@ + vabs.f64 d4, d4 + vcmpe.f64 d0, d4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d4, d0 // scale >= x ?
x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_NEXT_\@ + vdiv.f64 d2 , d0, d4 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d4 // scale = x + +KERNEL_S1_NEXT_\@: + + vcmpe.f64 d5, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_END_\@ + vabs.f64 d5, d5 + vcmpe.f64 d0, d5 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_END_\@ + vdiv.f64 d2 , d0, d5 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d5 // scale = x + +KERNEL_S1_END_\@: + + add X, X, INC_X + +.endm + + +#else + +.macro KERNEL_F1 + + fldmias X!, { s4 - s5 } + + vcmpe.f32 s4, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_NEXT_\@ + vabs.f32 s4, s4 + vcmpe.f32 s0, s4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_NEXT_\@ + vdiv.f32 s2 , s0, s4 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s4 // scale = x + +KERNEL_F1_NEXT_\@: + + vcmpe.f32 s5, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_END_\@ + vabs.f32 s5, s5 + vcmpe.f32 s0, s5 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s5, s0 // scale >= x ?
x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_END_\@ + vdiv.f32 s2 , s0, s5 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s5 // scale = x + +KERNEL_F1_END_\@: + + +.endm + +.macro KERNEL_F8 + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + +.endm + +.macro KERNEL_S1 + + fldmias X, { s4 - s5 } + + vcmpe.f32 s4, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_NEXT_\@ + vabs.f32 s4, s4 + vcmpe.f32 s0, s4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_NEXT_\@ + vdiv.f32 s2 , s0, s4 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s4 // scale = x + +KERNEL_S1_NEXT_\@: + + vcmpe.f32 s5, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_END_\@ + vabs.f32 s5, s5 + vcmpe.f32 s0, s5 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s5, s0 // scale >= x ?
x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_END_\@ + vdiv.f32 s2 , s0, s5 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s5 // scale = x + +KERNEL_S1_END_\@: + + add X, X, INC_X + +.endm + + + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + /* Entry: initialise scale = 0.0 and ssq = 1.0 and keep the constants 1.0 and 0.0 in d7 and d6 (s7 and s6). When N <= 0 or INC_X == 0 control goes straight to the exit, which then yields 0.0 times sqrt(1.0). INC_X == 1 takes the contiguous path; any other stride goes to nrm2_kernel_S_BEGIN. */ + PROLOGUE + + .align 5 + +#if defined(DOUBLE) + vsub.f64 d0 , d0 , d0 // scale=0.0 + vmov.f64 d1 , #1.0 // ssq=1.0 + vmov.f64 d7 , d1 // value 1.0 + vmov.f64 d6 , d0 // value 0.0 +#else + vsub.f32 s0 , s0 , s0 // scale=0.0 + vmov.f32 s1 , #1.0 // ssq=1.0 + vmov.f32 s7 , s1 // value 1.0 + vmov.f32 s6 , s0 // value 0.0 +#endif + + + + cmp N, #0 + ble nrm2_kernel_L999 + + cmp INC_X, #0 + beq nrm2_kernel_L999 + + + cmp INC_X, #1 + bne nrm2_kernel_S_BEGIN + + +nrm2_kernel_F_BEGIN: + + asrs I, N, #3 // I = N / 8 + ble nrm2_kernel_F1 + +nrm2_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne nrm2_kernel_F8 + /* Remainder of the contiguous path: the N & 7 leftover elements, one KERNEL_F1 each. */ +nrm2_kernel_F1: + + ands I, N, #7 + ble nrm2_kernel_L999 + + +nrm2_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne nrm2_kernel_F10 + + b nrm2_kernel_L999 + /* Strided path: convert INC_X from elements to bytes, then run KERNEL_S1 once per element (no unrolling). */ +nrm2_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE +#endif + +#endif + + + +nrm2_kernel_S1: + + mov I, N + + .align 5 + +nrm2_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne nrm2_kernel_S10 + + /* Exit: the result scale * sqrt(ssq) is left in d0 (s0). */ +nrm2_kernel_L999: + +#if defined(DOUBLE) + vsqrt.f64 d1, d1 + vmul.f64 d0, d0, d1 +#else + vsqrt.f32 s1, s1 + vmul.f32 s0, s0, s1 +#endif + + bx lr + + EPILOGUE +
From 86283c0be1e5292cbe656826e453223bf751be15 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 19 Nov 2013 09:55:54 +0100 Subject: [PATCH 38/81] added gemv_t kernel for single and double precision --- kernel/arm/KERNEL.ARMV7 | 4 +- kernel/arm/gemv_t_vfpv3.S | 732 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 734 insertions(+), 2 deletions(-) create mode 100644 kernel/arm/gemv_t_vfpv3.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index c5aeef560..d3ddfac10 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -75,8 +75,8 @@ DGEMVNKERNEL = gemv_n.c CGEMVNKERNEL = zgemv_n.c ZGEMVNKERNEL = zgemv_n.c -SGEMVTKERNEL = gemv_t.c -DGEMVTKERNEL = gemv_t.c +SGEMVTKERNEL = gemv_t_vfpv3.S +DGEMVTKERNEL = gemv_t_vfpv3.S CGEMVTKERNEL = zgemv_t.c ZGEMVTKERNEL = zgemv_t.c diff --git a/kernel/arm/gemv_t_vfpv3.S b/kernel/arm/gemv_t_vfpv3.S new file mode 100644 index 000000000..7ae5799bc --- /dev/null +++ b/kernel/arm/gemv_t_vfpv3.S @@ -0,0 +1,732 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/18 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#define OLD_A r3 +#define OLD_N r1 + +#define M r0 +#define AO1 r1 +#define J r2 + +#define AO2 r4 +#define XO r5 +#define YO r6 +#define LDA r7 +#define INC_X r8 +#define INC_Y r9 + +#define I r12 + +#define N [fp, #-252 ] +#define A [fp, #-256 ] + + +#define X_PRE 512 +#define A_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + + +#if defined(DOUBLE) + +.macro INIT_F2 + + vsub.f64 d4 , d4 , d4 + vsub.f64 d5 , d5 , d5 + +.endm + +.macro KERNEL_F2X4 + + pld [ XO , #X_PRE ] + fldmiad XO! 
, { d28 - d31 } + pld [ AO1 , #A_PRE ] + fldmiad AO1!, { d8 - d9 } + pld [ AO2 , #A_PRE ] + fldmiad AO2!, { d16 - d17 } + vmla.f64 d4 , d28 , d8 + vmla.f64 d5 , d28 , d16 + fldmiad AO1!, { d10 - d11 } + vmla.f64 d4 , d29 , d9 + vmla.f64 d5 , d29 , d17 + fldmiad AO2!, { d18 - d19 } + vmla.f64 d4 , d30, d10 + vmla.f64 d5 , d30, d18 + vmla.f64 d4 , d31, d11 + vmla.f64 d5 , d31, d19 + +.endm + + +.macro KERNEL_F2X1 + + fldmiad XO! , { d2 } + fldmiad AO1!, { d8 } + fldmiad AO2!, { d16 } + vmla.f64 d4 , d2 , d8 + vmla.f64 d5 , d2 , d16 + +.endm + +.macro SAVE_F2 + + fldmiad YO, { d24 - d25 } + vmla.f64 d24, d0, d4 + vmla.f64 d25, d0, d5 + fstmiad YO!, { d24 - d25 } + +.endm + +.macro INIT_S2 + + vsub.f64 d4 , d4 , d4 + vsub.f64 d5 , d5 , d5 + +.endm + +.macro KERNEL_S2X4 + + pld [ AO1 , #A_PRE ] + fldmiad XO , { d28 } + add XO, XO, INC_X + fldmiad AO1!, { d8 - d9 } + pld [ AO2 , #A_PRE ] + fldmiad AO2!, { d16 - d17 } + vmla.f64 d4 , d28 , d8 + fldmiad XO , { d29 } + add XO, XO, INC_X + vmla.f64 d5 , d28 , d16 + fldmiad AO1!, { d10 - d11 } + vmla.f64 d4 , d29 , d9 + fldmiad XO , { d30 } + add XO, XO, INC_X + vmla.f64 d5 , d29 , d17 + fldmiad AO2!, { d18 - d19 } + vmla.f64 d4 , d30, d10 + fldmiad XO , { d31 } + add XO, XO, INC_X + vmla.f64 d5 , d30, d18 + vmla.f64 d4 , d31, d11 + vmla.f64 d5 , d31, d19 + +.endm + + +.macro KERNEL_S2X1 + + fldmiad XO , { d2 } + fldmiad AO1!, { d8 } + add XO, XO, INC_X + fldmiad AO2!, { d16 } + vmla.f64 d4 , d2 , d8 + vmla.f64 d5 , d2 , d16 + +.endm + +.macro SAVE_S2 + + fldmiad YO, { d24 } + vmla.f64 d24, d0, d4 + fstmiad YO, { d24 } + add YO, YO, INC_Y + + fldmiad YO, { d24 } + vmla.f64 d24, d0, d5 + fstmiad YO, { d24 } + add YO, YO, INC_Y + +.endm + +.macro INIT_F1 + + vsub.f64 d4 , d4 , d4 + +.endm + +.macro KERNEL_F1X4 + + pld [ XO , #X_PRE ] + fldmiad XO! 
, { d28 - d31 } + pld [ AO1 , #A_PRE ] + fldmiad AO1!, { d8 - d9 } + vmla.f64 d4 , d28 , d8 + fldmiad AO1!, { d10 - d11 } + vmla.f64 d4 , d29 , d9 + vmla.f64 d4 , d30, d10 + vmla.f64 d4 , d31, d11 + +.endm + + +.macro KERNEL_F1X1 + + fldmiad XO! , { d2 } + fldmiad AO1!, { d8 } + vmla.f64 d4 , d2 , d8 + +.endm + +.macro SAVE_F1 + + fldmiad YO, { d24 } + vmla.f64 d24, d0, d4 + fstmiad YO!, { d24 } + +.endm + +.macro INIT_S1 + + vsub.f64 d4 , d4 , d4 + +.endm + +.macro KERNEL_S1X4 + + pld [ AO1 , #A_PRE ] + fldmiad XO , { d28 } + add XO, XO, INC_X + fldmiad AO1!, { d8 - d9 } + vmla.f64 d4 , d28 , d8 + fldmiad XO , { d29 } + add XO, XO, INC_X + fldmiad AO1!, { d10 - d11 } + vmla.f64 d4 , d29 , d9 + fldmiad XO , { d30 } + add XO, XO, INC_X + vmla.f64 d4 , d30, d10 + fldmiad XO , { d31 } + add XO, XO, INC_X + vmla.f64 d4 , d31, d11 + +.endm + + +.macro KERNEL_S1X1 + + fldmiad XO , { d2 } + fldmiad AO1!, { d8 } + add XO, XO, INC_X + vmla.f64 d4 , d2 , d8 + +.endm + +.macro SAVE_S1 + + fldmiad YO, { d24 } + vmla.f64 d24, d0, d4 + fstmiad YO, { d24 } + add YO, YO, INC_Y + +.endm + + +#else /************************* SINGLE PRECISION *****************************************/ + +.macro INIT_F2 + + vsub.f32 s4 , s4 , s4 + vsub.f32 s5 , s5 , s5 + +.endm + +.macro KERNEL_F2X4 + + fldmias XO! , { s28 - s31 } + fldmias AO1!, { s8 - s9 } + fldmias AO2!, { s16 - s17 } + vmla.f32 s4 , s28 , s8 + vmla.f32 s5 , s28 , s16 + fldmias AO1!, { s10 - s11 } + vmla.f32 s4 , s29 , s9 + vmla.f32 s5 , s29 , s17 + fldmias AO2!, { s18 - s19 } + vmla.f32 s4 , s30, s10 + vmla.f32 s5 , s30, s18 + vmla.f32 s4 , s31, s11 + vmla.f32 s5 , s31, s19 + +.endm + + +.macro KERNEL_F2X1 + + fldmias XO! 
, { s2 } + fldmias AO1!, { s8 } + fldmias AO2!, { s16 } + vmla.f32 s4 , s2 , s8 + vmla.f32 s5 , s2 , s16 + +.endm + +.macro SAVE_F2 + + fldmias YO, { s24 - s25 } + vmla.f32 s24, s0, s4 + vmla.f32 s25, s0, s5 + fstmias YO!, { s24 - s25 } + +.endm + +.macro INIT_S2 + + vsub.f32 s4 , s4 , s4 + vsub.f32 s5 , s5 , s5 + +.endm + +.macro KERNEL_S2X4 + + fldmias XO , { s28 } + add XO, XO, INC_X + fldmias AO1!, { s8 - s9 } + fldmias AO2!, { s16 - s17 } + vmla.f32 s4 , s28 , s8 + fldmias XO , { s29 } + add XO, XO, INC_X + vmla.f32 s5 , s28 , s16 + fldmias AO1!, { s10 - s11 } + vmla.f32 s4 , s29 , s9 + fldmias XO , { s30 } + add XO, XO, INC_X + vmla.f32 s5 , s29 , s17 + fldmias AO2!, { s18 - s19 } + vmla.f32 s4 , s30, s10 + fldmias XO , { s31 } + add XO, XO, INC_X + vmla.f32 s5 , s30, s18 + vmla.f32 s4 , s31, s11 + vmla.f32 s5 , s31, s19 + +.endm + + +.macro KERNEL_S2X1 + + fldmias XO , { s2 } + fldmias AO1!, { s8 } + add XO, XO, INC_X + fldmias AO2!, { s16 } + vmla.f32 s4 , s2 , s8 + vmla.f32 s5 , s2 , s16 + +.endm + +.macro SAVE_S2 + + fldmias YO, { s24 } + vmla.f32 s24, s0, s4 + fstmias YO, { s24 } + add YO, YO, INC_Y + + fldmias YO, { s24 } + vmla.f32 s24, s0, s5 + fstmias YO, { s24 } + add YO, YO, INC_Y + +.endm + +.macro INIT_F1 + + vsub.f32 s4 , s4 , s4 + +.endm + +.macro KERNEL_F1X4 + + fldmias XO! , { s28 - s31 } + fldmias AO1!, { s8 - s9 } + vmla.f32 s4 , s28 , s8 + fldmias AO1!, { s10 - s11 } + vmla.f32 s4 , s29 , s9 + vmla.f32 s4 , s30, s10 + vmla.f32 s4 , s31, s11 + +.endm + + +.macro KERNEL_F1X1 + + fldmias XO! 
, { s2 } + fldmias AO1!, { s8 } + vmla.f32 s4 , s2 , s8 + +.endm + +.macro SAVE_F1 + + fldmias YO, { s24 } + vmla.f32 s24, s0, s4 + fstmias YO!, { s24 } + +.endm + +.macro INIT_S1 + + vsub.f32 s4 , s4 , s4 + +.endm + +.macro KERNEL_S1X4 + + fldmias XO , { s28 } + add XO, XO, INC_X + fldmias AO1!, { s8 - s9 } + vmla.f32 s4 , s28 , s8 + fldmias XO , { s29 } + add XO, XO, INC_X + fldmias AO1!, { s10 - s11 } + vmla.f32 s4 , s29 , s9 + fldmias XO , { s30 } + add XO, XO, INC_X + vmla.f32 s4 , s30, s10 + fldmias XO , { s31 } + add XO, XO, INC_X + vmla.f32 s4 , s31, s11 + +.endm + + +.macro KERNEL_S1X1 + + fldmias XO , { s2 } + fldmias AO1!, { s8 } + add XO, XO, INC_X + vmla.f32 s4 , s2 , s8 + +.endm + +.macro SAVE_S1 + + fldmias YO, { s24 } + vmla.f32 s24, s0, s4 + fstmias YO, { s24 } + add YO, YO, INC_Y + +.endm + + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4 - r9 , fp} + add fp, sp, #28 + sub sp, sp, #STACKSIZE // reserve stack + + sub r12, fp, #192 + +#if defined(DOUBLE) + vstm r12, { d8 - d15 } // store floating point registers +#else + vstm r12, { s8 - s31 } // store floating point registers +#endif + + cmp M, #0 + ble gemvt_kernel_L999 + + cmp OLD_N, #0 + ble gemvt_kernel_L999 + + str OLD_A, A + str OLD_N, N + + ldr INC_X , OLD_INC_X + ldr INC_Y , OLD_INC_Y + + cmp INC_X, #0 + beq gemvt_kernel_L999 + + cmp INC_Y, #0 + beq gemvt_kernel_L999 + + ldr LDA, OLD_LDA + + +#if defined(DOUBLE) + lsl LDA, LDA, #3 // LDA * SIZE +#else + lsl LDA, LDA, #2 // LDA * SIZE +#endif + + cmp INC_X, #1 + bne gemvt_kernel_S2_BEGIN + + cmp INC_Y, #1 + bne gemvt_kernel_S2_BEGIN + + +gemvt_kernel_F2_BEGIN: + + ldr YO , Y + + ldr J, N + asrs J, J, #1 // J = N / 2 + ble gemvt_kernel_F1_BEGIN + +gemvt_kernel_F2X4: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO2, LDA + str r3 , A + + ldr 
XO , X + + INIT_F2 + + asrs I, M, #2 // I = M / 4 + ble gemvt_kernel_F2X1 + + +gemvt_kernel_F2X4_10: + + KERNEL_F2X4 + + subs I, I, #1 + bne gemvt_kernel_F2X4_10 + + +gemvt_kernel_F2X1: + + ands I, M , #3 + ble gemvt_kernel_F2_END + +gemvt_kernel_F2X1_10: + + KERNEL_F2X1 + + subs I, I, #1 + bne gemvt_kernel_F2X1_10 + + +gemvt_kernel_F2_END: + + SAVE_F2 + + subs J , J , #1 + bne gemvt_kernel_F2X4 + + +gemvt_kernel_F1_BEGIN: + + ldr J, N + ands J, J, #1 + ble gemvt_kernel_L999 + +gemvt_kernel_F1X4: + + ldr AO1, A + + ldr XO , X + + INIT_F1 + + asrs I, M, #2 // I = M / 4 + ble gemvt_kernel_F1X1 + + +gemvt_kernel_F1X4_10: + + KERNEL_F1X4 + + subs I, I, #1 + bne gemvt_kernel_F1X4_10 + + +gemvt_kernel_F1X1: + + ands I, M , #3 + ble gemvt_kernel_F1_END + +gemvt_kernel_F1X1_10: + + KERNEL_F1X1 + + subs I, I, #1 + bne gemvt_kernel_F1X1_10 + + +gemvt_kernel_F1_END: + + SAVE_F1 + + b gemvt_kernel_L999 + + + +/*************************************************************************************************************/ + +gemvt_kernel_S2_BEGIN: + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE + lsl INC_Y, INC_Y, #2 // INC_Y * SIZE +#endif + + ldr YO , Y + + ldr J, N + asrs J, J, #1 // J = N / 2 + ble gemvt_kernel_S1_BEGIN + +gemvt_kernel_S2X4: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO2, LDA + str r3 , A + + ldr XO , X + + INIT_S2 + + asrs I, M, #2 // I = M / 4 + ble gemvt_kernel_S2X1 + + +gemvt_kernel_S2X4_10: + + KERNEL_S2X4 + + subs I, I, #1 + bne gemvt_kernel_S2X4_10 + + +gemvt_kernel_S2X1: + + ands I, M , #3 + ble gemvt_kernel_S2_END + +gemvt_kernel_S2X1_10: + + KERNEL_S2X1 + + subs I, I, #1 + bne gemvt_kernel_S2X1_10 + + +gemvt_kernel_S2_END: + + SAVE_S2 + + subs J , J , #1 + bne gemvt_kernel_S2X4 + + +gemvt_kernel_S1_BEGIN: + + ldr J, N + ands J, J, #1 + ble gemvt_kernel_L999 + +gemvt_kernel_S1X4: + + ldr AO1, A + + ldr XO , X + + INIT_S1 + + asrs I, M, #2 // I = M / 
4 + ble gemvt_kernel_S1X1 + + +gemvt_kernel_S1X4_10: + + KERNEL_S1X4 + + subs I, I, #1 + bne gemvt_kernel_S1X4_10 + + +gemvt_kernel_S1X1: + + ands I, M , #3 + ble gemvt_kernel_S1_END + +gemvt_kernel_S1X1_10: + + KERNEL_S1X1 + + subs I, I, #1 + bne gemvt_kernel_S1X1_10 + + +gemvt_kernel_S1_END: + + SAVE_S1 + + + +/*************************************************************************************************************/ + +gemvt_kernel_L999: + + sub r3, fp, #192 + +#if defined(DOUBLE) + vldm r3, { d8 - d15 } // restore floating point registers +#else + vldm r3, { s8 - s31 } // restore floating point registers +#endif + + mov r0, #0 // set return value + + sub sp, fp, #28 + pop {r4 -r9 ,fp} + bx lr + + EPILOGUE + From bf04544902fdd6588fa6e43a772f03c24e96069e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 19 Nov 2013 15:07:20 +0100 Subject: [PATCH 39/81] added gemv_n kernel for single and double precision --- kernel/arm/KERNEL.ARMV7 | 4 +- kernel/arm/gemv_n_vfpv3.S | 781 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 783 insertions(+), 2 deletions(-) create mode 100644 kernel/arm/gemv_n_vfpv3.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index d3ddfac10..8d6acbe98 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -70,8 +70,8 @@ DSCALKERNEL = scal_vfpv3.S CSCALKERNEL = scal_vfpv3.S ZSCALKERNEL = scal_vfpv3.S -SGEMVNKERNEL = gemv_n.c -DGEMVNKERNEL = gemv_n.c +SGEMVNKERNEL = gemv_n_vfpv3.S +DGEMVNKERNEL = gemv_n_vfpv3.S CGEMVNKERNEL = zgemv_n.c ZGEMVNKERNEL = zgemv_n.c diff --git a/kernel/arm/gemv_n_vfpv3.S b/kernel/arm/gemv_n_vfpv3.S new file mode 100644 index 000000000..e031c331e --- /dev/null +++ b/kernel/arm/gemv_n_vfpv3.S @@ -0,0 +1,781 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/19 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#define OLD_A r3 +#define OLD_M r0 + +#define AO1 r0 +#define N r1 +#define J r2 + +#define AO2 r4 +#define XO r5 +#define YO r6 +#define LDA r7 +#define INC_X r8 +#define INC_Y r9 + +#define I r12 + +#define M [fp, #-252 ] +#define A [fp, #-256 ] + + +#define X_PRE 64 +#define Y_PRE 0 +#define A_PRE 0 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + + +#if defined(DOUBLE) + +.macro INIT_F8 + + pld [ YO , #Y_PRE ] + pld [ YO , #Y_PRE+32 ] + + vsub.f64 d24 , d24 , d24 + vmov.f64 d25 , d24 + vmov.f64 d26 , d24 + vmov.f64 d27 , d24 + vmov.f64 d28 , d24 + vmov.f64 d29 , d24 + vmov.f64 d30 , d24 + vmov.f64 d31 , d24 + +.endm + +.macro KERNEL_F8X8 + + pld [ XO , #X_PRE ] + KERNEL_F8X1 + KERNEL_F8X1 + KERNEL_F8X1 + KERNEL_F8X1 + + pld [ XO , #X_PRE ] + KERNEL_F8X1 + KERNEL_F8X1 + KERNEL_F8X1 + KERNEL_F8X1 + +.endm + + +.macro KERNEL_F8X1 + + fldmiad XO! 
, { d4 } + fldmiad AO1 , { d8 - d15 } + + vmla.f64 d24 , d4 , d8 + pld [ AO2 , #A_PRE ] + vmla.f64 d25 , d4 , d9 + pld [ AO2 , #A_PRE+32 ] + vmla.f64 d26 , d4 , d10 + vmla.f64 d27 , d4 , d11 + vmla.f64 d28 , d4 , d12 + vmla.f64 d29 , d4 , d13 + add AO1, AO1, LDA + vmla.f64 d30 , d4 , d14 + add AO2, AO2, LDA + vmla.f64 d31 , d4 , d15 + +.endm + +.macro SAVE_F8 + + fldmiad YO, { d16 - d23 } + + vmla.f64 d16, d0, d24 + vmla.f64 d17, d0, d25 + vmla.f64 d18, d0, d26 + vmla.f64 d19, d0, d27 + vmla.f64 d20, d0, d28 + vmla.f64 d21, d0, d29 + vmla.f64 d22, d0, d30 + vmla.f64 d23, d0, d31 + + fstmiad YO!, { d16 - d23 } + +.endm + + +.macro INIT_F1 + + vsub.f64 d24 , d24 , d24 + +.endm + + + +.macro KERNEL_F1X1 + + fldmiad XO! , { d4 } + fldmiad AO1 , { d8 } + vmla.f64 d24 , d4 , d8 + add AO1, AO1, LDA + +.endm + +.macro SAVE_F1 + + fldmiad YO, { d16 } + vmla.f64 d16, d0, d24 + fstmiad YO!, { d16 } + +.endm + +/*********************************************************************************************/ + + +.macro INIT_S8 + + vsub.f64 d24 , d24 , d24 + vmov.f64 d25 , d24 + vmov.f64 d26 , d24 + vmov.f64 d27 , d24 + vmov.f64 d28 , d24 + vmov.f64 d29 , d24 + vmov.f64 d30 , d24 + vmov.f64 d31 , d24 + +.endm + +.macro KERNEL_S8X8 + + KERNEL_S8X1 + KERNEL_S8X1 + KERNEL_S8X1 + KERNEL_S8X1 + + KERNEL_S8X1 + KERNEL_S8X1 + KERNEL_S8X1 + KERNEL_S8X1 + +.endm + + +.macro KERNEL_S8X1 + + pld [ AO2 , #A_PRE ] + pld [ AO2 , #A_PRE+32 ] + fldmiad XO , { d4 } + fldmiad AO1 , { d8 - d15 } + + vmla.f64 d24 , d4 , d8 + vmla.f64 d25 , d4 , d9 + vmla.f64 d26 , d4 , d10 + vmla.f64 d27 , d4 , d11 + vmla.f64 d28 , d4 , d12 + vmla.f64 d29 , d4 , d13 + vmla.f64 d30 , d4 , d14 + vmla.f64 d31 , d4 , d15 + add AO1, AO1, LDA + add AO2, AO2, LDA + add XO, XO, INC_X + +.endm + +.macro SAVE_S8 + + fldmiad YO, { d16 } + vmla.f64 d16, d0, d24 + fstmiad YO, { d16 } + add YO, YO, INC_Y + + fldmiad YO, { d17 } + vmla.f64 d17, d0, d25 + fstmiad YO, { d17 } + add YO, YO, INC_Y + + fldmiad YO, { d18 } + vmla.f64 
d18, d0, d26 + fstmiad YO, { d18 } + add YO, YO, INC_Y + + fldmiad YO, { d19 } + vmla.f64 d19, d0, d27 + fstmiad YO, { d19 } + add YO, YO, INC_Y + + fldmiad YO, { d20 } + vmla.f64 d20, d0, d28 + fstmiad YO, { d20 } + add YO, YO, INC_Y + + fldmiad YO, { d21 } + vmla.f64 d21, d0, d29 + fstmiad YO, { d21 } + add YO, YO, INC_Y + + fldmiad YO, { d22 } + vmla.f64 d22, d0, d30 + fstmiad YO, { d22 } + add YO, YO, INC_Y + + fldmiad YO, { d23 } + vmla.f64 d23, d0, d31 + fstmiad YO, { d23 } + add YO, YO, INC_Y + +.endm + + +.macro INIT_S1 + + vsub.f64 d24 , d24 , d24 + +.endm + + + +.macro KERNEL_S1X1 + + fldmiad XO , { d4 } + fldmiad AO1 , { d8 } + vmla.f64 d24 , d4 , d8 + add AO1, AO1, LDA + add XO, XO, INC_X + +.endm + +.macro SAVE_S1 + + fldmiad YO, { d16 } + vmla.f64 d16, d0, d24 + fstmiad YO, { d16 } + add YO, YO, INC_Y + +.endm + + + +#else /************************* SINGLE PRECISION *****************************************/ + +.macro INIT_F8 + + pld [ YO , #Y_PRE ] + + vsub.f32 s24 , s24 , s24 + vmov.f32 s25 , s24 + vmov.f32 s26 , s24 + vmov.f32 s27 , s24 + vmov.f32 s28 , s24 + vmov.f32 s29 , s24 + vmov.f32 s30 , s24 + vmov.f32 s31 , s24 + +.endm + +.macro KERNEL_F8X8 + + pld [ XO , #X_PRE ] + KERNEL_F8X1 + KERNEL_F8X1 + KERNEL_F8X1 + KERNEL_F8X1 + + KERNEL_F8X1 + KERNEL_F8X1 + KERNEL_F8X1 + KERNEL_F8X1 + +.endm + + +.macro KERNEL_F8X1 + + pld [ AO2 , #A_PRE ] + fldmias XO! 
, { s4 } + fldmias AO1 , { s8 - s15 } + + vmla.f32 s24 , s4 , s8 + vmla.f32 s25 , s4 , s9 + vmla.f32 s26 , s4 , s10 + vmla.f32 s27 , s4 , s11 + vmla.f32 s28 , s4 , s12 + vmla.f32 s29 , s4 , s13 + vmla.f32 s30 , s4 , s14 + vmla.f32 s31 , s4 , s15 + add AO1, AO1, LDA + add AO2, AO2, LDA + +.endm + +.macro SAVE_F8 + + fldmias YO, { s16 - s23 } + + vmla.f32 s16, s0, s24 + vmla.f32 s17, s0, s25 + vmla.f32 s18, s0, s26 + vmla.f32 s19, s0, s27 + vmla.f32 s20, s0, s28 + vmla.f32 s21, s0, s29 + vmla.f32 s22, s0, s30 + vmla.f32 s23, s0, s31 + + fstmias YO!, { s16 - s23 } + +.endm + + +.macro INIT_F1 + + vsub.f32 s24 , s24 , s24 + +.endm + + + +.macro KERNEL_F1X1 + + fldmias XO! , { s4 } + fldmias AO1 , { s8 } + vmla.f32 s24 , s4 , s8 + add AO1, AO1, LDA + +.endm + +.macro SAVE_F1 + + fldmias YO, { s16 } + vmla.f32 s16, s0, s24 + fstmias YO!, { s16 } + +.endm + +/*********************************************************************************************/ + + +.macro INIT_S8 + + vsub.f32 s24 , s24 , s24 + vmov.f32 s25 , s24 + vmov.f32 s26 , s24 + vmov.f32 s27 , s24 + vmov.f32 s28 , s24 + vmov.f32 s29 , s24 + vmov.f32 s30 , s24 + vmov.f32 s31 , s24 + +.endm + +.macro KERNEL_S8X8 + + KERNEL_S8X1 + KERNEL_S8X1 + KERNEL_S8X1 + KERNEL_S8X1 + + KERNEL_S8X1 + KERNEL_S8X1 + KERNEL_S8X1 + KERNEL_S8X1 + +.endm + + +.macro KERNEL_S8X1 + + pld [ AO2 , #A_PRE ] + fldmias XO , { s4 } + fldmias AO1 , { s8 - s15 } + + vmla.f32 s24 , s4 , s8 + vmla.f32 s25 , s4 , s9 + vmla.f32 s26 , s4 , s10 + vmla.f32 s27 , s4 , s11 + vmla.f32 s28 , s4 , s12 + vmla.f32 s29 , s4 , s13 + vmla.f32 s30 , s4 , s14 + vmla.f32 s31 , s4 , s15 + add AO1, AO1, LDA + add AO2, AO2, LDA + add XO, XO, INC_X + +.endm + +.macro SAVE_S8 + + fldmias YO, { s16 } + vmla.f32 s16, s0, s24 + fstmias YO, { s16 } + add YO, YO, INC_Y + + fldmias YO, { s17 } + vmla.f32 s17, s0, s25 + fstmias YO, { s17 } + add YO, YO, INC_Y + + fldmias YO, { s18 } + vmla.f32 s18, s0, s26 + fstmias YO, { s18 } + add YO, YO, INC_Y + + fldmias YO, { s19 
} + vmla.f32 s19, s0, s27 + fstmias YO, { s19 } + add YO, YO, INC_Y + + fldmias YO, { s20 } + vmla.f32 s20, s0, s28 + fstmias YO, { s20 } + add YO, YO, INC_Y + + fldmias YO, { s21 } + vmla.f32 s21, s0, s29 + fstmias YO, { s21 } + add YO, YO, INC_Y + + fldmias YO, { s22 } + vmla.f32 s22, s0, s30 + fstmias YO, { s22 } + add YO, YO, INC_Y + + fldmias YO, { s23 } + vmla.f32 s23, s0, s31 + fstmias YO, { s23 } + add YO, YO, INC_Y + +.endm + + +.macro INIT_S1 + + vsub.f32 s24 , s24 , s24 + +.endm + + + +.macro KERNEL_S1X1 + + fldmias XO , { s4 } + fldmias AO1 , { s8 } + vmla.f32 s24 , s4 , s8 + add AO1, AO1, LDA + add XO, XO, INC_X + +.endm + +.macro SAVE_S1 + + fldmias YO, { s16 } + vmla.f32 s16, s0, s24 + fstmias YO, { s16 } + add YO, YO, INC_Y + +.endm + + + + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4 - r9 , fp} + add fp, sp, #28 + sub sp, sp, #STACKSIZE // reserve stack + + sub r12, fp, #192 + +#if defined(DOUBLE) + vstm r12, { d8 - d15 } // store floating point registers +#else + vstm r12, { s8 - s31 } // store floating point registers +#endif + + cmp OLD_M, #0 + ble gemvn_kernel_L999 + + cmp N, #0 + ble gemvn_kernel_L999 + + str OLD_A, A + str OLD_M, M + + ldr INC_X , OLD_INC_X + ldr INC_Y , OLD_INC_Y + + cmp INC_X, #0 + beq gemvn_kernel_L999 + + cmp INC_Y, #0 + beq gemvn_kernel_L999 + + ldr LDA, OLD_LDA + + +#if defined(DOUBLE) + lsl LDA, LDA, #3 // LDA * SIZE +#else + lsl LDA, LDA, #2 // LDA * SIZE +#endif + + cmp INC_X, #1 + bne gemvn_kernel_S8_BEGIN + + cmp INC_Y, #1 + bne gemvn_kernel_S8_BEGIN + + +gemvn_kernel_F8_BEGIN: + + ldr YO , Y + + ldr I, M + asrs I, I, #3 // I = M / 8 + ble gemvn_kernel_F1_BEGIN + +gemvn_kernel_F8X8: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO1, #8*SIZE + str r3 , A + + ldr XO , X + + INIT_F8 + + asrs J, N, #3 // J = N / 8 
+ ble gemvn_kernel_F8X1 + + +gemvn_kernel_F8X8_10: + + KERNEL_F8X8 + + subs J, J, #1 + bne gemvn_kernel_F8X8_10 + + +gemvn_kernel_F8X1: + + ands J, N , #7 + ble gemvn_kernel_F8_END + +gemvn_kernel_F8X1_10: + + KERNEL_F8X1 + + subs J, J, #1 + bne gemvn_kernel_F8X1_10 + + +gemvn_kernel_F8_END: + + SAVE_F8 + + subs I , I , #1 + bne gemvn_kernel_F8X8 + + +gemvn_kernel_F1_BEGIN: + + ldr I, M + ands I, I , #7 + ble gemvn_kernel_L999 + +gemvn_kernel_F1X1: + + ldr AO1, A + add r3, AO1, #SIZE + str r3, A + + ldr XO , X + + INIT_F1 + + mov J, N + + +gemvn_kernel_F1X1_10: + + KERNEL_F1X1 + + subs J, J, #1 + bne gemvn_kernel_F1X1_10 + + +gemvn_kernel_F1_END: + + SAVE_F1 + + subs I , I , #1 + bne gemvn_kernel_F1X1 + + b gemvn_kernel_L999 + + + +/*************************************************************************************************************/ + +gemvn_kernel_S8_BEGIN: + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE + lsl INC_Y, INC_Y, #2 // INC_Y * SIZE +#endif + + ldr YO , Y + + ldr I, M + asrs I, I, #3 // I = M / 8 + ble gemvn_kernel_S1_BEGIN + +gemvn_kernel_S8X8: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO1, #8*SIZE + str r3 , A + + ldr XO , X + + INIT_S8 + + asrs J, N, #3 // J = N / 8 + ble gemvn_kernel_S8X1 + + +gemvn_kernel_S8X8_10: + + KERNEL_S8X8 + + subs J, J, #1 + bne gemvn_kernel_S8X8_10 + + +gemvn_kernel_S8X1: + + ands J, N , #7 + ble gemvn_kernel_S8_END + +gemvn_kernel_S8X1_10: + + KERNEL_S8X1 + + subs J, J, #1 + bne gemvn_kernel_S8X1_10 + + +gemvn_kernel_S8_END: + + SAVE_S8 + + subs I , I , #1 + bne gemvn_kernel_S8X8 + + +gemvn_kernel_S1_BEGIN: + + ldr I, M + ands I, I , #7 + ble gemvn_kernel_L999 + +gemvn_kernel_S1X1: + + ldr AO1, A + add r3, AO1, #SIZE + str r3, A + + ldr XO , X + + INIT_S1 + + mov J, N + + +gemvn_kernel_S1X1_10: + + KERNEL_S1X1 + + subs J, J, #1 + bne gemvn_kernel_S1X1_10 + + +gemvn_kernel_S1_END: + + SAVE_S1 + + subs I , I , #1 + 
bne gemvn_kernel_S1X1 + + +/*************************************************************************************************************/ + +gemvn_kernel_L999: + + sub r3, fp, #192 + +#if defined(DOUBLE) + vldm r3, { d8 - d15 } // restore floating point registers +#else + vldm r3, { s8 - s31 } // restore floating point registers +#endif + + mov r0, #0 // set return value + + sub sp, fp, #28 + pop {r4 -r9 ,fp} + bx lr + + EPILOGUE + From 410afda9b44e1920bf3f7b6bdcd852da0f4c4da9 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 21 Nov 2013 20:18:51 +0100 Subject: [PATCH 40/81] added cpu detection and target ARMV6, used in raspberry pi --- Makefile.arm | 9 +- cpuid_arm.c | 262 ++++++++++++++++++++++++++++++++++++++++ getarch.c | 39 +++++- kernel/arm/KERNEL.ARMV6 | 134 ++++++++++++++++++++ param.h | 40 ++++++ 5 files changed, 477 insertions(+), 7 deletions(-) create mode 100644 cpuid_arm.c create mode 100644 kernel/arm/KERNEL.ARMV6 diff --git a/Makefile.arm b/Makefile.arm index 6cdeb2f75..8502d5286 100644 --- a/Makefile.arm +++ b/Makefile.arm @@ -1,7 +1,12 @@ ifeq ($(CORE), ARMV7) -CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard +CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a +FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a +endif + +ifeq ($(CORE), ARMV6) +CCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6 +FCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6 endif diff --git a/cpuid_arm.c b/cpuid_arm.c new file mode 100644 index 000000000..efd1369b4 --- /dev/null +++ b/cpuid_arm.c @@ -0,0 +1,262 @@ +/************************************************************************** + Copyright (c) 2013, The OpenBLAS Project + All rights reserved. + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + 1. 
Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/
+
+#include <stdio.h>
+#include <string.h>
+
+#define CPU_UNKNOWN	0
+#define CPU_ARMV6	1
+#define CPU_ARMV7	2
+#define CPU_CORTEXA15	3
+
+/* Core names returned by get_corename(), indexed by the CPU_* ids above. */
+static char *cpuname[] = {
+	"UNKNOWN",
+	"ARMV6",
+	"ARMV7",
+	"CORTEXA15"
+};
+
+
+/*
+ * Look up one field of /proc/cpuinfo.
+ *
+ * Copies the text that follows the ':' on the first line starting with
+ * `key` into `out`, with leading blanks and the trailing newline removed.
+ * `out` is always NUL-terminated and truncated to `outsize` bytes.
+ *
+ * Returns 1 when such a line was found, 0 otherwise.  0 is also returned
+ * when /proc/cpuinfo cannot be opened or when not building for Linux.
+ */
+static int arm_cpuinfo_field(const char *key, char *out, size_t outsize)
+{
+	int found = 0;
+
+	if (out == NULL || outsize == 0) return(0);
+	out[0] = '\0';
+
+	/* `linux` is not predefined under strict -std=c99/c11, so test both. */
+#if defined(linux) || defined(__linux__)
+	if (key != NULL)
+	{
+		FILE *infile;
+		char buffer[2048];
+		size_t keylen = strlen(key);
+
+		infile = fopen("/proc/cpuinfo", "r");
+		if (infile == NULL) return(0);	/* no /proc: nothing detected */
+
+		while (fgets(buffer, sizeof(buffer), infile))
+		{
+			char *value;
+
+			if (strncmp(key, buffer, keylen)) continue;
+
+			value = strchr(buffer, ':');
+			if (value == NULL) continue;	/* malformed line */
+
+			value++;
+			value += strspn(value, " \t");
+			value[strcspn(value, "\r\n")] = '\0';
+
+			snprintf(out, outsize, "%s", value);
+			found = 1;
+			break;
+		}
+
+		fclose(infile);
+	}
+#else
+	(void) key;
+#endif
+
+	return(found);
+}
+
+
+/*
+ * Return 1 if the blank-separated word list `list` contains exactly the
+ * word `word`, 0 otherwise.  Every word is examined, including the first
+ * and the last one.  Neither string is modified.
+ */
+static int arm_has_word(const char *list, const char *word)
+{
+	size_t wordlen = strlen(word);
+
+	while (*list != '\0')
+	{
+		size_t len;
+
+		list += strspn(list, " \t");
+		len = strcspn(list, " \t");
+		if (len > 0 && len == wordlen && !strncmp(list, word, len)) return(1);
+		list += len;
+	}
+
+	return(0);
+}
+
+
+/*
+ * Return 1 if `search` (e.g. "vfp", "vfpv3", "neon") is listed on the
+ * "Features" line of /proc/cpuinfo, 0 otherwise.
+ */
+int get_feature(char *search)
+{
+	char features[2048];
+
+	if (search == NULL) return(0);
+	if (!arm_cpuinfo_field("Features", features, sizeof(features))) return(0);
+
+	return(arm_has_word(features, search));
+}
+
+
+/*
+ * Classify the host CPU as one of the CPU_* ids.
+ *
+ * The architecture level is taken from the "model name" line of
+ * /proc/cpuinfo and the result is then narrowed by the available VFP
+ * unit: an ARMv7 core that only reports plain "vfp" is handled as ARMV6.
+ */
+int detect(void)
+{
+	char model[512];
+
+	/* NOTE(review): older ARM kernels are believed to report this text
+	   under "Processor" instead of "model name" -- confirm on target. */
+	if (!arm_cpuinfo_field("model name", model, sizeof(model)) &&
+	    !arm_cpuinfo_field("Processor", model, sizeof(model)))
+		return CPU_UNKNOWN;
+
+	if (strstr(model, "ARMv7"))
+	{
+		if (get_feature("vfpv4"))
+			return CPU_ARMV7;
+
+		if (get_feature("vfpv3"))
+			return CPU_ARMV7;
+
+		if (get_feature("vfp"))
+			return CPU_ARMV6;
+	}
+
+	if (strstr(model, "ARMv6"))
+	{
+		if (get_feature("vfp"))
+			return CPU_ARMV6;
+	}
+
+	return CPU_UNKNOWN;
+}
+
+/* Return the core name matching the detected CPU. */
+char *get_corename(void)
+{
+	return cpuname[detect()];
+}
+
+/* Print the architecture family name. */
+void get_architecture(void)
+{
+	printf("ARM");
+}
+
+/* Print the sub-architecture name of the detected CPU. */
+void get_subarchitecture(void)
+{
+	int d = detect();
+	switch (d)
+	{
+
+		case CPU_ARMV7:
+			printf("ARMV7");
+			break;
+
+		case CPU_ARMV6:
+			printf("ARMV6");
+			break;
+
+		default:
+			printf("UNKNOWN");
+			break;
+	}
+}
+
+/* Print the kernel sub-directory name used for this architecture. */
+void get_subdirname(void)
+{
+	printf("arm");
+}
+
+/*
+ * Print the cache/TLB parameters shared by the ARMV6 and ARMV7 targets.
+ * NOTE(review): L2_SIZE 512488 looks like a typo for 524288 (512 KiB);
+ * it is kept because the FORCE_ARMV7 ARCHCONFIG in getarch.c uses the
+ * same value.
+ */
+static void arm_print_cache_config(void)
+{
+	printf("#define L1_DATA_SIZE 65536\n");
+	printf("#define L1_DATA_LINESIZE 32\n");
+	printf("#define L2_SIZE 512488\n");
+	printf("#define L2_LINESIZE 32\n");
+	printf("#define DTB_DEFAULT_ENTRIES 64\n");
+	printf("#define DTB_SIZE 4096\n");
+	printf("#define L2_ASSOCIATIVE 4\n");
+}
+
+/*
+ * Print the "#define ..." configuration lines for the detected CPU.
+ * Nothing is printed when the CPU is not recognised.
+ */
+void get_cpuconfig(void)
+{
+
+	int d = detect();
+	switch (d)
+	{
+
+		case CPU_ARMV7:
+			printf("#define ARMV7\n");
+			printf("#define HAVE_VFP\n");
+			printf("#define HAVE_VFPV3\n");
+			if ( get_feature("neon")) printf("#define HAVE_NEON\n");
+			if ( get_feature("vfpv4")) printf("#define HAVE_VFPV4\n");
+			arm_print_cache_config();
+			break;
+
+		case CPU_ARMV6:
+			printf("#define ARMV6\n");
+			printf("#define HAVE_VFP\n");
+			arm_print_cache_config();
+			break;
+
+		default:
+			break;
+
+	}
+}
+
+
+/*
+ * Print the library name suffix for the detected CPU.
+ * Nothing is printed when the CPU is not recognised.
+ */
+void get_libname(void)
+{
+
+	int d = detect();
+	switch (d)
+	{
+
+		case CPU_ARMV7:
+			printf("armv7\n");
+			break;
+
+		case CPU_ARMV6:
+			printf("armv6\n");
+			break;
+
+		default:
+			break;
+
+	}
+}
+
+
+/*
+ * Print one "HAVE_xxx=1" line for every floating point / SIMD feature
+ * found on the "Features" line of /proc/cpuinfo.  Lines are printed in
+ * the fixed order vfp, vfpv3, vfpv4, neon.
+ */
+void get_features(void)
+{
+	static const struct {
+		const char *flag;	/* word on the Features line */
+		const char *line;	/* text printed when it is present */
+	} known[] = {
+		{ "vfp",   "HAVE_VFP=1\n"   },
+		{ "vfpv3", "HAVE_VFPV3=1\n" },
+		{ "vfpv4", "HAVE_VFPV4=1\n" },
+		{ "neon",  "HAVE_NEON=1\n"  }
+	};
+	char features[2048];
+	size_t i;
+
+	if (!arm_cpuinfo_field("Features", features, sizeof(features))) return;
+
+	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++)
+	{
+		if (arm_has_word(features, known[i].flag)) printf("%s", known[i].line);
+	}
+
+	return;
+}
+
+
diff --git a/getarch.c b/getarch.c
index 3264a76f6..4407e3d9b 100644
--- a/getarch.c
+++ b/getarch.c
@@ -687,23 +687,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ARCHCONFIG "-DARMV7 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ + "-DHAVE_VFPV3 -DHAVE_VFP" #define LIBNAME "armv7" #define CORENAME "ARMV7" #else #endif +#ifdef FORCE_ARMV6 +#define FORCE +#define ARCHITECTURE "ARM" +#define SUBARCHITECTURE "ARMV6" +#define SUBDIRNAME "arm" +#define ARCHCONFIG "-DARMV6 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ + "-DHAVE_VFP" +#define LIBNAME "armv6" +#define CORENAME "ARMV6" +#else +#endif + + #ifndef FORCE #if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \ - defined(__PPC__) || defined(PPC) || defined(_POWER) || defined(__POWERPC__) + defined(__PPC__) || defined(PPC) || defined(_POWER) || defined(__POWERPC__) + #ifndef POWER #define POWER #endif #define OPENBLAS_SUPPORTED #endif + #if defined(__i386__) || (__x86_64__) #include "cpuid_x86.c" #define OPENBLAS_SUPPORTED @@ -734,12 +753,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OPENBLAS_SUPPORTED #endif +#ifdef __arm__ +#include "cpuid_arm.c" +#define OPENBLAS_SUPPORTED +#endif + + #ifndef OPENBLAS_SUPPORTED #error "This arch/CPU is not supported by OpenBLAS." 
#endif -#else - #endif static int get_num_cores(void) { @@ -788,7 +811,7 @@ int main(int argc, char *argv[]){ #ifdef FORCE printf("CORE=%s\n", CORENAME); #else -#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) +#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__) printf("CORE=%s\n", get_corename()); #endif #endif @@ -803,6 +826,12 @@ int main(int argc, char *argv[]){ printf("NUM_CORES=%d\n", get_num_cores()); +#if defined(__arm__) && !defined(FORCE) + get_features(); +#endif + + + #if defined(__i386__) || defined(__x86_64__) #ifndef FORCE get_sse(); diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 new file mode 100644 index 000000000..e379347c0 --- /dev/null +++ b/kernel/arm/KERNEL.ARMV6 @@ -0,0 +1,134 @@ +SAMAXKERNEL = amax.c +DAMAXKERNEL = amax.c +CAMAXKERNEL = zamax.c +ZAMAXKERNEL = zamax.c + +SAMINKERNEL = amin.c +DAMINKERNEL = amin.c +CAMINKERNEL = zamin.c +ZAMINKERNEL = zamin.c + +SMAXKERNEL = max.c +DMAXKERNEL = max.c + +SMINKERNEL = min.c +DMINKERNEL = min.c + +ISAMAXKERNEL = iamax.c +IDAMAXKERNEL = iamax.c +ICAMAXKERNEL = izamax.c +IZAMAXKERNEL = izamax.c + +ISAMINKERNEL = iamin.c +IDAMINKERNEL = iamin.c +ICAMINKERNEL = izamin.c +IZAMINKERNEL = izamin.c + +ISMAXKERNEL = imax.c +IDMAXKERNEL = imax.c + +ISMINKERNEL = imin.c +IDMINKERNEL = imin.c + +SASUMKERNEL = asum.c +DASUMKERNEL = asum.c +CASUMKERNEL = zasum.c +ZASUMKERNEL = zasum.c + +SAXPYKERNEL = axpy.c +DAXPYKERNEL = axpy.c +CAXPYKERNEL = zaxpy.c +ZAXPYKERNEL = zaxpy.c + +SCOPYKERNEL = copy.c +DCOPYKERNEL = copy.c +CCOPYKERNEL = zcopy.c +ZCOPYKERNEL = zcopy.c + +SDOTKERNEL = dot.c +DDOTKERNEL = dot.c +CDOTKERNEL = zdot.c +ZDOTKERNEL = zdot.c + +SNRM2KERNEL = nrm2.c +DNRM2KERNEL = nrm2.c +CNRM2KERNEL = znrm2.c +ZNRM2KERNEL = znrm2.c + +SROTKERNEL = rot.c +DROTKERNEL = rot.c +CROTKERNEL = zrot.c +ZROTKERNEL = zrot.c + +SSCALKERNEL = scal.c +DSCALKERNEL = scal.c +CSCALKERNEL = zscal.c +ZSCALKERNEL = 
zscal.c + +SSWAPKERNEL = swap.c +DSWAPKERNEL = swap.c +CSWAPKERNEL = zswap.c +ZSWAPKERNEL = zswap.c + +SGEMVNKERNEL = gemv_n.c +DGEMVNKERNEL = gemv_n.c +CGEMVNKERNEL = zgemv_n.c +ZGEMVNKERNEL = zgemv_n.c + +SGEMVTKERNEL = gemv_t.c +DGEMVTKERNEL = gemv_t.c +CGEMVTKERNEL = zgemv_t.c +ZGEMVTKERNEL = zgemv_t.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = 
../generic/trsm_kernel_RT.c + + + + diff --git a/param.h b/param.h index ab0ed91b7..7bb27f3ab 100644 --- a/param.h +++ b/param.h @@ -1831,6 +1831,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#define SYMV_P 16 +#endif + + +#if defined(ARMV6) +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 2 + +#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_N 2 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + + #define SYMV_P 16 #endif From f1be3a168a4d54d9e6cd40a19587aa0bca9913c1 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 21 Nov 2013 20:48:57 +0100 Subject: [PATCH 41/81] renamed some BLAS kernels, which are compatible to ARMV6 --- kernel/arm/KERNEL.ARMV6 | 8 ++++---- kernel/arm/KERNEL.ARMV7 | 8 ++++---- kernel/arm/{ccopy_vfpv3.S => ccopy_vfp.S} | 0 kernel/arm/{dcopy_vfpv3.S => dcopy_vfp.S} | 0 kernel/arm/{scopy_vfpv3.S => scopy_vfp.S} | 0 kernel/arm/{zcopy_vfpv3.S => zcopy_vfp.S} | 0 6 files changed, 8 insertions(+), 8 deletions(-) rename kernel/arm/{ccopy_vfpv3.S => ccopy_vfp.S} (100%) rename kernel/arm/{dcopy_vfpv3.S => dcopy_vfp.S} (100%) rename kernel/arm/{scopy_vfpv3.S => scopy_vfp.S} (100%) rename kernel/arm/{zcopy_vfpv3.S => zcopy_vfp.S} (100%) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index e379347c0..f745724c1 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -40,10 +40,10 
@@ DAXPYKERNEL = axpy.c CAXPYKERNEL = zaxpy.c ZAXPYKERNEL = zaxpy.c -SCOPYKERNEL = copy.c -DCOPYKERNEL = copy.c -CCOPYKERNEL = zcopy.c -ZCOPYKERNEL = zcopy.c +SCOPYKERNEL = scopy_vfp.S +DCOPYKERNEL = dcopy_vfp.S +CCOPYKERNEL = ccopy_vfp.S +ZCOPYKERNEL = zcopy_vfp.S SDOTKERNEL = dot.c DDOTKERNEL = dot.c diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 8d6acbe98..48f0a72a9 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -45,10 +45,10 @@ DAXPYKERNEL = ../arm/axpy.c CAXPYKERNEL = ../arm/zaxpy.c ZAXPYKERNEL = ../arm/zaxpy.c -SCOPYKERNEL = scopy_vfpv3.S -DCOPYKERNEL = dcopy_vfpv3.S -CCOPYKERNEL = ccopy_vfpv3.S -ZCOPYKERNEL = zcopy_vfpv3.S +SCOPYKERNEL = scopy_vfp.S +DCOPYKERNEL = dcopy_vfp.S +CCOPYKERNEL = ccopy_vfp.S +ZCOPYKERNEL = zcopy_vfp.S SDOTKERNEL = sdot_vfpv3.S DDOTKERNEL = ddot_vfpv3.S diff --git a/kernel/arm/ccopy_vfpv3.S b/kernel/arm/ccopy_vfp.S similarity index 100% rename from kernel/arm/ccopy_vfpv3.S rename to kernel/arm/ccopy_vfp.S diff --git a/kernel/arm/dcopy_vfpv3.S b/kernel/arm/dcopy_vfp.S similarity index 100% rename from kernel/arm/dcopy_vfpv3.S rename to kernel/arm/dcopy_vfp.S diff --git a/kernel/arm/scopy_vfpv3.S b/kernel/arm/scopy_vfp.S similarity index 100% rename from kernel/arm/scopy_vfpv3.S rename to kernel/arm/scopy_vfp.S diff --git a/kernel/arm/zcopy_vfpv3.S b/kernel/arm/zcopy_vfp.S similarity index 100% rename from kernel/arm/zcopy_vfpv3.S rename to kernel/arm/zcopy_vfp.S From 29a005c63572779a7a7ef81d6909f6177f8da5b0 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 21 Nov 2013 21:12:33 +0100 Subject: [PATCH 42/81] renamed iamax assembler kernel --- kernel/arm/KERNEL.ARMV6 | 48 +++++++++++------------ kernel/arm/KERNEL.ARMV7 | 48 +++++++++++------------ kernel/arm/{iamax_vfpv3.S => iamax_vfp.S} | 0 3 files changed, 48 insertions(+), 48 deletions(-) rename kernel/arm/{iamax_vfpv3.S => iamax_vfp.S} (100%) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index f745724c1..a7bddbd82 
100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -1,34 +1,34 @@ -SAMAXKERNEL = amax.c -DAMAXKERNEL = amax.c -CAMAXKERNEL = zamax.c -ZAMAXKERNEL = zamax.c +SAMAXKERNEL = iamax_vfp.S +DAMAXKERNEL = iamax_vfp.S +CAMAXKERNEL = iamax_vfp.S +ZAMAXKERNEL = iamax_vfp.S -SAMINKERNEL = amin.c -DAMINKERNEL = amin.c -CAMINKERNEL = zamin.c -ZAMINKERNEL = zamin.c +SAMINKERNEL = iamax_vfp.S +DAMINKERNEL = iamax_vfp.S +CAMINKERNEL = iamax_vfp.S +ZAMINKERNEL = iamax_vfp.S -SMAXKERNEL = max.c -DMAXKERNEL = max.c +SMAXKERNEL = iamax_vfp.S +DMAXKERNEL = iamax_vfp.S -SMINKERNEL = min.c -DMINKERNEL = min.c +SMINKERNEL = iamax_vfp.S +DMINKERNEL = iamax_vfp.S -ISAMAXKERNEL = iamax.c -IDAMAXKERNEL = iamax.c -ICAMAXKERNEL = izamax.c -IZAMAXKERNEL = izamax.c +ISAMAXKERNEL = iamax_vfp.S +IDAMAXKERNEL = iamax_vfp.S +ICAMAXKERNEL = iamax_vfp.S +IZAMAXKERNEL = iamax_vfp.S -ISAMINKERNEL = iamin.c -IDAMINKERNEL = iamin.c -ICAMINKERNEL = izamin.c -IZAMINKERNEL = izamin.c +ISAMINKERNEL = iamax_vfp.S +IDAMINKERNEL = iamax_vfp.S +ICAMINKERNEL = iamax_vfp.S +IZAMINKERNEL = iamax_vfp.S -ISMAXKERNEL = imax.c -IDMAXKERNEL = imax.c +ISMAXKERNEL = iamax_vfp.S +IDMAXKERNEL = iamax_vfp.S -ISMINKERNEL = imin.c -IDMINKERNEL = imin.c +ISMINKERNEL = iamax_vfp.S +IDMINKERNEL = iamax_vfp.S SASUMKERNEL = asum.c DASUMKERNEL = asum.c diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 48f0a72a9..dafc4c32a 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -1,34 +1,34 @@ -SAMAXKERNEL = iamax_vfpv3.S -DAMAXKERNEL = iamax_vfpv3.S -CAMAXKERNEL = iamax_vfpv3.S -ZAMAXKERNEL = iamax_vfpv3.S +SAMAXKERNEL = iamax_vfp.S +DAMAXKERNEL = iamax_vfp.S +CAMAXKERNEL = iamax_vfp.S +ZAMAXKERNEL = iamax_vfp.S -SAMINKERNEL = iamax_vfpv3.S -DAMINKERNEL = iamax_vfpv3.S -CAMINKERNEL = iamax_vfpv3.S -ZAMINKERNEL = iamax_vfpv3.S +SAMINKERNEL = iamax_vfp.S +DAMINKERNEL = iamax_vfp.S +CAMINKERNEL = iamax_vfp.S +ZAMINKERNEL = iamax_vfp.S -SMAXKERNEL = iamax_vfpv3.S -DMAXKERNEL = 
iamax_vfpv3.S +SMAXKERNEL = iamax_vfp.S +DMAXKERNEL = iamax_vfp.S -SMINKERNEL = iamax_vfpv3.S -DMINKERNEL = iamax_vfpv3.S +SMINKERNEL = iamax_vfp.S +DMINKERNEL = iamax_vfp.S -ISAMAXKERNEL = iamax_vfpv3.S -IDAMAXKERNEL = iamax_vfpv3.S -ICAMAXKERNEL = iamax_vfpv3.S -IZAMAXKERNEL = iamax_vfpv3.S +ISAMAXKERNEL = iamax_vfp.S +IDAMAXKERNEL = iamax_vfp.S +ICAMAXKERNEL = iamax_vfp.S +IZAMAXKERNEL = iamax_vfp.S -ISAMINKERNEL = iamax_vfpv3.S -IDAMINKERNEL = iamax_vfpv3.S -ICAMINKERNEL = iamax_vfpv3.S -IZAMINKERNEL = iamax_vfpv3.S +ISAMINKERNEL = iamax_vfp.S +IDAMINKERNEL = iamax_vfp.S +ICAMINKERNEL = iamax_vfp.S +IZAMINKERNEL = iamax_vfp.S -ISMAXKERNEL = iamax_vfpv3.S -IDMAXKERNEL = iamax_vfpv3.S +ISMAXKERNEL = iamax_vfp.S +IDMAXKERNEL = iamax_vfp.S -ISMINKERNEL = iamax_vfpv3.S -IDMINKERNEL = iamax_vfpv3.S +ISMINKERNEL = iamax_vfp.S +IDMINKERNEL = iamax_vfp.S SSWAPKERNEL = swap_vfpv3.S DSWAPKERNEL = swap_vfpv3.S diff --git a/kernel/arm/iamax_vfpv3.S b/kernel/arm/iamax_vfp.S similarity index 100% rename from kernel/arm/iamax_vfpv3.S rename to kernel/arm/iamax_vfp.S From 5bf7cf8d670593487ea0915ca02be2815d35ba47 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 21 Nov 2013 22:03:36 +0100 Subject: [PATCH 43/81] renamed scal_vfpv3.S to scal_vfp.S --- kernel/arm/KERNEL.ARMV6 | 8 ++++---- kernel/arm/KERNEL.ARMV7 | 8 ++++---- kernel/arm/{scal_vfpv3.S => scal_vfp.S} | 0 3 files changed, 8 insertions(+), 8 deletions(-) rename kernel/arm/{scal_vfpv3.S => scal_vfp.S} (100%) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index a7bddbd82..8f3533544 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -60,10 +60,10 @@ DROTKERNEL = rot.c CROTKERNEL = zrot.c ZROTKERNEL = zrot.c -SSCALKERNEL = scal.c -DSCALKERNEL = scal.c -CSCALKERNEL = zscal.c -ZSCALKERNEL = zscal.c +SSCALKERNEL = scal_vfp.S +DSCALKERNEL = scal_vfp.S +CSCALKERNEL = scal_vfp.S +ZSCALKERNEL = scal_vfp.S SSWAPKERNEL = swap.c DSWAPKERNEL = swap.c diff --git a/kernel/arm/KERNEL.ARMV7 
b/kernel/arm/KERNEL.ARMV7 index dafc4c32a..a36cbc1ee 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -65,10 +65,10 @@ DROTKERNEL = rot_vfpv3.S CROTKERNEL = rot_vfpv3.S ZROTKERNEL = rot_vfpv3.S -SSCALKERNEL = scal_vfpv3.S -DSCALKERNEL = scal_vfpv3.S -CSCALKERNEL = scal_vfpv3.S -ZSCALKERNEL = scal_vfpv3.S +SSCALKERNEL = scal_vfp.S +DSCALKERNEL = scal_vfp.S +CSCALKERNEL = scal_vfp.S +ZSCALKERNEL = scal_vfp.S SGEMVNKERNEL = gemv_n_vfpv3.S DGEMVNKERNEL = gemv_n_vfpv3.S diff --git a/kernel/arm/scal_vfpv3.S b/kernel/arm/scal_vfp.S similarity index 100% rename from kernel/arm/scal_vfpv3.S rename to kernel/arm/scal_vfp.S From 8565afb3c2ac144307f3e67b91889d3d8aa5f1a6 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 21 Nov 2013 22:26:27 +0100 Subject: [PATCH 44/81] renamed asum_vfpv3.S to asum_vfp.S --- kernel/arm/KERNEL.ARMV6 | 8 ++++---- kernel/arm/KERNEL.ARMV7 | 8 ++++---- kernel/arm/{asum_vfpv3.S => asum_vfp.S} | 0 3 files changed, 8 insertions(+), 8 deletions(-) rename kernel/arm/{asum_vfpv3.S => asum_vfp.S} (100%) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 8f3533544..b35eeb28a 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -30,10 +30,10 @@ IDMAXKERNEL = iamax_vfp.S ISMINKERNEL = iamax_vfp.S IDMINKERNEL = iamax_vfp.S -SASUMKERNEL = asum.c -DASUMKERNEL = asum.c -CASUMKERNEL = zasum.c -ZASUMKERNEL = zasum.c +SASUMKERNEL = asum_vfp.S +DASUMKERNEL = asum_vfp.S +CASUMKERNEL = asum_vfp.S +ZASUMKERNEL = asum_vfp.S SAXPYKERNEL = axpy.c DAXPYKERNEL = axpy.c diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index a36cbc1ee..4a652d2db 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -35,10 +35,10 @@ DSWAPKERNEL = swap_vfpv3.S CSWAPKERNEL = swap_vfpv3.S ZSWAPKERNEL = swap_vfpv3.S -SASUMKERNEL = asum_vfpv3.S -DASUMKERNEL = asum_vfpv3.S -CASUMKERNEL = asum_vfpv3.S -ZASUMKERNEL = asum_vfpv3.S +SASUMKERNEL = asum_vfp.S +DASUMKERNEL = asum_vfp.S +CASUMKERNEL = asum_vfp.S 
+ZASUMKERNEL = asum_vfp.S SAXPYKERNEL = ../arm/axpy.c DAXPYKERNEL = ../arm/axpy.c diff --git a/kernel/arm/asum_vfpv3.S b/kernel/arm/asum_vfp.S similarity index 100% rename from kernel/arm/asum_vfpv3.S rename to kernel/arm/asum_vfp.S From cd93cae5a7a71edf3d5e0ac13e05bdf77a4e5a60 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 21 Nov 2013 22:49:28 +0100 Subject: [PATCH 45/81] renamed rot_vfpv3.S to rot_vfp.S --- kernel/arm/KERNEL.ARMV6 | 8 +- kernel/arm/KERNEL.ARMV7 | 8 +- kernel/arm/rot_vfp.S | 584 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 592 insertions(+), 8 deletions(-) create mode 100644 kernel/arm/rot_vfp.S diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index b35eeb28a..fd294a937 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -55,10 +55,10 @@ DNRM2KERNEL = nrm2.c CNRM2KERNEL = znrm2.c ZNRM2KERNEL = znrm2.c -SROTKERNEL = rot.c -DROTKERNEL = rot.c -CROTKERNEL = zrot.c -ZROTKERNEL = zrot.c +SROTKERNEL = rot_vfp.S +DROTKERNEL = rot_vfp.S +CROTKERNEL = rot_vfp.S +ZROTKERNEL = rot_vfp.S SSCALKERNEL = scal_vfp.S DSCALKERNEL = scal_vfp.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 4a652d2db..b03c709e0 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -60,10 +60,10 @@ DNRM2KERNEL = nrm2_vfpv3.S CNRM2KERNEL = nrm2_vfpv3.S ZNRM2KERNEL = nrm2_vfpv3.S -SROTKERNEL = rot_vfpv3.S -DROTKERNEL = rot_vfpv3.S -CROTKERNEL = rot_vfpv3.S -ZROTKERNEL = rot_vfpv3.S +SROTKERNEL = rot_vfp.S +DROTKERNEL = rot_vfp.S +CROTKERNEL = rot_vfp.S +ZROTKERNEL = rot_vfp.S SSCALKERNEL = scal_vfp.S DSCALKERNEL = scal_vfp.S diff --git a/kernel/arm/rot_vfp.S b/kernel/arm/rot_vfp.S new file mode 100644 index 000000000..663ecdf81 --- /dev/null +++ b/kernel/arm/rot_vfp.S @@ -0,0 +1,584 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/15 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_INC_Y [fp, #0 ] + + +#define N r0 +#define X r1 +#define INC_X r2 +#define Y r3 +#define INC_Y r4 + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +/*****************************************************************************************/ + + + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + fldmiad X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + fldmiad X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + fldmiad X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + fldmiad X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + +.endm + + +.macro KERNEL_F1 + + fldmiad X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + 
fstmiad X, { d2 } + fstmiad Y, { d3 } + + add X, X, INC_X + add Y, Y, INC_Y + +.endm + +#else + +.macro KERNEL_F4 + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + +.endm + + +.macro KERNEL_F1 + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + +.endm + +.macro KERNEL_S1 + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X, { s2 } + fstmias Y, { s3 } + + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +#endif + +#else + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + vmul.f64 d2 , d0, d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + vmul.f64 d2 , d0, d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + pld [ X, #X_PRE ] + 
pld [ Y, #X_PRE ] + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + vmul.f64 d2 , d0, d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + vmul.f64 d2 , d0, d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + +.endm + + +.macro KERNEL_F1 + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + vmul.f64 d2 , d0, d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + fnmacd d3 , d1, d4 + vstr d2 , [ X, #0 ] + vstr d3 , [ Y, #0 ] + vmul.f64 d2 , d0, d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + vstr d2 , [ X, #8 ] + vstr d3 , [ Y, #8 ] + + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +#else + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + fldmias X, { s4 - s5 } + fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + fldmias X, { s4 - s5 } + fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, 
s7 + fnmacs s3 , s1, s5 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + fldmias X, { s4 - s5 } + fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + fldmias X, { s4 - s5 } + fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + fstmias X!, { s2 } + fstmias Y!, { s3 } + +.endm + + +.macro KERNEL_F1 + + fldmias X, { s4 - s5 } + fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + +.endm + +.macro KERNEL_S1 + + fldmias X, { s4 - s5 } + fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + vstr s2 , [ X, #0 ] + vstr s3 , [ Y, #0 ] + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + vstr s2 , [ X, #4 ] + vstr s3 , [ Y, #4 ] + + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4 , fp} + add fp, sp, #8 + + ldr INC_Y , OLD_INC_Y + + + cmp N, #0 + ble rot_kernel_L999 + + cmp INC_X, #0 + beq rot_kernel_L999 + + cmp INC_Y, #0 + beq rot_kernel_L999 + + cmp INC_X, #1 + bne rot_kernel_S_BEGIN + + cmp INC_Y, #1 + bne rot_kernel_S_BEGIN + + 
+rot_kernel_F_BEGIN: + + + asrs I, N, #2 // I = N / 4 + ble rot_kernel_F1 + + .align 5 + +rot_kernel_F4: + +#if !defined(COMPLEX) && !defined(DOUBLE) + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] +#endif + + KERNEL_F4 + + subs I, I, #1 + ble rot_kernel_F1 + + KERNEL_F4 + + subs I, I, #1 + bne rot_kernel_F4 + +rot_kernel_F1: + + ands I, N, #3 + ble rot_kernel_L999 + +rot_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne rot_kernel_F10 + + b rot_kernel_L999 + +rot_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE + lsl INC_Y, INC_Y, #2 // INC_Y * SIZE +#endif + +#endif + + + asrs I, N, #2 // I = N / 4 + ble rot_kernel_S1 + + .align 5 + +rot_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne rot_kernel_S4 + +rot_kernel_S1: + + ands I, N, #3 + ble rot_kernel_L999 + +rot_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne rot_kernel_S10 + + +rot_kernel_L999: + + mov r0, #0 // set return value + + sub sp, fp, #8 + pop {r4,fp} + bx lr + + EPILOGUE + From 440db4cddae9007eed93276d135e2d73fa959793 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 21 Nov 2013 22:52:24 +0100 Subject: [PATCH 46/81] delete rot_vfpv3.S --- kernel/arm/rot_vfpv3.S | 584 ----------------------------------------- 1 file changed, 584 deletions(-) delete mode 100644 kernel/arm/rot_vfpv3.S diff --git a/kernel/arm/rot_vfpv3.S b/kernel/arm/rot_vfpv3.S deleted file mode 100644 index 663ecdf81..000000000 --- a/kernel/arm/rot_vfpv3.S +++ /dev/null @@ -1,584 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. 
-Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -/************************************************************************************** -* 2013/11/15 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -**************************************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define STACKSIZE 256 - -#define OLD_INC_Y [fp, #0 ] - - -#define N r0 -#define X r1 -#define INC_X r2 -#define Y r3 -#define INC_Y r4 - -#define I r12 - -#define X_PRE 512 - -/************************************************************************************** -* Macro definitions -**************************************************************************************/ - -/*****************************************************************************************/ - - - -#if !defined(COMPLEX) - -#if defined(DOUBLE) - -.macro KERNEL_F4 - - pld [ X, #X_PRE ] - pld [ Y, #X_PRE ] - - fldmiad X, { d4 } - fldmiad Y, { d5 } - vmul.f64 d2 , d0, d4 - fmacd d2 , d1, d5 - vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } - - fldmiad X, { d4 } - fldmiad Y, { d5 } - vmul.f64 d2 , d0, d4 - fmacd d2 , d1, d5 - vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } - - fldmiad X, { d4 } - fldmiad Y, { d5 } - vmul.f64 d2 , d0, d4 - fmacd d2 , d1, d5 - vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } - - fldmiad X, { d4 } - fldmiad Y, { d5 } - vmul.f64 d2 , d0, d4 - fmacd d2 , d1, d5 - vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } - -.endm - - -.macro KERNEL_F1 - - fldmiad X, { d4 } - fldmiad Y, { d5 } - vmul.f64 d2 , d0, d4 - fmacd d2 , d1, d5 - vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } - -.endm - -.macro KERNEL_S1 - - fldmiad X, { d4 } - fldmiad Y, { d5 } - vmul.f64 d2 , d0, d4 - fmacd d2 , d1, d5 - vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 - 
fstmiad X, { d2 } - fstmiad Y, { d3 } - - add X, X, INC_X - add Y, Y, INC_Y - -.endm - -#else - -.macro KERNEL_F4 - - fldmias X, { s4 } - fldmias Y, { s5 } - vmul.f32 s2 , s0, s4 - fmacs s2 , s1, s5 - vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } - - fldmias X, { s4 } - fldmias Y, { s5 } - vmul.f32 s2 , s0, s4 - fmacs s2 , s1, s5 - vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } - - fldmias X, { s4 } - fldmias Y, { s5 } - vmul.f32 s2 , s0, s4 - fmacs s2 , s1, s5 - vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } - - fldmias X, { s4 } - fldmias Y, { s5 } - vmul.f32 s2 , s0, s4 - fmacs s2 , s1, s5 - vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } - -.endm - - -.macro KERNEL_F1 - - fldmias X, { s4 } - fldmias Y, { s5 } - vmul.f32 s2 , s0, s4 - fmacs s2 , s1, s5 - vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } - -.endm - -.macro KERNEL_S1 - - fldmias X, { s4 } - fldmias Y, { s5 } - vmul.f32 s2 , s0, s4 - fmacs s2 , s1, s5 - vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 - fstmias X, { s2 } - fstmias Y, { s3 } - - add X, X, INC_X - add Y, Y, INC_Y - -.endm - - - -#endif - -#else - -#if defined(DOUBLE) - -.macro KERNEL_F4 - - pld [ X, #X_PRE ] - pld [ Y, #X_PRE ] - - fldmiad X, { d4 - d5 } - fldmiad Y, { d6 - d7 } - vmul.f64 d2 , d0, d4 - fmacd d2 , d1, d6 - vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } - vmul.f64 d2 , d0, d5 - fmacd d2 , d1, d7 - vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } - - fldmiad X, { d4 - d5 } - fldmiad Y, { d6 - d7 } - vmul.f64 d2 , d0, d4 - fmacd d2 , d1, d6 - vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } - vmul.f64 d2 , d0, d5 - fmacd d2 , d1, d7 - vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } - - pld [ X, #X_PRE ] - 
pld [ Y, #X_PRE ] - - fldmiad X, { d4 - d5 } - fldmiad Y, { d6 - d7 } - vmul.f64 d2 , d0, d4 - fmacd d2 , d1, d6 - vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } - vmul.f64 d2 , d0, d5 - fmacd d2 , d1, d7 - vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } - - fldmiad X, { d4 - d5 } - fldmiad Y, { d6 - d7 } - vmul.f64 d2 , d0, d4 - fmacd d2 , d1, d6 - vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } - vmul.f64 d2 , d0, d5 - fmacd d2 , d1, d7 - vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } - -.endm - - -.macro KERNEL_F1 - - fldmiad X, { d4 - d5 } - fldmiad Y, { d6 - d7 } - vmul.f64 d2 , d0, d4 - fmacd d2 , d1, d6 - vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } - vmul.f64 d2 , d0, d5 - fmacd d2 , d1, d7 - vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } - - -.endm - -.macro KERNEL_S1 - - fldmiad X, { d4 - d5 } - fldmiad Y, { d6 - d7 } - vmul.f64 d2 , d0, d4 - fmacd d2 , d1, d6 - vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 - vstr d2 , [ X, #0 ] - vstr d3 , [ Y, #0 ] - vmul.f64 d2 , d0, d5 - fmacd d2 , d1, d7 - vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 - vstr d2 , [ X, #8 ] - vstr d3 , [ Y, #8 ] - - add X, X, INC_X - add Y, Y, INC_Y - -.endm - - - -#else - -.macro KERNEL_F4 - - pld [ X, #X_PRE ] - pld [ Y, #X_PRE ] - - fldmias X, { s4 - s5 } - fldmias Y, { s6 - s7 } - vmul.f32 s2 , s0, s4 - fmacs s2 , s1, s6 - vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } - vmul.f32 s2 , s0, s5 - fmacs s2 , s1, s7 - vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 - fstmias X!, { s2 } - fstmias Y!, { s3 } - - fldmias X, { s4 - s5 } - fldmias Y, { s6 - s7 } - vmul.f32 s2 , s0, s4 - fmacs s2 , s1, s6 - vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } - vmul.f32 s2 , s0, s5 - fmacs s2 , s1, s7 - vmul.f32 s3 , s0, 
s7 - fnmacs s3 , s1, s5 - fstmias X!, { s2 } - fstmias Y!, { s3 } - - pld [ X, #X_PRE ] - pld [ Y, #X_PRE ] - - fldmias X, { s4 - s5 } - fldmias Y, { s6 - s7 } - vmul.f32 s2 , s0, s4 - fmacs s2 , s1, s6 - vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } - vmul.f32 s2 , s0, s5 - fmacs s2 , s1, s7 - vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 - fstmias X!, { s2 } - fstmias Y!, { s3 } - - fldmias X, { s4 - s5 } - fldmias Y, { s6 - s7 } - vmul.f32 s2 , s0, s4 - fmacs s2 , s1, s6 - vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } - vmul.f32 s2 , s0, s5 - fmacs s2 , s1, s7 - vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 - fstmias X!, { s2 } - fstmias Y!, { s3 } - -.endm - - -.macro KERNEL_F1 - - fldmias X, { s4 - s5 } - fldmias Y, { s6 - s7 } - vmul.f32 s2 , s0, s4 - fmacs s2 , s1, s6 - vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } - vmul.f32 s2 , s0, s5 - fmacs s2 , s1, s7 - vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 - fstmias X!, { s2 } - fstmias Y!, { s3 } - - -.endm - -.macro KERNEL_S1 - - fldmias X, { s4 - s5 } - fldmias Y, { s6 - s7 } - vmul.f32 s2 , s0, s4 - fmacs s2 , s1, s6 - vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 - vstr s2 , [ X, #0 ] - vstr s3 , [ Y, #0 ] - vmul.f32 s2 , s0, s5 - fmacs s2 , s1, s7 - vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 - vstr s2 , [ X, #4 ] - vstr s3 , [ Y, #4 ] - - add X, X, INC_X - add Y, Y, INC_Y - -.endm - - -#endif - -#endif - -/************************************************************************************** -* End of macro definitions -**************************************************************************************/ - - PROLOGUE - - .align 5 - push {r4 , fp} - add fp, sp, #8 - - ldr INC_Y , OLD_INC_Y - - - cmp N, #0 - ble rot_kernel_L999 - - cmp INC_X, #0 - beq rot_kernel_L999 - - cmp INC_Y, #0 - beq rot_kernel_L999 - - cmp INC_X, #1 - bne rot_kernel_S_BEGIN - - cmp INC_Y, #1 - bne rot_kernel_S_BEGIN - - 
-rot_kernel_F_BEGIN: - - - asrs I, N, #2 // I = N / 4 - ble rot_kernel_F1 - - .align 5 - -rot_kernel_F4: - -#if !defined(COMPLEX) && !defined(DOUBLE) - pld [ X, #X_PRE ] - pld [ Y, #X_PRE ] -#endif - - KERNEL_F4 - - subs I, I, #1 - ble rot_kernel_F1 - - KERNEL_F4 - - subs I, I, #1 - bne rot_kernel_F4 - -rot_kernel_F1: - - ands I, N, #3 - ble rot_kernel_L999 - -rot_kernel_F10: - - KERNEL_F1 - - subs I, I, #1 - bne rot_kernel_F10 - - b rot_kernel_L999 - -rot_kernel_S_BEGIN: - -#if defined(COMPLEX) - -#if defined(DOUBLE) - lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 - lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 -#else - lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 - lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 -#endif - -#else - -#if defined(DOUBLE) - lsl INC_X, INC_X, #3 // INC_X * SIZE - lsl INC_Y, INC_Y, #3 // INC_Y * SIZE -#else - lsl INC_X, INC_X, #2 // INC_X * SIZE - lsl INC_Y, INC_Y, #2 // INC_Y * SIZE -#endif - -#endif - - - asrs I, N, #2 // I = N / 4 - ble rot_kernel_S1 - - .align 5 - -rot_kernel_S4: - - KERNEL_S1 - KERNEL_S1 - KERNEL_S1 - KERNEL_S1 - - subs I, I, #1 - bne rot_kernel_S4 - -rot_kernel_S1: - - ands I, N, #3 - ble rot_kernel_L999 - -rot_kernel_S10: - - KERNEL_S1 - - subs I, I, #1 - bne rot_kernel_S10 - - -rot_kernel_L999: - - mov r0, #0 // set return value - - sub sp, fp, #8 - pop {r4,fp} - bx lr - - EPILOGUE - From 9adf87495ef165b6019d94f2381bf7e077aa5a26 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 21 Nov 2013 23:07:51 +0100 Subject: [PATCH 47/81] renamed some dot kernels --- kernel/arm/KERNEL.ARMV6 | 7 ++++--- kernel/arm/KERNEL.ARMV7 | 6 +++--- kernel/arm/{cdot_vfpv3.S => cdot_vfp.S} | 0 kernel/arm/{ddot_vfpv3.S => ddot_vfp.S} | 0 kernel/arm/{zdot_vfpv3.S => zdot_vfp.S} | 0 5 files changed, 7 insertions(+), 6 deletions(-) rename kernel/arm/{cdot_vfpv3.S => cdot_vfp.S} (100%) rename kernel/arm/{ddot_vfpv3.S => ddot_vfp.S} (100%) rename kernel/arm/{zdot_vfpv3.S => zdot_vfp.S} (100%) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 
fd294a937..9e3483e3c 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -46,9 +46,10 @@ CCOPYKERNEL = ccopy_vfp.S ZCOPYKERNEL = zcopy_vfp.S SDOTKERNEL = dot.c -DDOTKERNEL = dot.c -CDOTKERNEL = zdot.c -ZDOTKERNEL = zdot.c +DDOTKERNEL = ddot_vfp.S +CDOTKERNEL = cdot_vfp.S +ZDOTKERNEL = zdot_vfp.S + SNRM2KERNEL = nrm2.c DNRM2KERNEL = nrm2.c diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index b03c709e0..4a409e777 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -51,9 +51,9 @@ CCOPYKERNEL = ccopy_vfp.S ZCOPYKERNEL = zcopy_vfp.S SDOTKERNEL = sdot_vfpv3.S -DDOTKERNEL = ddot_vfpv3.S -CDOTKERNEL = cdot_vfpv3.S -ZDOTKERNEL = zdot_vfpv3.S +DDOTKERNEL = ddot_vfp.S +CDOTKERNEL = cdot_vfp.S +ZDOTKERNEL = zdot_vfp.S SNRM2KERNEL = nrm2_vfpv3.S DNRM2KERNEL = nrm2_vfpv3.S diff --git a/kernel/arm/cdot_vfpv3.S b/kernel/arm/cdot_vfp.S similarity index 100% rename from kernel/arm/cdot_vfpv3.S rename to kernel/arm/cdot_vfp.S diff --git a/kernel/arm/ddot_vfpv3.S b/kernel/arm/ddot_vfp.S similarity index 100% rename from kernel/arm/ddot_vfpv3.S rename to kernel/arm/ddot_vfp.S diff --git a/kernel/arm/zdot_vfpv3.S b/kernel/arm/zdot_vfp.S similarity index 100% rename from kernel/arm/zdot_vfpv3.S rename to kernel/arm/zdot_vfp.S From 19cd5c64a20da8e33c95d94d690f3afa1c254ee1 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 21 Nov 2013 23:19:32 +0100 Subject: [PATCH 48/81] renamed swap_vfpv3.S to swap_vfp.S --- kernel/arm/KERNEL.ARMV6 | 8 ++++---- kernel/arm/KERNEL.ARMV7 | 8 ++++---- kernel/arm/{swap_vfpv3.S => swap_vfp.S} | 0 3 files changed, 8 insertions(+), 8 deletions(-) rename kernel/arm/{swap_vfpv3.S => swap_vfp.S} (100%) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 9e3483e3c..59604031c 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -66,10 +66,10 @@ DSCALKERNEL = scal_vfp.S CSCALKERNEL = scal_vfp.S ZSCALKERNEL = scal_vfp.S -SSWAPKERNEL = swap.c -DSWAPKERNEL = swap.c -CSWAPKERNEL = 
zswap.c -ZSWAPKERNEL = zswap.c +SSWAPKERNEL = swap_vfp.S +DSWAPKERNEL = swap_vfp.S +CSWAPKERNEL = swap_vfp.S +ZSWAPKERNEL = swap_vfp.S SGEMVNKERNEL = gemv_n.c DGEMVNKERNEL = gemv_n.c diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 4a409e777..bbd7e89dd 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -30,10 +30,10 @@ IDMAXKERNEL = iamax_vfp.S ISMINKERNEL = iamax_vfp.S IDMINKERNEL = iamax_vfp.S -SSWAPKERNEL = swap_vfpv3.S -DSWAPKERNEL = swap_vfpv3.S -CSWAPKERNEL = swap_vfpv3.S -ZSWAPKERNEL = swap_vfpv3.S +SSWAPKERNEL = swap_vfp.S +DSWAPKERNEL = swap_vfp.S +CSWAPKERNEL = swap_vfp.S +ZSWAPKERNEL = swap_vfp.S SASUMKERNEL = asum_vfp.S DASUMKERNEL = asum_vfp.S diff --git a/kernel/arm/swap_vfpv3.S b/kernel/arm/swap_vfp.S similarity index 100% rename from kernel/arm/swap_vfpv3.S rename to kernel/arm/swap_vfp.S From dbae93110baff205499853733e8fd47d1643c765 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 21 Nov 2013 23:34:51 +0100 Subject: [PATCH 49/81] added sdot_vfp.S --- kernel/arm/KERNEL.ARMV6 | 2 +- kernel/arm/sdot_vfp.S | 347 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 348 insertions(+), 1 deletion(-) create mode 100644 kernel/arm/sdot_vfp.S diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 59604031c..75f7a9491 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -45,7 +45,7 @@ DCOPYKERNEL = dcopy_vfp.S CCOPYKERNEL = ccopy_vfp.S ZCOPYKERNEL = zcopy_vfp.S -SDOTKERNEL = dot.c +SDOTKERNEL = sdot_vfp.S DDOTKERNEL = ddot_vfp.S CDOTKERNEL = cdot_vfp.S ZDOTKERNEL = zdot_vfp.S diff --git a/kernel/arm/sdot_vfp.S b/kernel/arm/sdot_vfp.S new file mode 100644 index 000000000..2d1909201 --- /dev/null +++ b/kernel/arm/sdot_vfp.S @@ -0,0 +1,347 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/11 Saar
+* 	BLASTEST 		: OK
+* 	CTEST			: OK (no test for dsdot)
+* 	TEST			: OK (no test for dsdot)
+*
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define	N	r0
+#define	X	r1
+#define	INC_X	r2
+#define	OLD_Y	r3
+
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* registers
+*******************************************************/
+
+#define OLD_INC_Y	[fp, #4 ]
+
+#define I	r5
+#define Y	r6
+#define INC_Y	r7
+
+#define X_PRE	512
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+#if	defined(DSDOT)
+
+.macro KERNEL_F4			// DSDOT, unit stride: 4 elements, d0 += (double)x * (double)y
+
+	fldmias	X!, { s14 }
+	fldmias	Y!, { s15 }
+	vcvt.f64.f32	d4, s14		// widen x BEFORE multiplying so the product is not rounded to float
+	vcvt.f64.f32	d5, s15		// widen y (d5 = s10/s11, saved and restored by the prologue/epilogue)
+	vmla.f64	d0 , d4, d5	// d0 += x * y in double precision
+
+	fldmias	X!, { s14 }
+	fldmias	Y!, { s15 }
+	vcvt.f64.f32	d4, s14
+	vcvt.f64.f32	d5, s15
+	vmla.f64	d0 , d4, d5
+
+	fldmias	X!, { s14 }
+	fldmias	Y!, { s15 }
+	vcvt.f64.f32	d4, s14
+	vcvt.f64.f32	d5, s15
+	vmla.f64	d0 , d4, d5
+
+	fldmias	X!, { s14 }
+	fldmias	Y!, { s15 }
+	vcvt.f64.f32	d4, s14
+	vcvt.f64.f32	d5, s15
+	vmla.f64	d0 , d4, d5
+
+.endm
+
+.macro KERNEL_F1			// DSDOT, unit stride: 1 element
+
+	fldmias	X!, { s14 }
+	fldmias	Y!, { s15 }
+	vcvt.f64.f32	d4, s14		// widen both operands first, then multiply-accumulate in double
+	vcvt.f64.f32	d5, s15
+	vmla.f64	d0 , d4, d5
+
+.endm
+
+
+.macro KERNEL_S4			// DSDOT, strided: 4 elements, pointers advanced by INC_X / INC_Y
+
+	nop
+
+	fldmias	X, { s14 }
+	fldmias	Y, { s15 }
+	vcvt.f64.f32	d4, s14		// widen both operands first, then multiply-accumulate in double
+	vcvt.f64.f32	d5, s15
+	vmla.f64	d0 , d4, d5
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+	fldmias	X, { s14 }
+	fldmias	Y, { s15 }
+	vcvt.f64.f32	d4, s14
+	vcvt.f64.f32	d5, s15
+	vmla.f64	d0 , d4, d5
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+	fldmias	X, { s14 }
+	fldmias	Y, { s15 }
+	vcvt.f64.f32	d4, s14
+	vcvt.f64.f32	d5, s15
+	vmla.f64	d0 , d4, d5
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+	fldmias	X, { s14 }
+	fldmias	Y, { s15 }
+	vcvt.f64.f32	d4, s14
+	vcvt.f64.f32	d5, s15
+	vmla.f64	d0 , d4, d5
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+.endm
+
+
+.macro KERNEL_S1			// DSDOT, strided: 1 element
+
+	fldmias	X, { s14 }
+	fldmias	Y, { s15 }
+	vcvt.f64.f32	d4, s14		// widen both operands first, then multiply-accumulate in double
+	vcvt.f64.f32	d5, s15
+	vmla.f64	d0 , d4, d5
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+.endm
+
+
+
+#else
+
+.macro KERNEL_F4			// SDOT, unit stride: 4 elements, partial sums alternate between s0 and s1
+
+	fldmias	X!, { s8 - s9 }
+	fldmias	Y!, { s4 - s5}
+	fmacs	s0 , s4, s8
+	fldmias	X!, { s10 - s11 }
+	fmacs	s1 , s5, s9
+	fldmias	Y!, { s6 - s7 }
+	fmacs	s0 , s6, s10
+	fmacs	s1 , s7, s11
+
+.endm
+
+.macro KERNEL_F1			// SDOT, unit stride: 1 element into s0
+
+	fldmias	X!, { s4 }
+	fldmias	Y!, { s8 }
+	fmacs	s0 , s4, s8
+
+.endm
+
+
+.macro KERNEL_S4			// SDOT, strided: 4 elements, partial sums alternate between s0 and s1
+
+	nop
+	fldmias	X, { s4 }
+	fldmias	Y, { s8 }
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+	fmacs	s0 , s4, s8
+
+	fldmias	X, { s5 }
+	fldmias	Y, { s9 }
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+	fmacs	s1 , s5, s9
+
+	fldmias	X, { s6 }
+	fldmias	Y, { s10 }
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+	fmacs	s0 , s6, s10
+
+	fldmias	X, { s7 }
+	fldmias	Y, { s11 }
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+	fmacs	s1 , s7, s11
+
+.endm
+
+
+.macro KERNEL_S1			// SDOT, strided: 1 element into s0
+
+	fldmias	X, { s4 }
+	fldmias	Y, { s8 }
+	add	X, X, INC_X
+	fmacs	s0 , s4, s8
+	add	Y, Y, INC_Y
+
+.endm
+
+#endif
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+	PROLOGUE
+
+	.align 5
+
+	push	{r4 - r9, fp}
+	add	fp, sp, #24
+	sub	sp, sp, #STACKSIZE				// reserve stack
+
+	sub	r4, fp, #128
+	vstm	r4, { s8 - s15 } 				// store floating point registers
+
+	mov	Y, OLD_Y
+	ldr	INC_Y, OLD_INC_Y
+
+#if	defined(DSDOT)
+	mov	r4, #0						// r4 is free again once the vstm above is done
+	vmov	d0 , r4, r4					// acc0 = +0.0; x - x would yield NaN if d0 held NaN/Inf on entry
+	vmov	d1 , r4, r4					// acc1 = +0.0
+
+#else
+	mov	r4, #0						// r4 is free again once the vstm above is done
+	vmov	s0 , r4						// acc0 = +0.0f; x - x would yield NaN if s0 held NaN/Inf on entry
+	vmov	s1 , r4						// acc1 = +0.0f
+
+#endif
+
+	cmp	N, #0
+	ble
sdot_kernel_L999 + + cmp INC_X, #0 + beq sdot_kernel_L999 + + cmp INC_Y, #0 + beq sdot_kernel_L999 + + cmp INC_X, #1 + bne sdot_kernel_S_BEGIN + + cmp INC_Y, #1 + bne sdot_kernel_S_BEGIN + +sdot_kernel_F_BEGIN: + + asrs I, N, #2 // I = N / 4 + ble sdot_kernel_F1 + +sdot_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne sdot_kernel_F4 + +sdot_kernel_F1: + + ands I, N, #3 + ble sdot_kernel_L999 + +sdot_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne sdot_kernel_F10 + + b sdot_kernel_L999 + +sdot_kernel_S_BEGIN: + + lsl INC_X, INC_X, #2 // INC_X * SIZE + lsl INC_Y, INC_Y, #2 // INC_Y * SIZE + + asrs I, N, #2 // I = N / 4 + ble sdot_kernel_S1 + +sdot_kernel_S4: + + KERNEL_S4 + + subs I, I, #1 + bne sdot_kernel_S4 + +sdot_kernel_S1: + + ands I, N, #3 + ble sdot_kernel_L999 + +sdot_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne sdot_kernel_S10 + + + + + + +sdot_kernel_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + +#if defined(DSDOT) + + vadd.f64 d0 , d0, d1 // set return value + +#else + + vadd.f32 s0 , s0, s1 // set return value + +#endif + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + From 9f0a3a35b3d1f4dce709bbc1aca348abe26880f0 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 21 Nov 2013 23:42:54 +0100 Subject: [PATCH 50/81] removed obsolete file sdot_vfpv3.S --- kernel/arm/sdot_vfpv3.S | 347 ---------------------------------------- 1 file changed, 347 deletions(-) delete mode 100644 kernel/arm/sdot_vfpv3.S diff --git a/kernel/arm/sdot_vfpv3.S b/kernel/arm/sdot_vfpv3.S deleted file mode 100644 index 794e07317..000000000 --- a/kernel/arm/sdot_vfpv3.S +++ /dev/null @@ -1,347 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. 
Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -/************************************************************************************** -* 2013/11/11 Saar -* BLASTEST : OK -* CTEST : OK (no test for dsdot) -* TEST : OK (no test for dsdot) -* -**************************************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define STACKSIZE 256 - -#define N r0 -#define X r1 -#define INC_X r2 -#define OLD_Y r3 - - -/****************************************************** -* [fp, #-128] - [fp, #-64] is reserved -* for store and restore of floating point -* registers -*******************************************************/ - -#define OLD_INC_Y [fp, #4 ] - -#define I r5 -#define Y r6 -#define INC_Y r7 - -#define X_PRE 512 - -/************************************************************************************** -* Macro definitions -**************************************************************************************/ - -#if defined(DSDOT) - -.macro KERNEL_F4 - - fldmias X!, { s14 } - fldmias Y!, { s15 } - vmul.f32 s15, s14, s15 - vcvt.f64.f32 d16, s15 - vadd.f64 d0 , d0, d16 - - fldmias X!, { s14 } - fldmias Y!, { s15 } - vmul.f32 s15, s14, s15 - vcvt.f64.f32 d16, s15 - vadd.f64 d0 , d0, d16 - - fldmias X!, { s14 } - fldmias Y!, { s15 } - vmul.f32 s15, s14, s15 - vcvt.f64.f32 d16, s15 - vadd.f64 d0 , d0, d16 - - fldmias X!, { s14 } - fldmias Y!, { s15 } - vmul.f32 s15, s14, s15 - vcvt.f64.f32 d16, s15 - vadd.f64 d0 , d0, d16 - -.endm - -.macro KERNEL_F1 - - fldmias X!, { s14 } - fldmias Y!, { s15 } - vmul.f32 s15, s14, s15 - vcvt.f64.f32 d16, s15 - vadd.f64 d0 , d0, d16 - -.endm - - -.macro KERNEL_S4 - - nop - - fldmias X, { s14 } - fldmias Y, { s15 } - vmul.f32 s15, s14, s15 - vcvt.f64.f32 d16, s15 - vadd.f64 d0 , d0, d16 - add X, X, INC_X - add Y, Y, INC_Y - - fldmias X, { s14 } - fldmias Y, { s15 } - vmul.f32 s15, s14, s15 - vcvt.f64.f32 d16, s15 - vadd.f64 d0 , d0, d16 - add X, 
X, INC_X - add Y, Y, INC_Y - - fldmias X, { s14 } - fldmias Y, { s15 } - vmul.f32 s15, s14, s15 - vcvt.f64.f32 d16, s15 - vadd.f64 d0 , d0, d16 - add X, X, INC_X - add Y, Y, INC_Y - - fldmias X, { s14 } - fldmias Y, { s15 } - vmul.f32 s15, s14, s15 - vcvt.f64.f32 d16, s15 - vadd.f64 d0 , d0, d16 - add X, X, INC_X - add Y, Y, INC_Y - -.endm - - -.macro KERNEL_S1 - - fldmias X, { s14 } - fldmias Y, { s15 } - vmul.f32 s15, s14, s15 - vcvt.f64.f32 d16, s15 - vadd.f64 d0 , d0, d16 - add X, X, INC_X - add Y, Y, INC_Y - -.endm - - - -#else - -.macro KERNEL_F4 - - fldmias X!, { s8 - s9 } - fldmias Y!, { s4 - s5} - fmacs s0 , s4, s8 - fldmias X!, { s10 - s11 } - fmacs s1 , s5, s9 - fldmias Y!, { s6 - s7 } - fmacs s0 , s6, s10 - fmacs s1 , s7, s11 - -.endm - -.macro KERNEL_F1 - - fldmias X!, { s4 } - fldmias Y!, { s8 } - fmacs s0 , s4, s8 - -.endm - - -.macro KERNEL_S4 - - nop - fldmias X, { s4 } - fldmias Y, { s8 } - add X, X, INC_X - add Y, Y, INC_Y - fmacs s0 , s4, s8 - - fldmias X, { s5 } - fldmias Y, { s9 } - add X, X, INC_X - add Y, Y, INC_Y - fmacs s1 , s5, s9 - - fldmias X, { s6 } - fldmias Y, { s10 } - add X, X, INC_X - add Y, Y, INC_Y - fmacs s0 , s6, s10 - - fldmias X, { s7 } - fldmias Y, { s11 } - add X, X, INC_X - add Y, Y, INC_Y - fmacs s1 , s7, s11 - -.endm - - -.macro KERNEL_S1 - - fldmias X, { s4 } - fldmias Y, { s8 } - add X, X, INC_X - fmacs s0 , s4, s8 - add Y, Y, INC_Y - -.endm - -#endif - -/************************************************************************************** -* End of macro definitions -**************************************************************************************/ - - PROLOGUE - - .align 5 - - push {r4 - r9, fp} - add fp, sp, #24 - sub sp, sp, #STACKSIZE // reserve stack - - sub r4, fp, #128 - vstm r4, { s8 - s15 } // store floating point registers - - mov Y, OLD_Y - ldr INC_Y, OLD_INC_Y - -#if defined(DSDOT) - - vsub.f64 d0 , d0 , d0 - vsub.f64 d1 , d1 , d1 - -#else - - vsub.f32 s0 , s0 , s0 - vsub.f32 s1 , s1 , s1 - -#endif 
- - cmp N, #0 - ble sdot_kernel_L999 - - cmp INC_X, #0 - beq sdot_kernel_L999 - - cmp INC_Y, #0 - beq sdot_kernel_L999 - - cmp INC_X, #1 - bne sdot_kernel_S_BEGIN - - cmp INC_Y, #1 - bne sdot_kernel_S_BEGIN - -sdot_kernel_F_BEGIN: - - asrs I, N, #2 // I = N / 4 - ble sdot_kernel_F1 - -sdot_kernel_F4: - - KERNEL_F4 - - subs I, I, #1 - bne sdot_kernel_F4 - -sdot_kernel_F1: - - ands I, N, #3 - ble sdot_kernel_L999 - -sdot_kernel_F10: - - KERNEL_F1 - - subs I, I, #1 - bne sdot_kernel_F10 - - b sdot_kernel_L999 - -sdot_kernel_S_BEGIN: - - lsl INC_X, INC_X, #2 // INC_X * SIZE - lsl INC_Y, INC_Y, #2 // INC_Y * SIZE - - asrs I, N, #2 // I = N / 4 - ble sdot_kernel_S1 - -sdot_kernel_S4: - - KERNEL_S4 - - subs I, I, #1 - bne sdot_kernel_S4 - -sdot_kernel_S1: - - ands I, N, #3 - ble sdot_kernel_L999 - -sdot_kernel_S10: - - KERNEL_S1 - - subs I, I, #1 - bne sdot_kernel_S10 - - - - - - -sdot_kernel_L999: - - sub r3, fp, #128 - vldm r3, { s8 - s15} // restore floating point registers - -#if defined(DSDOT) - - vadd.f64 d0 , d0, d1 // set return value - -#else - - vadd.f32 s0 , s0, s1 // set return value - -#endif - sub sp, fp, #24 - pop {r4 - r9, fp} - bx lr - - EPILOGUE - From 7f210587f0e3cf3d25a346d94041b68fb39be40f Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 22 Nov 2013 00:20:25 +0100 Subject: [PATCH 51/81] renamed some ncopy and tcopy files --- kernel/arm/KERNEL.ARMV6 | 18 +++++++++--------- kernel/arm/KERNEL.ARMV7 | 18 +++++++++--------- ...emm_ncopy_2_vfpv3.S => cgemm_ncopy_2_vfp.S} | 0 ...emm_tcopy_2_vfpv3.S => cgemm_tcopy_2_vfp.S} | 0 ...emm_ncopy_4_vfpv3.S => dgemm_ncopy_4_vfp.S} | 0 ...emm_tcopy_4_vfpv3.S => dgemm_tcopy_4_vfp.S} | 0 ...emm_ncopy_4_vfpv3.S => sgemm_ncopy_4_vfp.S} | 0 ...emm_tcopy_4_vfpv3.S => sgemm_tcopy_4_vfp.S} | 0 ...emm_ncopy_2_vfpv3.S => zgemm_ncopy_2_vfp.S} | 0 ...emm_tcopy_2_vfpv3.S => zgemm_tcopy_2_vfp.S} | 0 10 files changed, 18 insertions(+), 18 deletions(-) rename kernel/arm/{cgemm_ncopy_2_vfpv3.S => cgemm_ncopy_2_vfp.S} (100%) rename 
kernel/arm/{cgemm_tcopy_2_vfpv3.S => cgemm_tcopy_2_vfp.S} (100%) rename kernel/arm/{dgemm_ncopy_4_vfpv3.S => dgemm_ncopy_4_vfp.S} (100%) rename kernel/arm/{dgemm_tcopy_4_vfpv3.S => dgemm_tcopy_4_vfp.S} (100%) rename kernel/arm/{sgemm_ncopy_4_vfpv3.S => sgemm_ncopy_4_vfp.S} (100%) rename kernel/arm/{sgemm_tcopy_4_vfpv3.S => sgemm_tcopy_4_vfp.S} (100%) rename kernel/arm/{zgemm_ncopy_2_vfpv3.S => zgemm_ncopy_2_vfp.S} (100%) rename kernel/arm/{zgemm_tcopy_2_vfpv3.S => zgemm_tcopy_2_vfp.S} (100%) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 75f7a9491..17a4f9257 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -92,23 +92,23 @@ SGEMMOTCOPY = ../generic/gemm_tcopy_2.c SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o -DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMKERNEL = ../generic/gemmkernel_2x2.c DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = ../generic/zgemmkernel_2x2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMONCOPY = cgemm_ncopy_2_vfp.S +CGEMMOTCOPY = cgemm_tcopy_2_vfp.S +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMONCOPY = zgemm_ncopy_2_vfp.S +ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index bbd7e89dd..07d38d189 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -50,7 +50,7 @@ DCOPYKERNEL = dcopy_vfp.S CCOPYKERNEL = ccopy_vfp.S ZCOPYKERNEL = 
zcopy_vfp.S -SDOTKERNEL = sdot_vfpv3.S +SDOTKERNEL = sdot_vfp.S DDOTKERNEL = ddot_vfp.S CDOTKERNEL = cdot_vfp.S ZDOTKERNEL = zdot_vfp.S @@ -89,8 +89,8 @@ ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S SGEMMINCOPY = SGEMMITCOPY = -SGEMMONCOPY = sgemm_ncopy_4_vfpv3.S -SGEMMOTCOPY = sgemm_tcopy_4_vfpv3.S +SGEMMONCOPY = sgemm_ncopy_4_vfp.S +SGEMMOTCOPY = sgemm_tcopy_4_vfp.S SGEMMINCOPYOBJ = SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy.o @@ -99,22 +99,22 @@ SGEMMOTCOPYOBJ = sgemm_otcopy.o DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S DGEMMINCOPY = DGEMMITCOPY = -DGEMMONCOPY = dgemm_ncopy_4_vfpv3.S -DGEMMOTCOPY = dgemm_tcopy_4_vfpv3.S +DGEMMONCOPY = dgemm_ncopy_4_vfp.S +DGEMMOTCOPY = dgemm_tcopy_4_vfp.S DGEMMINCOPYOBJ = DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S -CGEMMONCOPY = cgemm_ncopy_2_vfpv3.S -CGEMMOTCOPY = cgemm_tcopy_2_vfpv3.S +CGEMMONCOPY = cgemm_ncopy_2_vfp.S +CGEMMOTCOPY = cgemm_tcopy_2_vfp.S CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S -ZGEMMONCOPY = zgemm_ncopy_2_vfpv3.S -ZGEMMOTCOPY = zgemm_tcopy_2_vfpv3.S +ZGEMMONCOPY = zgemm_ncopy_2_vfp.S +ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o diff --git a/kernel/arm/cgemm_ncopy_2_vfpv3.S b/kernel/arm/cgemm_ncopy_2_vfp.S similarity index 100% rename from kernel/arm/cgemm_ncopy_2_vfpv3.S rename to kernel/arm/cgemm_ncopy_2_vfp.S diff --git a/kernel/arm/cgemm_tcopy_2_vfpv3.S b/kernel/arm/cgemm_tcopy_2_vfp.S similarity index 100% rename from kernel/arm/cgemm_tcopy_2_vfpv3.S rename to kernel/arm/cgemm_tcopy_2_vfp.S diff --git a/kernel/arm/dgemm_ncopy_4_vfpv3.S b/kernel/arm/dgemm_ncopy_4_vfp.S similarity index 100% rename from kernel/arm/dgemm_ncopy_4_vfpv3.S rename to kernel/arm/dgemm_ncopy_4_vfp.S diff --git a/kernel/arm/dgemm_tcopy_4_vfpv3.S b/kernel/arm/dgemm_tcopy_4_vfp.S similarity index 100% rename from 
kernel/arm/dgemm_tcopy_4_vfpv3.S rename to kernel/arm/dgemm_tcopy_4_vfp.S diff --git a/kernel/arm/sgemm_ncopy_4_vfpv3.S b/kernel/arm/sgemm_ncopy_4_vfp.S similarity index 100% rename from kernel/arm/sgemm_ncopy_4_vfpv3.S rename to kernel/arm/sgemm_ncopy_4_vfp.S diff --git a/kernel/arm/sgemm_tcopy_4_vfpv3.S b/kernel/arm/sgemm_tcopy_4_vfp.S similarity index 100% rename from kernel/arm/sgemm_tcopy_4_vfpv3.S rename to kernel/arm/sgemm_tcopy_4_vfp.S diff --git a/kernel/arm/zgemm_ncopy_2_vfpv3.S b/kernel/arm/zgemm_ncopy_2_vfp.S similarity index 100% rename from kernel/arm/zgemm_ncopy_2_vfpv3.S rename to kernel/arm/zgemm_ncopy_2_vfp.S diff --git a/kernel/arm/zgemm_tcopy_2_vfpv3.S b/kernel/arm/zgemm_tcopy_2_vfp.S similarity index 100% rename from kernel/arm/zgemm_tcopy_2_vfpv3.S rename to kernel/arm/zgemm_tcopy_2_vfp.S From 9a0f9789296e764c5fe92c614c7cc9d655061bb3 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 22 Nov 2013 17:21:10 +0100 Subject: [PATCH 52/81] added nrm2 kernel for ARMV6 --- kernel/arm/KERNEL.ARMV6 | 9 +- kernel/arm/nrm2_vfp.S | 565 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 569 insertions(+), 5 deletions(-) create mode 100644 kernel/arm/nrm2_vfp.S diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 17a4f9257..03eff6ec5 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -50,11 +50,10 @@ DDOTKERNEL = ddot_vfp.S CDOTKERNEL = cdot_vfp.S ZDOTKERNEL = zdot_vfp.S - -SNRM2KERNEL = nrm2.c -DNRM2KERNEL = nrm2.c -CNRM2KERNEL = znrm2.c -ZNRM2KERNEL = znrm2.c +SNRM2KERNEL = nrm2_vfp.S +DNRM2KERNEL = nrm2_vfp.S +CNRM2KERNEL = nrm2_vfp.S +ZNRM2KERNEL = nrm2_vfp.S SROTKERNEL = rot_vfp.S DROTKERNEL = rot_vfp.S diff --git a/kernel/arm/nrm2_vfp.S b/kernel/arm/nrm2_vfp.S new file mode 100644 index 000000000..4c62917b9 --- /dev/null +++ b/kernel/arm/nrm2_vfp.S @@ -0,0 +1,565 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/22 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + + +.macro KERNEL_F1 + + fldmiad X!, { d4 } + vcmpe.f64 d4, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_NEXT_\@ + vabs.f64 d4, d4 + vcmpe.f64 d0, d4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_NEXT_\@ + vdiv.f64 d2 , d0, d4 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d4 // scale = x + +KERNEL_F1_NEXT_\@: + +.endm + +.macro KERNEL_F8 + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d4 } + vcmpe.f64 d4, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_NEXT + vabs.f64 d4, d4 + vcmpe.f64 d0, d4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d4, d0 // scale >= x ? 
x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_NEXT + vdiv.f64 d2 , d0, d4 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d4 // scale = x + +KERNEL_S1_NEXT: + + add X, X, INC_X + +.endm + +#else + +.macro KERNEL_F1 + + fldmias X!, { s4 } + vcmpe.f32 s4, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_NEXT_\@ + vabs.f32 s4, s4 + vcmpe.f32 s0, s4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_NEXT_\@ + vdiv.f32 s2 , s0, s4 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s4 // scale = x + +KERNEL_F1_NEXT_\@: + +.endm + +.macro KERNEL_F8 + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + +.endm + +.macro KERNEL_S1 + + fldmias X, { s4 } + vcmpe.f32 s4, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_NEXT + vabs.f32 s4, s4 + vcmpe.f32 s0, s4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s4, s0 // scale >= x ? 
x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_NEXT + vdiv.f32 s2 , s0, s4 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s4 // scale = x + +KERNEL_S1_NEXT: + + add X, X, INC_X + +.endm + + + + +#endif + +#else + +#if defined(DOUBLE) + +.macro KERNEL_F1 + + fldmiad X!, { d4 - d5 } + + vcmpe.f64 d4, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_NEXT_\@ + vabs.f64 d4, d4 + vcmpe.f64 d0, d4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_NEXT_\@ + vdiv.f64 d2 , d0, d4 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d4 // scale = x + +KERNEL_F1_NEXT_\@: + + vcmpe.f64 d5, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_END_\@ + vabs.f64 d5, d5 + vcmpe.f64 d0, d5 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d5, d0 // scale >= x ? 
x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_END_\@ + vdiv.f64 d2 , d0, d5 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d5 // scale = x + +KERNEL_F1_END_\@: + + +.endm + +.macro KERNEL_F8 + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d4 - d5 } + + vcmpe.f64 d4, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_NEXT_\@ + vabs.f64 d4, d4 + vcmpe.f64 d0, d4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_NEXT_\@ + vdiv.f64 d2 , d0, d4 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d4 // scale = x + +KERNEL_S1_NEXT_\@: + + vcmpe.f64 d5, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_END_\@ + vabs.f64 d5, d5 + vcmpe.f64 d0, d5 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d5, d0 // scale >= x ? 
x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_END_\@ + vdiv.f64 d2 , d0, d5 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d5 // scale = x + +KERNEL_S1_END_\@: + + add X, X, INC_X + +.endm + + +#else + +.macro KERNEL_F1 + + fldmias X!, { s4 - s5 } + + vcmpe.f32 s4, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_NEXT_\@ + vabs.f32 s4, s4 + vcmpe.f32 s0, s4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_NEXT_\@ + vdiv.f32 s2 , s0, s4 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s4 // scale = x + +KERNEL_F1_NEXT_\@: + + vcmpe.f32 s5, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_END_\@ + vabs.f32 s5, s5 + vcmpe.f32 s0, s5 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s5, s0 // scale >= x ? 
x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_END_\@ + vdiv.f32 s2 , s0, s5 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s5 // scale = x + +KERNEL_F1_END_\@: + + +.endm + +.macro KERNEL_F8 + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + +.endm + +.macro KERNEL_S1 + + fldmias X, { s4 - s5 } + + vcmpe.f32 s4, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_NEXT_\@ + vabs.f32 s4, s4 + vcmpe.f32 s0, s4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_NEXT_\@ + vdiv.f32 s2 , s0, s4 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s4 // scale = x + +KERNEL_S1_NEXT_\@: + + vcmpe.f32 s5, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_END_\@ + vabs.f32 s5, s5 + vcmpe.f32 s0, s5 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s5, s0 // scale >= x ? 
x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_END_\@ + vdiv.f32 s2 , s0, s5 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s5 // scale = x + +KERNEL_S1_END_\@: + + add X, X, INC_X + +.endm + + + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + b nrm2_begin + + +#if defined(COMPLEX) + +#if defined(DOUBLE) + +znrm2_one: + .word 0x00000000 + .word 0x3ff00000 + +#else + +cnrm2_one: + .word 0x3f800000 + +#endif + +#else + +#if defined(DOUBLE) + +dnrm2_one: + .word 0x00000000 + .word 0x3ff00000 + +#else + +snrm2_one: + .word 0x3f800000 + +#endif + +#endif + + + .align 5 + + +nrm2_begin: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + vsub.f64 d0 , d0 , d0 // scale=0.0 + vldr.64 d1 , znrm2_one // ssq=1.0 + vmov.f64 d7 , d1 // value 1.0 + vmov.f64 d6 , d0 // value 0.0 +#else + vsub.f32 s0 , s0 , s0 // scale=0.0 + vldr.32 s1 , cnrm2_one // ssq=1.0 + vmov.f32 s7 , s1 // value 1.0 + vmov.f32 s6 , s0 // value 0.0 +#endif + +#else + +#if defined(DOUBLE) + vsub.f64 d0 , d0 , d0 // scale=0.0 + vldr.64 d1 , dnrm2_one // ssq=1.0 + vmov.f64 d7 , d1 // value 1.0 + vmov.f64 d6 , d0 // value 0.0 +#else + vsub.f32 s0 , s0 , s0 // scale=0.0 + vldr.32 s1 , snrm2_one // ssq=1.0 + vmov.f32 s7 , s1 // value 1.0 + vmov.f32 s6 , s0 // value 0.0 +#endif + + +#endif + + + cmp N, #0 + ble nrm2_kernel_L999 + + cmp INC_X, #0 + beq nrm2_kernel_L999 + + + cmp INC_X, #1 + bne nrm2_kernel_S_BEGIN + + +nrm2_kernel_F_BEGIN: + + asrs I, N, #3 // I = N / 8 + ble nrm2_kernel_F1 + +nrm2_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne nrm2_kernel_F8 + +nrm2_kernel_F1: + + ands I, N, #7 + ble 
nrm2_kernel_L999 + + +nrm2_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne nrm2_kernel_F10 + + b nrm2_kernel_L999 + +nrm2_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE +#endif + +#endif + + + +nrm2_kernel_S1: + + mov I, N + + .align 5 + +nrm2_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne nrm2_kernel_S10 + + +nrm2_kernel_L999: + +#if defined(DOUBLE) + vsqrt.f64 d1, d1 + vmul.f64 d0, d0, d1 +#else + vsqrt.f32 s1, s1 + vmul.f32 s0, s0, s1 +#endif + + bx lr + + EPILOGUE + From 33d3ab6e098e618a92a315e09a7a6e22a213ef87 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 23 Nov 2013 12:35:31 +0100 Subject: [PATCH 53/81] small optimizations for zgemv kernels --- kernel/arm/zgemv_n.c | 64 +++++++++++++++++++++-------- kernel/arm/zgemv_t.c | 97 ++++++++++++++++++++++++-------------------- 2 files changed, 101 insertions(+), 60 deletions(-) diff --git a/kernel/arm/zgemv_n.c b/kernel/arm/zgemv_n.c index 5f00c34f6..dc2ffa0d2 100644 --- a/kernel/arm/zgemv_n.c +++ b/kernel/arm/zgemv_n.c @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** - * * 2013/09/15 Saar + * * 2013/11/23 Saar * * BLASTEST float : OK * * BLASTEST double : OK * CTEST : OK @@ -48,20 +48,17 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, BLASLONG lda2; BLASLONG i2; - if( alpha_r == 0.0 && alpha_i == 0.0 ) return(0); - lda2 = 2*lda; - inc_x2 = 2 * inc_x; - inc_y2 = 2 * inc_y; - ix = 0; a_ptr = a; -#if !defined(CONJ) - for (j=0; j Date: Sat, 23 Nov 2013 14:35:19 +0100 Subject: [PATCH 54/81] fixed bug in SAVE macros, that are not found by any test routine --- kernel/arm/dgemm_kernel_4x4_vfpv3.S | 4 ++-- kernel/arm/dtrmm_kernel_4x4_vfpv3.S | 4 ++-- kernel/arm/sgemm_kernel_4x4_vfpv3.S | 4 ++-- kernel/arm/strmm_kernel_4x4_vfpv3.S | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/arm/dgemm_kernel_4x4_vfpv3.S b/kernel/arm/dgemm_kernel_4x4_vfpv3.S index ed7f611f1..3b6af19a3 100644 --- a/kernel/arm/dgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/dgemm_kernel_4x4_vfpv3.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/10/11 Saar +* 2013/11/23 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -714,7 +714,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmacd d12, d0 , d20 fstd d12, [CO2] - add CO1, CO1, #16 + add CO1, CO1, #8 .endm diff --git a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S index a80177f8b..0f8a9291a 100644 --- a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S +++ b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** -* 2013/10/05 Saar +* 2013/11/23 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -690,7 +690,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmuld d12, d0 , d20 fstd d12, [CO2] - add CO1, CO1, #16 + add CO1, CO1, #8 .endm diff --git a/kernel/arm/sgemm_kernel_4x4_vfpv3.S b/kernel/arm/sgemm_kernel_4x4_vfpv3.S index 4031c28db..38dc4d3ea 100644 --- a/kernel/arm/sgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/sgemm_kernel_4x4_vfpv3.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/11/05 Saar +* 2013/11/23 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -687,7 +687,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmacs s12, s0 , s20 fsts s12, [CO2] - add CO1, CO1, #8 + add CO1, CO1, #4 .endm diff --git a/kernel/arm/strmm_kernel_4x4_vfpv3.S b/kernel/arm/strmm_kernel_4x4_vfpv3.S index 15c866856..3a0c8af87 100644 --- a/kernel/arm/strmm_kernel_4x4_vfpv3.S +++ b/kernel/arm/strmm_kernel_4x4_vfpv3.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/10/14 Saar +* 2013/11/23 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -627,7 +627,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fmuls s12, s0 , s20 fsts s12, [CO2] - add CO1, CO1, #8 + add CO1, CO1, #4 .endm From 8776a73773a0cb3a0daf993120a5081cc5cac42c Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 23 Nov 2013 16:24:52 +0100 Subject: [PATCH 55/81] added optimized dgemm and dtrmm kernel for ARMV6 --- Makefile.rule | 2 +- kernel/arm/KERNEL.ARMV6 | 8 +- kernel/arm/dgemm_kernel_4x2_vfp.S | 797 +++++++++++++++++++++ kernel/arm/dtrmm_kernel_4x2_vfp.S | 1083 +++++++++++++++++++++++++++++ param.h | 2 +- 5 files changed, 1888 insertions(+), 4 deletions(-) create mode 100644 kernel/arm/dgemm_kernel_4x2_vfp.S create mode 100644 kernel/arm/dtrmm_kernel_4x2_vfp.S diff --git a/Makefile.rule b/Makefile.rule index 534f4d1a2..c8433288b 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -12,7 +12,7 @@ VERSION = 0.2.8 # You can specify the target architecture, otherwise it's # automatically detected. -TARGET = ARMV7 +TARGET = ARMV6 # If you want to support multiple architecture in one binary # DYNAMIC_ARCH = 1 diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 03eff6ec5..7a58fecd3 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -81,7 +81,7 @@ CGEMVTKERNEL = zgemv_t.c ZGEMVTKERNEL = zgemv_t.c STRMMKERNEL = ../generic/trmmkernel_2x2.c -DTRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c @@ -91,7 +91,11 @@ SGEMMOTCOPY = ../generic/gemm_tcopy_2.c SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o -DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMKERNEL = dgemm_kernel_4x2_vfp.S +DGEMMINCOPY = ../generic/gemm_ncopy_4.c +DGEMMITCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPYOBJ = dgemm_oncopy.o diff --git a/kernel/arm/dgemm_kernel_4x2_vfp.S b/kernel/arm/dgemm_kernel_4x2_vfp.S new file mode 100644 index 
000000000..56fd81513 --- /dev/null +++ b/kernel/arm/dgemm_kernel_4x2_vfp.S @@ -0,0 +1,797 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/23 Saar +* BLASTEST : xOK +* CTEST : xOK +* TEST : xOK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA d0 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9, d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + vmov.f64 d14, d8 + vmov.f64 d15, d8 + +.endm + + + +.macro KERNEL4x2_SUB + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fmacd d8 , d0, d4 + fmacd d9 , d1, d4 + fmacd d10 , d2, d4 + fmacd d11 , d3, d4 + + fmacd d12 , d0, d5 + fmacd d13 , d1, d5 + fmacd d14 , d2, d5 + fmacd d15 , d3, d5 + + add AO , AO, #32 + add BO , BO, #16 + +.endm + +.macro SAVE4x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fldd d4 , [CO1] + fldd 
d5 , [CO1, #8 ] + fldd d6 , [CO1, #16 ] + fldd d7 , [CO1, #24 ] + + fmacd d4 , d0 , d8 + fmacd d5 , d0 , d9 + fmacd d6 , d0 , d10 + fmacd d7 , d0 , d11 + + fstd d4 , [CO1] + fstd d5 , [CO1, #8 ] + fstd d6 , [CO1, #16 ] + fstd d7 , [CO1, #24 ] + + fldd d4 , [CO2] + fldd d5 , [CO2, #8 ] + fldd d6 , [CO2, #16 ] + fldd d7 , [CO2, #24 ] + + fmacd d4 , d0 , d12 + fmacd d5 , d0 , d13 + fmacd d6 , d0 , d14 + fmacd d7 , d0 , d15 + + fstd d4 , [CO2] + fstd d5 , [CO2, #8 ] + fstd d6 , [CO2, #16 ] + fstd d7 , [CO2, #24 ] + + add CO1, CO1, #32 + +.endm + + +/******************************************************************************/ + +.macro INIT2x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9, d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + +.endm + +.macro KERNEL2x2_SUB + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d8 , d0, d4 + fmacd d9 , d1, d4 + + fmacd d12 , d0, d5 + fmacd d13 , d1, d5 + + add AO , AO, #16 + add BO , BO, #16 + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fldd d4 , [CO1] + fldd d5 , [CO1, #8 ] + + fmacd d4 , d0 , d8 + fmacd d5 , d0 , d9 + + fstd d4 , [CO1] + fstd d5 , [CO1, #8 ] + + fldd d4 , [CO2] + fldd d5 , [CO2, #8 ] + + fmacd d4 , d0 , d12 + fmacd d5 , d0 , d13 + + fstd d4 , [CO2] + fstd d5 , [CO2, #8 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d12, d8 + +.endm + +.macro KERNEL1x2_SUB + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d0 , [ AO ] + + fmacd d8 , d0, d4 + + fmacd d12 , d0, d5 + + add AO , AO, #8 + add BO , BO, #16 + +.endm + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fldd d4 , [CO1] + + fmacd d4 , d0 , d8 + + fstd d4 , [CO1] + + fldd d4 , [CO2] + + fmacd d4 , d0 , d12 + + fstd d4 , [CO2] + + add CO1, CO1, #8 + +.endm + + + 
+/******************************************************************************/ + +.macro INIT4x1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9, d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + +.endm + + + +.macro KERNEL4x1_SUB + + fldd d4 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fmacd d8 , d0, d4 + fmacd d9 , d1, d4 + fmacd d10 , d2, d4 + fmacd d11 , d3, d4 + + add AO , AO, #32 + add BO , BO, #8 + +.endm + +.macro SAVE4x1 + + fldd d0, ALPHA + + fldd d4 , [CO1] + fldd d5 , [CO1, #8 ] + fldd d6 , [CO1, #16 ] + fldd d7 , [CO1, #24 ] + + fmacd d4 , d0 , d8 + fmacd d5 , d0 , d9 + fmacd d6 , d0 , d10 + fmacd d7 , d0 , d11 + + fstd d4 , [CO1] + fstd d5 , [CO1, #8 ] + fstd d6 , [CO1, #16 ] + fstd d7 , [CO1, #24 ] + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + +.endm + +.macro KERNEL2x1_SUB + + fldd d4 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d8 , d0, d4 + fmacd d9 , d1, d4 + + add AO , AO, #16 + add BO , BO, #8 + +.endm + +.macro SAVE2x1 + + fldd d0, ALPHA + + fldd d4 , [CO1] + fldd d5 , [CO1, #8 ] + + fmacd d4 , d0 , d8 + fmacd d5 , d0 , d9 + + fstd d4 , [CO1] + fstd d5 , [CO1, #8 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d8 , d8 , d8 + +.endm + +.macro KERNEL1x1_SUB + + fldd d4 , [ BO ] + + fldd d0 , [ AO ] + + fmacd d8 , d0, d4 + + add AO , AO, #8 + add BO , BO, #8 + +.endm + +.macro SAVE1x1 + + fldd d0, ALPHA + + fldd d4 , [CO1] + + fmacd d4 , d0 , d8 + + fstd d4 , [CO1] + + add CO1, CO1, #8 + +.endm + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + 
add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 8 + str r3, LDC + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble dgemm_kernel_L1_BEGIN + + +/*********************************************************************************************/ + +dgemm_kernel_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + +dgemm_kernel_L2_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble dgemm_kernel_L2_M2_BEGIN + +dgemm_kernel_L2_M4_20: + + INIT4x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L2_M4_40 + .align 5 + +dgemm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M4_22 + + +dgemm_kernel_L2_M4_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L2_M4_100 + +dgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M4_42 + +dgemm_kernel_L2_M4_100: + + SAVE4x2 + +dgemm_kernel_L2_M4_END: + + subs I, I, #1 + bgt dgemm_kernel_L2_M4_20 + + +dgemm_kernel_L2_M2_BEGIN: + + ldr I, M + tst I , #3 + ble dgemm_kernel_L2_END + + tst I, #2 // I = I / 2 + ble dgemm_kernel_L2_M1_BEGIN + +dgemm_kernel_L2_M2_20: + + INIT2x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L2_M2_40 + +dgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M2_22 + + +dgemm_kernel_L2_M2_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L2_M2_100 + +dgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs L, L, #1 + bgt 
dgemm_kernel_L2_M2_42 + +dgemm_kernel_L2_M2_100: + + SAVE2x2 + +dgemm_kernel_L2_M2_END: + + +dgemm_kernel_L2_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble dgemm_kernel_L2_END + +dgemm_kernel_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L2_M1_40 + +dgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M1_22 + + +dgemm_kernel_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L2_M1_100 + +dgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M1_42 + +dgemm_kernel_L2_M1_100: + + SAVE1x2 + + +dgemm_kernel_L2_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #4 // k * 2 * 8 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + + subs J , #1 // j-- + bgt dgemm_kernel_L2_BEGIN + +/*********************************************************************************************/ + +dgemm_kernel_L1_BEGIN: + + ldr J , N + tst J , #1 + ble dgemm_kernel_L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + + + +dgemm_kernel_L1_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble dgemm_kernel_L1_M2_BEGIN + +dgemm_kernel_L1_M4_20: + + INIT4x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L1_M4_40 + .align 5 + +dgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M4_22 + + +dgemm_kernel_L1_M4_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L1_M4_100 + +dgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M4_42 + +dgemm_kernel_L1_M4_100: + + SAVE4x1 + +dgemm_kernel_L1_M4_END: + + subs I, I, #1 + bgt dgemm_kernel_L1_M4_20 + + +dgemm_kernel_L1_M2_BEGIN: + + ldr I, M + tst I , #3 + ble dgemm_kernel_L1_END + + tst I, #2 // 
I = I / 2 + ble dgemm_kernel_L1_M1_BEGIN + +dgemm_kernel_L1_M2_20: + + INIT2x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L1_M2_40 + +dgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M2_22 + + +dgemm_kernel_L1_M2_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L1_M2_100 + +dgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M2_42 + +dgemm_kernel_L1_M2_100: + + SAVE2x1 + +dgemm_kernel_L1_M2_END: + + +dgemm_kernel_L1_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble dgemm_kernel_L1_END + +dgemm_kernel_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L1_M1_40 + +dgemm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M1_22 + + +dgemm_kernel_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L1_M1_100 + +dgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M1_42 + +dgemm_kernel_L1_M1_100: + + SAVE1x1 + + +dgemm_kernel_L1_END: + + +dgemm_kernel_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/dtrmm_kernel_4x2_vfp.S b/kernel/arm/dtrmm_kernel_4x2_vfp.S new file mode 100644 index 000000000..55a017a97 --- /dev/null +++ b/kernel/arm/dtrmm_kernel_4x2_vfp.S @@ -0,0 +1,1083 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/23 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 252 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA d0 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define KK [fp, #-240 ] +#define KKK [fp, #-244] +#define C [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-276 ] + +#define B [fp, #4 ] +#define OLD_C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 64 +#define B_PRE 64 +#define C_PRE 64 + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9, d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + vmov.f64 d14, d8 + vmov.f64 d15, d8 + +.endm + + + +.macro KERNEL4x2_SUB + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fmacd d8 , d0, d4 + fmacd d9 , d1, d4 + fmacd d10 , d2, d4 + fmacd d11 , d3, d4 + + fmacd d12 , d0, d5 + fmacd d13 , d1, d5 + fmacd d14 , d2, d5 + fmacd d15 , d3, d5 + + add AO , AO, #32 + add BO , BO, #16 + 
+.endm + +.macro SAVE4x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fmuld d4 , d0 , d8 + fmuld d5 , d0 , d9 + fmuld d6 , d0 , d10 + fmuld d7 , d0 , d11 + + fstd d4 , [CO1] + fstd d5 , [CO1, #8 ] + fstd d6 , [CO1, #16 ] + fstd d7 , [CO1, #24 ] + + fmuld d4 , d0 , d12 + fmuld d5 , d0 , d13 + fmuld d6 , d0 , d14 + fmuld d7 , d0 , d15 + + fstd d4 , [CO2] + fstd d5 , [CO2, #8 ] + fstd d6 , [CO2, #16 ] + fstd d7 , [CO2, #24 ] + + add CO1, CO1, #32 + +.endm + + +/******************************************************************************/ + +.macro INIT2x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9, d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + +.endm + +.macro KERNEL2x2_SUB + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d8 , d0, d4 + fmacd d9 , d1, d4 + + fmacd d12 , d0, d5 + fmacd d13 , d1, d5 + + add AO , AO, #16 + add BO , BO, #16 + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + + fmuld d4 , d0 , d8 + fmuld d5 , d0 , d9 + + fstd d4 , [CO1] + fstd d5 , [CO1, #8 ] + + fmuld d4 , d0 , d12 + fmuld d5 , d0 , d13 + + fstd d4 , [CO2] + fstd d5 , [CO2, #8 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d12, d8 + +.endm + +.macro KERNEL1x2_SUB + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d0 , [ AO ] + + fmacd d8 , d0, d4 + + fmacd d12 , d0, d5 + + add AO , AO, #8 + add BO , BO, #16 + +.endm + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + + fmuld d4 , d0 , d8 + + fstd d4 , [CO1] + + + fmuld d4 , d0 , d12 + + fstd d4 , [CO2] + + add CO1, CO1, #8 + +.endm + + + +/******************************************************************************/ + +.macro INIT4x1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9, d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + +.endm + + + +.macro KERNEL4x1_SUB + + fldd d4 , [ BO ] + + fldd d0 , [ AO ] 
+ fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fmacd d8 , d0, d4 + fmacd d9 , d1, d4 + fmacd d10 , d2, d4 + fmacd d11 , d3, d4 + + add AO , AO, #32 + add BO , BO, #8 + +.endm + +.macro SAVE4x1 + + fldd d0, ALPHA + + fmuld d4 , d0 , d8 + fmuld d5 , d0 , d9 + fmuld d6 , d0 , d10 + fmuld d7 , d0 , d11 + + fstd d4 , [CO1] + fstd d5 , [CO1, #8 ] + fstd d6 , [CO1, #16 ] + fstd d7 , [CO1, #24 ] + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + +.endm + +.macro KERNEL2x1_SUB + + fldd d4 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d8 , d0, d4 + fmacd d9 , d1, d4 + + add AO , AO, #16 + add BO , BO, #8 + +.endm + +.macro SAVE2x1 + + fldd d0, ALPHA + + fmuld d4 , d0 , d8 + fmuld d5 , d0 , d9 + + fstd d4 , [CO1] + fstd d5 , [CO1, #8 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d8 , d8 , d8 + +.endm + +.macro KERNEL1x1_SUB + + fldd d4 , [ BO ] + + fldd d0 , [ AO ] + + fmacd d8 , d0, d4 + + add AO , AO, #8 + add BO , BO, #8 + +.endm + +.macro SAVE1x1 + + fldd d0, ALPHA + + fmuld d4 , d0 , d8 + + fstd d4 , [CO1] + + add CO1, CO1, #8 + +.endm + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 8 + str r3, LDC + + ldr r3, OLD_C + str r3, C + + ldr BC, B + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif + str r3 , KK + + ldr 
J, N + asrs J, J, #1 // J = J / 2 + ble _L1_BEGIN + +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + + +_L2_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L2_M2_BEGIN + +_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #4 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = K1 / 8 (unrolled-by-8 iterations) + ble _L2_M4_40 + .align 5 + +_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_22 + + +_L2_M4_40: + + ands L , K1, #7 // L = K1 % 8 (leftover iterations) + beq _L2_M4_100 // Z only: ands/tst leave V untouched, so ble would read a stale V + +_L2_M4_42: + + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_42 + +_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + +_L2_M4_END: + + subs I, I, #1 + bgt _L2_M4_20 + + +_L2_M2_BEGIN: + + ldr I, M + tst I , #3 // M % 4 == 0: no leftover rows + beq _L2_END // Z only: tst does not update V + + tst I, #2 // bit 1 of M: is a 2-row block pending? + beq _L2_M1_BEGIN + +_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + +
mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #2 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = K1 / 8 (unrolled-by-8 iterations) + ble _L2_M2_40 + +_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_22 + + +_L2_M2_40: + + ands L , K1, #7 // L = K1 % 8 (leftover iterations) + beq _L2_M2_100 // Z only: ands/tst leave V untouched, so ble would read a stale V + +_L2_M2_42: + + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_42 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L2_M2_END: + + +_L2_M1_BEGIN: + + tst I, #1 // bit 0 of M: is a single row pending? + beq _L2_END // Z only: tst does not update V + +_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #1 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = K1 / 8 (unrolled-by-8 iterations) + ble
_L2_M1_40 + +_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = K1 % 8 (leftover iterations) + beq _L2_M1_100 // Z only: ands/tst leave V untouched, so ble would read a stale V + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #4 // k * 2 * 8 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in BO + str r3 , KK +#endif + + + subs J , #1 // j-- + bgt _L2_BEGIN + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 // bit 0 of N: is one column left over? + beq _L999 // Z only: tst does not update V + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +_L1_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L1_M2_BEGIN + +_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #4 // number of values in AO +#else +
add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = K1 / 8 (unrolled-by-8 iterations) + ble _L1_M4_40 + .align 5 + +_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_22 + + +_L1_M4_40: + + ands L , K1, #7 // L = K1 % 8 (leftover iterations) + beq _L1_M4_100 // Z only: ands/tst leave V untouched, so ble would read a stale V + +_L1_M4_42: + + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_42 + +_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + +_L1_M4_END: + + subs I, I, #1 + bgt _L1_M4_20 + + +_L1_M2_BEGIN: + + ldr I, M + tst I , #3 // M % 4 == 0: no leftover rows + beq _L1_END // Z only: tst does not update V + + tst I, #2 // bit 1 of M: is a 2-row block pending? + beq _L1_M1_BEGIN + +_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #2 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = K1 / 8 (unrolled-by-8 iterations) + ble _L1_M2_40 + +_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_22 + + +_L1_M2_40: + + ands L , K1, #7 // L = K1 % 8 (leftover iterations) + beq _L1_M2_100 + +_L1_M2_42: + + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_42 + +_L1_M2_100: +
+ SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L1_M2_END: + + +_L1_M1_BEGIN: + + tst I, #1 // bit 0 of M: is a single row pending? + beq _L1_END // Z only: tst does not update V, so ble would read a stale V + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #1 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = K1 / 8 (unrolled-by-8 iterations) + ble _L1_M1_40 + +_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = K1 % 8 (leftover iterations) + beq _L1_M1_100 // Z only: ands does not update V + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +_L1_END: + + +_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/param.h b/param.h index 7bb27f3ab..568605bb8 100644 --- a/param.h +++ b/param.h @@ -1846,7 +1846,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 2 -#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_M 2 From 29a3196f56a6ad465d097580eb6c0c5556adbbcc Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 23 Nov 2013 18:09:41 +0100 Subject: [PATCH 56/81] added optimized sgemm and strmm kernel for ARMV6 --- kernel/arm/KERNEL.ARMV6 | 8 +- kernel/arm/sgemm_kernel_4x2_vfp.S | 798 +++++++++++++++++++++ kernel/arm/strmm_kernel_4x2_vfp.S | 1083 +++++++++++++++++++++++++++++ param.h | 2 +- 4 files changed, 1888 insertions(+), 3 deletions(-) create mode 100644 kernel/arm/sgemm_kernel_4x2_vfp.S create mode 100644 kernel/arm/strmm_kernel_4x2_vfp.S diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 7a58fecd3..b192b20da 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -80,12 +80,16 @@ DGEMVTKERNEL = gemv_t.c CGEMVTKERNEL = zgemv_t.c ZGEMVTKERNEL = zgemv_t.c -STRMMKERNEL = ../generic/trmmkernel_2x2.c +STRMMKERNEL = strmm_kernel_4x2_vfp.S DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMKERNEL = sgemm_kernel_4x2_vfp.S +SGEMMINCOPY = ../generic/gemm_ncopy_4.c +SGEMMITCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o SGEMMONCOPY = ../generic/gemm_ncopy_2.c SGEMMOTCOPY = ../generic/gemm_tcopy_2.c SGEMMONCOPYOBJ = sgemm_oncopy.o diff --git a/kernel/arm/sgemm_kernel_4x2_vfp.S b/kernel/arm/sgemm_kernel_4x2_vfp.S new file mode 100644 index 000000000..3e20f86f0 --- /dev/null +++ b/kernel/arm/sgemm_kernel_4x2_vfp.S @@ -0,0 +1,798 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/23 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA s0 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9, s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + vmov.f32 s14, s8 + vmov.f32 s15, s8 + +.endm + + + +.macro KERNEL4x2_SUB + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + fmacs s8 , s0, s4 + fmacs s9 , s1, s4 + fmacs s10 , s2, s4 + fmacs s11 , s3, s4 + + fmacs s12 , s0, s5 + fmacs s13 , s1, s5 + fmacs s14 , s2, s5 + fmacs s15 , s3, s5 + + add AO , AO, #16 + add BO , BO, #8 + +.endm + +.macro SAVE4x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + flds s4 , [CO1] + flds s5 , 
[CO1, #4 ] + flds s6 , [CO1, #8 ] + flds s7 , [CO1, #12 ] + + fmacs s4 , s0 , s8 + fmacs s5 , s0 , s9 + fmacs s6 , s0 , s10 + fmacs s7 , s0 , s11 + + fsts s4 , [CO1] + fsts s5 , [CO1, #4 ] + fsts s6 , [CO1, #8 ] + fsts s7 , [CO1, #12 ] + + flds s4 , [CO2] + flds s5 , [CO2, #4 ] + flds s6 , [CO2, #8 ] + flds s7 , [CO2, #12 ] + + fmacs s4 , s0 , s12 + fmacs s5 , s0 , s13 + fmacs s6 , s0 , s14 + fmacs s7 , s0 , s15 + + fsts s4 , [CO2] + fsts s5 , [CO2, #4 ] + fsts s6 , [CO2, #8 ] + fsts s7 , [CO2, #12 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT2x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9, s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + +.endm + +.macro KERNEL2x2_SUB + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s8 , s0, s4 + fmacs s9 , s1, s4 + + fmacs s12 , s0, s5 + fmacs s13 , s1, s5 + + add AO , AO, #8 + add BO , BO, #8 + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + flds s4 , [CO1] + flds s5 , [CO1, #4 ] + + fmacs s4 , s0 , s8 + fmacs s5 , s0 , s9 + + fsts s4 , [CO1] + fsts s5 , [CO1, #4 ] + + flds s4 , [CO2] + flds s5 , [CO2, #4 ] + + fmacs s4 , s0 , s12 + fmacs s5 , s0 , s13 + + fsts s4 , [CO2] + fsts s5 , [CO2, #4 ] + + add CO1, CO1, #8 + +.endm + + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s12, s8 + +.endm + +.macro KERNEL1x2_SUB + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + flds s0 , [ AO ] + + fmacs s8 , s0, s4 + + fmacs s12 , s0, s5 + + add AO , AO, #4 + add BO , BO, #8 + +.endm + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + flds s4 , [CO1] + + fmacs s4 , s0 , s8 + + fsts s4 , [CO1] + + flds s4 , [CO2] + + fmacs s4 , s0 , s12 + + fsts s4 , [CO2] + + add CO1, CO1, #4 + +.endm + + + 
+/******************************************************************************/ + +.macro INIT4x1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9, s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + +.endm + + + +.macro KERNEL4x1_SUB + + flds s4 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + fmacs s8 , s0, s4 + fmacs s9 , s1, s4 + fmacs s10 , s2, s4 + fmacs s11 , s3, s4 + + add AO , AO, #16 + add BO , BO, #4 + +.endm + +.macro SAVE4x1 + + flds s0, ALPHA + + flds s4 , [CO1] + flds s5 , [CO1, #4 ] + flds s6 , [CO1, #8 ] + flds s7 , [CO1, #12 ] + + fmacs s4 , s0 , s8 + fmacs s5 , s0 , s9 + fmacs s6 , s0 , s10 + fmacs s7 , s0 , s11 + + fsts s4 , [CO1] + fsts s5 , [CO1, #4 ] + fsts s6 , [CO1, #8 ] + fsts s7 , [CO1, #12 ] + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + +.endm + +.macro KERNEL2x1_SUB + + flds s4 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s8 , s0, s4 + fmacs s9 , s1, s4 + + add AO , AO, #8 + add BO , BO, #4 + +.endm + +.macro SAVE2x1 + + flds s0, ALPHA + + flds s4 , [CO1] + flds s5 , [CO1, #4 ] + + fmacs s4 , s0 , s8 + fmacs s5 , s0 , s9 + + fsts s4 , [CO1] + fsts s5 , [CO1, #4 ] + + add CO1, CO1, #8 + +.endm + + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f32 s8 , s8 , s8 + +.endm + +.macro KERNEL1x1_SUB + + flds s4 , [ BO ] + + flds s0 , [ AO ] + + fmacs s8 , s0, s4 + + add AO , AO, #4 + add BO , BO, #4 + +.endm + +.macro SAVE1x1 + + flds s0, ALPHA + + flds s4 , [CO1] + + fmacs s4 , s0 , s8 + + fsts s4 , [CO1] + + add CO1, CO1, #4 + +.endm + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add 
fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { s8 - s15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #2 // ldc = ldc * 4 + str r3, LDC + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble sgemm_kernel_L1_BEGIN + + +/*********************************************************************************************/ + +sgemm_kernel_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + +sgemm_kernel_L2_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble sgemm_kernel_L2_M2_BEGIN + +sgemm_kernel_L2_M4_20: + + INIT4x2 + + mov BO, BC + asrs L , K1, #3 // L = K1 / 8 (unrolled-by-8 iterations) + ble sgemm_kernel_L2_M4_40 + .align 5 + +sgemm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M4_22 + + +sgemm_kernel_L2_M4_40: + + ands L , K1, #7 // L = K1 % 8 (leftover iterations) + beq sgemm_kernel_L2_M4_100 // Z only: ands/tst leave V untouched, so ble would read a stale V + +sgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M4_42 + +sgemm_kernel_L2_M4_100: + + SAVE4x2 + +sgemm_kernel_L2_M4_END: + + subs I, I, #1 + bgt sgemm_kernel_L2_M4_20 + + +sgemm_kernel_L2_M2_BEGIN: + + ldr I, M + tst I , #3 // M % 4 == 0: no leftover rows + beq sgemm_kernel_L2_END // Z only: tst does not update V + + tst I, #2 // bit 1 of M: is a 2-row block pending? + beq sgemm_kernel_L2_M1_BEGIN + +sgemm_kernel_L2_M2_20: + + INIT2x2 + + mov BO, BC + asrs L , K1, #3 // L = K1 / 8 (unrolled-by-8 iterations) + ble sgemm_kernel_L2_M2_40 + +sgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M2_22 + + +sgemm_kernel_L2_M2_40: + + ands L , K1, #7 // L = K1 % 8 (leftover iterations) + beq sgemm_kernel_L2_M2_100 + +sgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs L, L, #1 + bgt
sgemm_kernel_L2_M2_42 + +sgemm_kernel_L2_M2_100: + + SAVE2x2 + +sgemm_kernel_L2_M2_END: + + +sgemm_kernel_L2_M1_BEGIN: + + tst I, #1 // bit 0 of M: is a single row pending? + beq sgemm_kernel_L2_END // Z only: tst/ands leave V untouched, so ble would read a stale V + +sgemm_kernel_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #3 // L = K1 / 8 (unrolled-by-8 iterations) + ble sgemm_kernel_L2_M1_40 + +sgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M1_22 + + +sgemm_kernel_L2_M1_40: + + ands L , K1, #7 // L = K1 % 8 (leftover iterations) + beq sgemm_kernel_L2_M1_100 + +sgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M1_42 + +sgemm_kernel_L2_M1_100: + + SAVE1x2 + + +sgemm_kernel_L2_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #3 // k * 2 * 4 + add r3, r3, r4 // B = B + K * 2 * 4 + mov BC, r3 + + subs J , #1 // j-- + bgt sgemm_kernel_L2_BEGIN + +/*********************************************************************************************/ + +sgemm_kernel_L1_BEGIN: + + ldr J , N + tst J , #1 // bit 0 of N: is one column left over? + beq sgemm_kernel_L999 // Z only: tst does not update V + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + + + +sgemm_kernel_L1_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble sgemm_kernel_L1_M2_BEGIN + +sgemm_kernel_L1_M4_20: + + INIT4x1 + + mov BO, BC + asrs L , K1, #3 // L = K1 / 8 (unrolled-by-8 iterations) + ble sgemm_kernel_L1_M4_40 + .align 5 + +sgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M4_22 + + +sgemm_kernel_L1_M4_40: + + ands L , K1, #7 // L = K1 % 8 (leftover iterations) + beq sgemm_kernel_L1_M4_100 + +sgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M4_42 + +sgemm_kernel_L1_M4_100: + + SAVE4x1 + +sgemm_kernel_L1_M4_END: + + subs I, I, #1 + bgt sgemm_kernel_L1_M4_20 + + +sgemm_kernel_L1_M2_BEGIN: + + ldr I, M + tst I , #3 // M % 4 == 0: no leftover rows + beq sgemm_kernel_L1_END // Z only: tst does not update V + + tst I, #2 //
bit 1 of M: is a 2-row block pending? + beq sgemm_kernel_L1_M1_BEGIN // Z only: tst/ands leave V untouched, so ble would read a stale V + +sgemm_kernel_L1_M2_20: + + INIT2x1 + + mov BO, BC + asrs L , K1, #3 // L = K1 / 8 (unrolled-by-8 iterations) + ble sgemm_kernel_L1_M2_40 + +sgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M2_22 + + +sgemm_kernel_L1_M2_40: + + ands L , K1, #7 // L = K1 % 8 (leftover iterations) + beq sgemm_kernel_L1_M2_100 + +sgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M2_42 + +sgemm_kernel_L1_M2_100: + + SAVE2x1 + +sgemm_kernel_L1_M2_END: + + +sgemm_kernel_L1_M1_BEGIN: + + tst I, #1 // bit 0 of M: is a single row pending? + beq sgemm_kernel_L1_END // Z only: tst does not update V + +sgemm_kernel_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #3 // L = K1 / 8 (unrolled-by-8 iterations) + ble sgemm_kernel_L1_M1_40 + +sgemm_kernel_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M1_22 + + +sgemm_kernel_L1_M1_40: + + ands L , K1, #7 // L = K1 % 8 (leftover iterations) + beq sgemm_kernel_L1_M1_100 + +sgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M1_42 + +sgemm_kernel_L1_M1_100: + + SAVE1x1 + + +sgemm_kernel_L1_END: + + +sgemm_kernel_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/strmm_kernel_4x2_vfp.S b/kernel/arm/strmm_kernel_4x2_vfp.S new file mode 100644 index 000000000..5394a6444 --- /dev/null +++ b/kernel/arm/strmm_kernel_4x2_vfp.S @@ -0,0 +1,1083 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1.
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/23 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 252 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA s0 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define KK [fp, #-240 ] +#define KKK [fp, #-244] +#define C [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-276 ] + +#define B [fp, #4 ] +#define OLD_C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 64 +#define B_PRE 64 +#define C_PRE 64 + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9, s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + vmov.f32 s14, s8 + vmov.f32 s15, s8 + +.endm + + + +.macro KERNEL4x2_SUB + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + fmacs s8 , s0, s4 + fmacs s9 , s1, s4 + fmacs s10 , s2, s4 + fmacs s11 , s3, s4 + + fmacs s12 , s0, s5 + fmacs s13 , s1, s5 + fmacs s14 , s2, s5 + fmacs s15 , s3, s5 + + add AO , AO, #16 + add BO , BO, #8 + 
+.endm + +.macro SAVE4x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + fmuls s4 , s0 , s8 + fmuls s5 , s0 , s9 + fmuls s6 , s0 , s10 + fmuls s7 , s0 , s11 + + fsts s4 , [CO1] + fsts s5 , [CO1, #4 ] + fsts s6 , [CO1, #8 ] + fsts s7 , [CO1, #12 ] + + fmuls s4 , s0 , s12 + fmuls s5 , s0 , s13 + fmuls s6 , s0 , s14 + fmuls s7 , s0 , s15 + + fsts s4 , [CO2] + fsts s5 , [CO2, #4 ] + fsts s6 , [CO2, #8 ] + fsts s7 , [CO2, #12 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT2x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9, s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + +.endm + +.macro KERNEL2x2_SUB + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s8 , s0, s4 + fmacs s9 , s1, s4 + + fmacs s12 , s0, s5 + fmacs s13 , s1, s5 + + add AO , AO, #8 + add BO , BO, #8 + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + + fmuls s4 , s0 , s8 + fmuls s5 , s0 , s9 + + fsts s4 , [CO1] + fsts s5 , [CO1, #4 ] + + fmuls s4 , s0 , s12 + fmuls s5 , s0 , s13 + + fsts s4 , [CO2] + fsts s5 , [CO2, #4 ] + + add CO1, CO1, #8 + +.endm + + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s12, s8 + +.endm + +.macro KERNEL1x2_SUB + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + flds s0 , [ AO ] + + fmacs s8 , s0, s4 + + fmacs s12 , s0, s5 + + add AO , AO, #4 + add BO , BO, #8 + +.endm + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + + fmuls s4 , s0 , s8 + + fsts s4 , [CO1] + + + fmuls s4 , s0 , s12 + + fsts s4 , [CO2] + + add CO1, CO1, #4 + +.endm + + + +/******************************************************************************/ + +.macro INIT4x1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9, s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + +.endm + + + +.macro KERNEL4x1_SUB + + flds s4 , [ BO ] + + flds s0 , [ AO ] + flds 
s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + fmacs s8 , s0, s4 + fmacs s9 , s1, s4 + fmacs s10 , s2, s4 + fmacs s11 , s3, s4 + + add AO , AO, #16 + add BO , BO, #4 + +.endm + +.macro SAVE4x1 + + flds s0, ALPHA + + fmuls s4 , s0 , s8 + fmuls s5 , s0 , s9 + fmuls s6 , s0 , s10 + fmuls s7 , s0 , s11 + + fsts s4 , [CO1] + fsts s5 , [CO1, #4 ] + fsts s6 , [CO1, #8 ] + fsts s7 , [CO1, #12 ] + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + +.endm + +.macro KERNEL2x1_SUB + + flds s4 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s8 , s0, s4 + fmacs s9 , s1, s4 + + add AO , AO, #8 + add BO , BO, #4 + +.endm + +.macro SAVE2x1 + + flds s0, ALPHA + + fmuls s4 , s0 , s8 + fmuls s5 , s0 , s9 + + fsts s4 , [CO1] + fsts s5 , [CO1, #4 ] + + add CO1, CO1, #8 + +.endm + + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f32 s8 , s8 , s8 + +.endm + +.macro KERNEL1x1_SUB + + flds s4 , [ BO ] + + flds s0 , [ AO ] + + fmacs s8 , s0, s4 + + add AO , AO, #4 + add BO , BO, #4 + +.endm + +.macro SAVE1x1 + + flds s0, ALPHA + + fmuls s4 , s0 , s8 + + fsts s4 , [CO1] + + add CO1, CO1, #4 + +.endm + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { s8 - s15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #2 // ldc = ldc * 4 + str r3, LDC + + ldr r3, OLD_C + str r3, C + + ldr BC, B + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif + str r3 , KK + + ldr J, N + asrs 
J, J, #1 // J = J / 2 + ble _L1_BEGIN + +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + + +_L2_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L2_M2_BEGIN + +_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #4 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L2_M4_40 + .align 5 + +_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_22 + + +_L2_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M4_100 + +_L2_M4_42: + + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_42 + +_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + +_L2_M4_END: + + subs I, I, #1 + bgt _L2_M4_20 + + +_L2_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L2_END + + tst I, #2 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC 
+#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #2 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L2_M2_40 + +_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_22 + + +_L2_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_42: + + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_42 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L2_M2_END: + + +_L2_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #2 // 1 float value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #1 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + 
+_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #2 // 1 float value + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #3 // k * 2 * 4 + add r3, r3, r4 // B = B + K * 2 * 4 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in BO + str r3 , KK +#endif + + + subs J , #1 // j-- + bgt _L2_BEGIN + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +_L1_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L1_M2_BEGIN + +_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #4 // number of values in AO +#else + add L , L , #1 // 
number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L1_M4_40 + .align 5 + +_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_22 + + +_L1_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M4_100 + +_L1_M4_42: + + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_42 + +_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + +_L1_M4_END: + + subs I, I, #1 + bgt _L1_M4_20 + + +_L1_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L1_END + + tst I, #2 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #2 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L1_M2_40 + +_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_22 + + +_L1_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_42: + + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_42 + +_L1_M2_100: + + SAVE2x1 + +#if 
(defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L1_M2_END: + + +_L1_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #2 // 1 float value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #1 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +_L1_END: + + +_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/param.h b/param.h index 568605bb8..ec1767d20 100644 --- a/param.h +++ b/param.h @@ -1843,7 +1843,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 2 #define DGEMM_DEFAULT_UNROLL_M 4 From 12e02a00e06b61e1825ac48d9071fce262042998 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 24 Nov 2013 08:46:47 +0100 Subject: [PATCH 57/81] added ncopy kernels for ARMV6 --- kernel/arm/KERNEL.ARMV6 | 14 +- kernel/arm/dgemm_ncopy_2_vfp.S | 225 +++++++++++++++++++++++++++++++++ kernel/arm/sgemm_ncopy_2_vfp.S | 225 +++++++++++++++++++++++++++++++++ 3 files changed, 457 insertions(+), 7 deletions(-) create mode 100644 kernel/arm/dgemm_ncopy_2_vfp.S create mode 100644 kernel/arm/sgemm_ncopy_2_vfp.S diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index b192b20da..1f2510bc3 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -85,22 +85,22 @@ DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -SGEMMKERNEL = sgemm_kernel_4x2_vfp.S -SGEMMINCOPY = ../generic/gemm_ncopy_4.c -SGEMMITCOPY = ../generic/gemm_tcopy_4.c +SGEMMKERNEL = sgemm_kernel_4x2_vfp.S +SGEMMINCOPY = sgemm_ncopy_4_vfp.S +SGEMMITCOPY = sgemm_tcopy_4_vfp.S SGEMMINCOPYOBJ = sgemm_incopy.o SGEMMITCOPYOBJ = sgemm_itcopy.o -SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMONCOPY = sgemm_ncopy_2_vfp.S SGEMMOTCOPY = ../generic/gemm_tcopy_2.c SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o DGEMMKERNEL = dgemm_kernel_4x2_vfp.S -DGEMMINCOPY = ../generic/gemm_ncopy_4.c -DGEMMITCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPY = dgemm_ncopy_4_vfp.S +DGEMMITCOPY = dgemm_tcopy_4_vfp.S DGEMMINCOPYOBJ = dgemm_incopy.o DGEMMITCOPYOBJ = dgemm_itcopy.o -DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMONCOPY = dgemm_ncopy_2_vfp.S DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o diff --git a/kernel/arm/dgemm_ncopy_2_vfp.S b/kernel/arm/dgemm_ncopy_2_vfp.S new file 
mode 100644 index 000000000..763c032e1 --- /dev/null +++ b/kernel/arm/dgemm_ncopy_2_vfp.S @@ -0,0 +1,225 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/24 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + +#define B [fp, #4 ] + +#define M r0 +#define N r1 +#define A r2 + +#define BO r5 + +#define AO1 r6 +#define AO2 r7 +#define LDA r8 + +#define I r3 +#define J r12 + +#define A_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY2x2 + + fldd d0 , [ AO1, #0 ] + fldd d2 , [ AO1, #8 ] + + fldd d1 , [ AO2, #0 ] + fldd d3 , [ AO2, #8 ] + + add AO1, AO1, #16 + fstmiad BO!, { d0 - d3 } + add AO2, AO2, #16 + +.endm + + +.macro COPY1x2 + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO2, #0 ] + add AO1, AO1, #8 + + fstmiad BO!, { d0 - d1 } + add AO2, AO2, #8 + +.endm + +.macro COPY2x1 + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO1, #8 ] + + fstmiad BO!, { d0 - d1 } + add AO1, AO1, #16 + +.endm + + +.macro COPY1x1 + + fldd d0 , [ AO1, #0 ] + + fstmiad BO!, { d0 } + add AO1, AO1, #8 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + + lsl LDA, OLD_LDA, #3 // lda = lda * 8 + + ldr BO, B + + +/*********************************************************************************************/ + +dgemm_ncopy_L2_BEGIN: + + asrs J, N, #1 // J = N / 2 + ble dgemm_ncopy_L1_BEGIN + +dgemm_ncopy_L2_M2_BEGIN: + + mov AO1, A // AO1 = A 
+ add AO2, AO1, LDA + add A , AO2, LDA // A = A + 2 * LDA + + asrs I, M, #1 // I = M / 2 + ble dgemm_ncopy_L2_M2_40 + +dgemm_ncopy_L2_M2_20: + + COPY2x2 + + subs I , I , #1 + bne dgemm_ncopy_L2_M2_20 + + +dgemm_ncopy_L2_M2_40: + + ands I, M , #1 + ble dgemm_ncopy_L2_M2_END + +dgemm_ncopy_L2_M2_60: + + COPY1x2 + + subs I , I , #1 + bne dgemm_ncopy_L2_M2_60 + + +dgemm_ncopy_L2_M2_END: + + subs J , J, #1 // j-- + bne dgemm_ncopy_L2_M2_BEGIN + +/*********************************************************************************************/ + +dgemm_ncopy_L1_BEGIN: + + tst N, #1 + ble dgemm_ncopy_L999 + + +dgemm_ncopy_L1_M2_BEGIN: + + mov AO1, A // AO1 = A + add A , AO1, LDA // A = A + 1 * LDA + + asrs I, M, #1 // I = M / 2 + ble dgemm_ncopy_L1_M2_40 + +dgemm_ncopy_L1_M2_20: + + COPY2x1 + + subs I , I , #1 + bne dgemm_ncopy_L1_M2_20 + + +dgemm_ncopy_L1_M2_40: + + ands I, M , #1 + ble dgemm_ncopy_L1_M2_END + +dgemm_ncopy_L1_M2_60: + + COPY1x1 + + subs I , I , #1 + bne dgemm_ncopy_L1_M2_60 + + +dgemm_ncopy_L1_M2_END: + + + +dgemm_ncopy_L999: + + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/sgemm_ncopy_2_vfp.S b/kernel/arm/sgemm_ncopy_2_vfp.S new file mode 100644 index 000000000..0546f1d69 --- /dev/null +++ b/kernel/arm/sgemm_ncopy_2_vfp.S @@ -0,0 +1,225 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/24 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + +#define B [fp, #4 ] + +#define M r0 +#define N r1 +#define A r2 + +#define BO r5 + +#define AO1 r6 +#define AO2 r7 +#define LDA r8 + +#define I r3 +#define J r12 + +#define A_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY2x2 + + flds s0 , [ AO1, #0 ] + flds s2 , [ AO1, #4 ] + + flds s1 , [ AO2, #0 ] + flds s3 , [ AO2, #4 ] + + add AO1, AO1, #8 + fstmias BO!, { s0 - s3 } + add AO2, AO2, #8 + 
+.endm + + +.macro COPY1x2 + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO2, #0 ] + add AO1, AO1, #4 + + fstmias BO!, { s0 - s1 } + add AO2, AO2, #4 + +.endm + +.macro COPY2x1 + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO1, #4 ] + + fstmias BO!, { s0 - s1 } + add AO1, AO1, #8 + +.endm + + +.macro COPY1x1 + + flds s0 , [ AO1, #0 ] + + fstmias BO!, { s0 } + add AO1, AO1, #4 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + + lsl LDA, OLD_LDA, #2 // lda = lda * 4 + + ldr BO, B + + +/*********************************************************************************************/ + +sgemm_ncopy_L2_BEGIN: + + asrs J, N, #1 // J = N / 2 + ble sgemm_ncopy_L1_BEGIN + +sgemm_ncopy_L2_M2_BEGIN: + + mov AO1, A // AO1 = A + add AO2, AO1, LDA + add A , AO2, LDA // A = A + 2 * LDA + + asrs I, M, #1 // I = M / 2 + ble sgemm_ncopy_L2_M2_40 + +sgemm_ncopy_L2_M2_20: + + COPY2x2 + + subs I , I , #1 + bne sgemm_ncopy_L2_M2_20 + + +sgemm_ncopy_L2_M2_40: + + ands I, M , #1 + ble sgemm_ncopy_L2_M2_END + +sgemm_ncopy_L2_M2_60: + + COPY1x2 + + subs I , I , #1 + bne sgemm_ncopy_L2_M2_60 + + +sgemm_ncopy_L2_M2_END: + + subs J , J, #1 // j-- + bne sgemm_ncopy_L2_M2_BEGIN + +/*********************************************************************************************/ + +sgemm_ncopy_L1_BEGIN: + + tst N, #1 + ble sgemm_ncopy_L999 + + +sgemm_ncopy_L1_M2_BEGIN: + + mov AO1, A // AO1 = A + add A , AO1, LDA // A = A + 1 * LDA + + asrs I, M, #1 // I = M / 2 + ble sgemm_ncopy_L1_M2_40 + +sgemm_ncopy_L1_M2_20: + + COPY2x1 + + subs I , I , #1 + bne sgemm_ncopy_L1_M2_20 + + +sgemm_ncopy_L1_M2_40: + + ands I, M , #1 + ble sgemm_ncopy_L1_M2_END + +sgemm_ncopy_L1_M2_60: + + COPY1x1 + + subs I , I , #1 + bne sgemm_ncopy_L1_M2_60 + + +sgemm_ncopy_L1_M2_END: + + + 
+sgemm_ncopy_L999: + + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + From 25c605059328127d0e2612e94c0bbcb52a4bff3a Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 24 Nov 2013 12:03:28 +0100 Subject: [PATCH 58/81] add single and double precision gemv_n kernel for ARMV6 --- kernel/arm/KERNEL.ARMV6 | 4 +- kernel/arm/gemv_n_vfp.S | 675 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 677 insertions(+), 2 deletions(-) create mode 100644 kernel/arm/gemv_n_vfp.S diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 1f2510bc3..9dad061d9 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -70,8 +70,8 @@ DSWAPKERNEL = swap_vfp.S CSWAPKERNEL = swap_vfp.S ZSWAPKERNEL = swap_vfp.S -SGEMVNKERNEL = gemv_n.c -DGEMVNKERNEL = gemv_n.c +SGEMVNKERNEL = gemv_n_vfp.S +DGEMVNKERNEL = gemv_n_vfp.S CGEMVNKERNEL = zgemv_n.c ZGEMVNKERNEL = zgemv_n.c diff --git a/kernel/arm/gemv_n_vfp.S b/kernel/arm/gemv_n_vfp.S new file mode 100644 index 000000000..47265994c --- /dev/null +++ b/kernel/arm/gemv_n_vfp.S @@ -0,0 +1,675 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/24 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#define OLD_A r3 +#define OLD_M r0 + +#define AO1 r0 +#define N r1 +#define J r2 + +#define AO2 r4 +#define XO r5 +#define YO r6 +#define LDA r7 +#define INC_X r8 +#define INC_Y r9 + +#define I r12 + +#define M [fp, #-252 ] +#define A [fp, #-256 ] + + +#define X_PRE 64 +#define Y_PRE 0 +#define A_PRE 0 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + + +#if defined(DOUBLE) + +.macro INIT_F4 + + pld [ YO , #Y_PRE ] + + vsub.f64 d12 , d12 , d12 + vmov.f64 d13 , d12 + vmov.f64 d14 , d12 + vmov.f64 d15 , d12 + +.endm + +.macro KERNEL_F4X4 + + pld [ XO 
, #X_PRE ] + KERNEL_F4X1 + KERNEL_F4X1 + KERNEL_F4X1 + KERNEL_F4X1 + +.endm + + +.macro KERNEL_F4X1 + + fldmiad XO! , { d2 } + fldmiad AO1 , { d8 - d11 } + + vmla.f64 d12 , d2 , d8 + pld [ AO2 , #A_PRE ] + vmla.f64 d13 , d2 , d9 + add AO1, AO1, LDA + vmla.f64 d14 , d2 , d10 + vmla.f64 d15 , d2 , d11 + add AO2, AO2, LDA + +.endm + +.macro SAVE_F4 + + fldmiad YO, { d4 - d7 } + + vmla.f64 d4 , d0, d12 + vmla.f64 d5 , d0, d13 + vmla.f64 d6 , d0, d14 + vmla.f64 d7 , d0, d15 + + fstmiad YO!, { d4 - d7 } + +.endm + + +.macro INIT_F1 + + vsub.f64 d12 , d12 , d12 + +.endm + + + +.macro KERNEL_F1X1 + + fldmiad XO! , { d2 } + fldmiad AO1 , { d8 } + vmla.f64 d12 , d2 , d8 + add AO1, AO1, LDA + +.endm + +.macro SAVE_F1 + + fldmiad YO, { d4 } + vmla.f64 d4, d0, d12 + fstmiad YO!, { d4 } + +.endm + +/*********************************************************************************************/ + +.macro INIT_S4 + + vsub.f64 d12 , d12 , d12 + vmov.f64 d13 , d12 + vmov.f64 d14 , d12 + vmov.f64 d15 , d12 + +.endm + +.macro KERNEL_S4X4 + + KERNEL_S4X1 + KERNEL_S4X1 + KERNEL_S4X1 + KERNEL_S4X1 + +.endm + + +.macro KERNEL_S4X1 + + pld [ AO2 , #A_PRE ] + fldmiad XO , { d2 } + fldmiad AO1 , { d8 - d11 } + + vmla.f64 d12 , d2 , d8 + add AO1, AO1, LDA + vmla.f64 d13 , d2 , d9 + add AO2, AO2, LDA + vmla.f64 d14 , d2 , d10 + vmla.f64 d15 , d2 , d11 + add XO, XO , INC_X + +.endm + +.macro SAVE_S4 + + fldmiad YO, { d4 } + vmla.f64 d4 , d0, d12 + fstmiad YO, { d4 } + add YO, YO, INC_Y + + fldmiad YO, { d5 } + vmla.f64 d5 , d0, d13 + fstmiad YO, { d5 } + add YO, YO, INC_Y + + fldmiad YO, { d4 } + vmla.f64 d4 , d0, d14 + fstmiad YO, { d4 } + add YO, YO, INC_Y + + fldmiad YO, { d5 } + vmla.f64 d5 , d0, d15 + fstmiad YO, { d5 } + add YO, YO, INC_Y + +.endm + + +.macro INIT_S1 + + vsub.f64 d12 , d12 , d12 + +.endm + + + +.macro KERNEL_S1X1 + + fldmiad XO , { d2 } + fldmiad AO1 , { d8 } + vmla.f64 d12 , d2 , d8 + add AO1, AO1, LDA + add XO, XO , INC_X + +.endm + +.macro SAVE_S1 + + fldmiad YO, { d4 } 
+ vmla.f64 d4, d0, d12 + fstmiad YO , { d4 } + add YO, YO, INC_Y + +.endm + + + + +#else /************************* SINGLE PRECISION *****************************************/ + +.macro INIT_F4 + + pld [ YO , #Y_PRE ] + + vsub.f32 s12 , s12 , s12 + vmov.f32 s13 , s12 + vmov.f32 s14 , s12 + vmov.f32 s15 , s12 + +.endm + +.macro KERNEL_F4X4 + + pld [ XO , #X_PRE ] + KERNEL_F4X1 + KERNEL_F4X1 + KERNEL_F4X1 + KERNEL_F4X1 + +.endm + + +.macro KERNEL_F4X1 + + fldmias XO! , { s2 } + fldmias AO1 , { s8 - s11 } + + vmla.f32 s12 , s2 , s8 + vmla.f32 s13 , s2 , s9 + vmla.f32 s14 , s2 , s10 + vmla.f32 s15 , s2 , s11 + add AO1, AO1, LDA + add AO2, AO2, LDA + +.endm + +.macro SAVE_F4 + + fldmias YO, { s4 - s7 } + + vmla.f32 s4 , s0, s12 + vmla.f32 s5 , s0, s13 + vmla.f32 s6 , s0, s14 + vmla.f32 s7 , s0, s15 + + fstmias YO!, { s4 - s7 } + +.endm + + +.macro INIT_F1 + + vsub.f32 s12 , s12 , s12 + +.endm + + + +.macro KERNEL_F1X1 + + fldmias XO! , { s2 } + fldmias AO1 , { s8 } + vmla.f32 s12 , s2 , s8 + add AO1, AO1, LDA + +.endm + +.macro SAVE_F1 + + fldmias YO, { s4 } + vmla.f32 s4, s0, s12 + fstmias YO!, { s4 } + +.endm + +/*********************************************************************************************/ + +.macro INIT_S4 + + vsub.f32 s12 , s12 , s12 + vmov.f32 s13 , s12 + vmov.f32 s14 , s12 + vmov.f32 s15 , s12 + +.endm + +.macro KERNEL_S4X4 + + KERNEL_S4X1 + KERNEL_S4X1 + KERNEL_S4X1 + KERNEL_S4X1 + +.endm + + +.macro KERNEL_S4X1 + + pld [ AO2 , #A_PRE ] + fldmias XO , { s2 } + fldmias AO1 , { s8 - s11 } + + vmla.f32 s12 , s2 , s8 + vmla.f32 s13 , s2 , s9 + vmla.f32 s14 , s2 , s10 + vmla.f32 s15 , s2 , s11 + add AO1, AO1, LDA + add AO2, AO2, LDA + add XO, XO , INC_X + +.endm + +.macro SAVE_S4 + + fldmias YO, { s4 } + vmla.f32 s4 , s0, s12 + fstmias YO, { s4 } + add YO, YO, INC_Y + + fldmias YO, { s5 } + vmla.f32 s5 , s0, s13 + fstmias YO, { s5 } + add YO, YO, INC_Y + + fldmias YO, { s4 } + vmla.f32 s4 , s0, s14 + fstmias YO, { s4 } + add YO, YO, INC_Y + + fldmias 
YO, { s5 } + vmla.f32 s5 , s0, s15 + fstmias YO, { s5 } + add YO, YO, INC_Y + +.endm + + +.macro INIT_S1 + + vsub.f32 s12 , s12 , s12 + +.endm + + + +.macro KERNEL_S1X1 + + fldmias XO , { s2 } + fldmias AO1 , { s8 } + vmla.f32 s12 , s2 , s8 + add AO1, AO1, LDA + add XO, XO , INC_X + +.endm + +.macro SAVE_S1 + + fldmias YO, { s4 } + vmla.f32 s4, s0, s12 + fstmias YO , { s4 } + add YO, YO, INC_Y + +.endm + + + + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4 - r9 , fp} + add fp, sp, #28 + sub sp, sp, #STACKSIZE // reserve stack + + sub r12, fp, #192 + +#if defined(DOUBLE) + vstm r12, { d8 - d15 } // store floating point registers +#else + vstm r12, { s8 - s15 } // store floating point registers +#endif + + cmp OLD_M, #0 + ble gemvn_kernel_L999 + + cmp N, #0 + ble gemvn_kernel_L999 + + str OLD_A, A + str OLD_M, M + + ldr INC_X , OLD_INC_X + ldr INC_Y , OLD_INC_Y + + cmp INC_X, #0 + beq gemvn_kernel_L999 + + cmp INC_Y, #0 + beq gemvn_kernel_L999 + + ldr LDA, OLD_LDA + + +#if defined(DOUBLE) + lsl LDA, LDA, #3 // LDA * SIZE +#else + lsl LDA, LDA, #2 // LDA * SIZE +#endif + + cmp INC_X, #1 + bne gemvn_kernel_S4_BEGIN + + cmp INC_Y, #1 + bne gemvn_kernel_S4_BEGIN + + +gemvn_kernel_F4_BEGIN: + + ldr YO , Y + + ldr I, M + asrs I, I, #2 // I = M / 4 + ble gemvn_kernel_F1_BEGIN + +gemvn_kernel_F4X4: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO1, #4*SIZE + str r3 , A + + ldr XO , X + + INIT_F4 + + asrs J, N, #2 // J = N / 4 + ble gemvn_kernel_F4X1 + + +gemvn_kernel_F4X4_10: + + KERNEL_F4X4 + + subs J, J, #1 + bne gemvn_kernel_F4X4_10 + + +gemvn_kernel_F4X1: + + ands J, N , #3 + ble gemvn_kernel_F4_END + +gemvn_kernel_F4X1_10: + + KERNEL_F4X1 + + subs J, J, #1 + bne gemvn_kernel_F4X1_10 + + +gemvn_kernel_F4_END: + + SAVE_F4 + + subs I , I , #1 + bne gemvn_kernel_F4X4 + + 
+gemvn_kernel_F1_BEGIN: + + ldr I, M + ands I, I , #3 + ble gemvn_kernel_L999 + +gemvn_kernel_F1X1: + + ldr AO1, A + add r3, AO1, #SIZE + str r3, A + + ldr XO , X + + INIT_F1 + + mov J, N + + +gemvn_kernel_F1X1_10: + + KERNEL_F1X1 + + subs J, J, #1 + bne gemvn_kernel_F1X1_10 + + +gemvn_kernel_F1_END: + + SAVE_F1 + + subs I , I , #1 + bne gemvn_kernel_F1X1 + + b gemvn_kernel_L999 + + + +/*************************************************************************************************************/ + +gemvn_kernel_S4_BEGIN: + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE + lsl INC_Y, INC_Y, #2 // INC_Y * SIZE +#endif + + ldr YO , Y + + ldr I, M + asrs I, I, #2 // I = M / 4 + ble gemvn_kernel_S1_BEGIN + +gemvn_kernel_S4X4: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO1, #4*SIZE + str r3 , A + + ldr XO , X + + INIT_S4 + + asrs J, N, #2 // J = N / 4 + ble gemvn_kernel_S4X1 + + +gemvn_kernel_S4X4_10: + + KERNEL_S4X4 + + subs J, J, #1 + bne gemvn_kernel_S4X4_10 + + +gemvn_kernel_S4X1: + + ands J, N , #3 + ble gemvn_kernel_S4_END + +gemvn_kernel_S4X1_10: + + KERNEL_S4X1 + + subs J, J, #1 + bne gemvn_kernel_S4X1_10 + + +gemvn_kernel_S4_END: + + SAVE_S4 + + subs I , I , #1 + bne gemvn_kernel_S4X4 + + +gemvn_kernel_S1_BEGIN: + + ldr I, M + ands I, I , #3 + ble gemvn_kernel_L999 + +gemvn_kernel_S1X1: + + ldr AO1, A + add r3, AO1, #SIZE + str r3, A + + ldr XO , X + + INIT_S1 + + mov J, N + + +gemvn_kernel_S1X1_10: + + KERNEL_S1X1 + + subs J, J, #1 + bne gemvn_kernel_S1X1_10 + + +gemvn_kernel_S1_END: + + SAVE_S1 + + subs I , I , #1 + bne gemvn_kernel_S1X1 + + +/*************************************************************************************************************/ + +gemvn_kernel_L999: + + sub r3, fp, #192 + +#if defined(DOUBLE) + vldm r3, { d8 - d15 } // restore floating point registers +#else + vldm r3, { s8 - s15 } // restore floating point registers +#endif + + mov r0, 
#0 // set return value + + sub sp, fp, #28 + pop {r4 -r9 ,fp} + bx lr + + EPILOGUE + From e25de3d182097290b456d0a8707e9cc66949f24d Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 24 Nov 2013 13:22:49 +0100 Subject: [PATCH 59/81] changed default optimization flag for ARM from -O2 to -O3 --- Makefile.system | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Makefile.system b/Makefile.system index e5358f65b..0f5e9c6d5 100644 --- a/Makefile.system +++ b/Makefile.system @@ -830,8 +830,12 @@ COMMON_OPT += -g endif ifndef COMMON_OPT +ifeq ($(ARCH), arm) +COMMON_OPT = -O3 +else COMMON_OPT = -O2 endif +endif override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) @@ -929,6 +933,10 @@ export HAVE_SSE4_2 export HAVE_SSE4A export HAVE_SSE5 export HAVE_AVX +export HAVE_VFP +export HAVE_VFPV3 +export HAVE_VFPV4 +export HAVE_NEON export KERNELDIR export FUNCTION_PROFILE export TARGET_CORE From fe5f46c3304946ac1c536d5edd593ebaf21f71d9 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 24 Nov 2013 15:47:00 +0100 Subject: [PATCH 60/81] added experimental support for ARMV8 --- Makefile.arm64 | 7 ++ Makefile.system | 8 ++ c_check | 2 + common.h | 6 +- common_arm64.h | 169 ++++++++++++++++++++++++++++++++++++ ctest.c | 3 + getarch.c | 16 ++++ kernel/Makefile.L3 | 4 + kernel/arm64/KERNEL | 46 ++++++++++ kernel/arm64/KERNEL.ARMV8 | 134 ++++++++++++++++++++++++++++ kernel/arm64/Makefile | 2 + lapack/laswp/arm64/Makefile | 33 +++++++ param.h | 40 +++++++++ 13 files changed, 469 insertions(+), 1 deletion(-) create mode 100644 Makefile.arm64 create mode 100644 common_arm64.h create mode 100644 kernel/arm64/KERNEL create mode 100644 kernel/arm64/KERNEL.ARMV8 create mode 100644 kernel/arm64/Makefile create mode 100644 lapack/laswp/arm64/Makefile diff --git a/Makefile.arm64 b/Makefile.arm64 new file mode 100644 index 000000000..a4f8bab6b --- /dev/null +++ b/Makefile.arm64 @@ -0,0 +1,7 @@ + +ifeq ($(CORE), ARMV8) +CCOMMON_OPT += -march=armv8-a +FCOMMON_OPT += -march=armv8-a +endif 
+ + diff --git a/Makefile.system b/Makefile.system index 0f5e9c6d5..aceadf2b6 100644 --- a/Makefile.system +++ b/Makefile.system @@ -367,6 +367,14 @@ ifeq ($(ARCH), arm) NO_BINARY_MODE = 1 BINARY_DEFINED = 1 endif + +ifeq ($(ARCH), arm64) +NO_BINARY_MODE = 1 +BINARY_DEFINED = 1 +endif + + + # # C Compiler dependent settings # diff --git a/c_check b/c_check index c1cdd59c4..0828a5bba 100644 --- a/c_check +++ b/c_check @@ -64,6 +64,7 @@ $architecture = alpha if ($data =~ /ARCH_ALPHA/); $architecture = sparc if ($data =~ /ARCH_SPARC/); $architecture = ia64 if ($data =~ /ARCH_IA64/); $architecture = arm if ($data =~ /ARCH_ARM/); +$architecture = arm64 if ($data =~ /ARCH_ARM64/); $defined = 0; @@ -151,6 +152,7 @@ $architecture = alpha if ($data =~ /ARCH_ALPHA/); $architecture = sparc if ($data =~ /ARCH_SPARC/); $architecture = ia64 if ($data =~ /ARCH_IA64/); $architecture = arm if ($data =~ /ARCH_ARM/); +$architecture = arm64 if ($data =~ /ARCH_ARM64/); $binformat = bin32; $binformat = bin64 if ($data =~ /BINARY_64/); diff --git a/common.h b/common.h index a2775520f..310fcad93 100644 --- a/common.h +++ b/common.h @@ -311,7 +311,7 @@ typedef int blasint; #endif -#ifdef ARMV7 +#if defined(ARMV7) || defined(ARMV6) || defined(ARMV8) #define YIELDING asm volatile ("nop;nop;nop;nop;nop;nop;nop;nop; \n"); #endif @@ -375,6 +375,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_arm.h" #endif +#ifdef ARCH_ARM64 +#include "common_arm64.h" +#endif + #ifdef OS_LINUX #include "common_linux.h" #endif diff --git a/common_arm64.h b/common_arm64.h new file mode 100644 index 000000000..2da0d894c --- /dev/null +++ b/common_arm64.h @@ -0,0 +1,169 @@ +/***************************************************************************** +Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#ifndef COMMON_ARM64 +#define COMMON_ARM64 + +#define MB +#define WMB + +#define INLINE inline + +#define RETURN_BY_COMPLEX + +#ifndef ASSEMBLER + +static void __inline blas_lock(volatile BLASULONG *address){ /* NOTE(review): the body below is commented out, so no lock is taken */ +/* + int register ret; + + do { + while (*address) {YIELDING;}; + + __asm__ __volatile__( + "ldrex r2, [%1] \n\t" + "mov r2, #0 \n\t" + "strex r3, r2, [%1] \n\t" + "mov %0 , r3 \n\t" + : "=r"(ret), "=r"(address) + : "1"(address) + : "memory", "r2" , "r3" + + + ); + + } while (ret); +*/ +} + + +static inline unsigned long long rpcc(void){ + unsigned long long ret=0; + double v; + struct timeval tv; + gettimeofday(&tv,NULL); + v=(double) tv.tv_sec + (double) tv.tv_usec * 1e-6; + ret = (unsigned long long) ( v * 1000.0 ); /* seconds -> milliseconds */ + return ret; +} + +static inline int blas_quickdivide(blasint x, blasint y){ + return x / y; +} + +#if defined(DOUBLE) +#define GET_IMAGE(res) __asm__ __volatile__("str d1, %0" : "=m"(res) : : "memory") +#else +#define GET_IMAGE(res) __asm__ __volatile__("str s1, %0" : "=m"(res) : : "memory") +#endif + +#define GET_IMAGE_CANCEL + +#endif + + +#ifndef F_INTERFACE +#define REALNAME ASMNAME +#else +#define REALNAME ASMFNAME +#endif + +#if defined(ASSEMBLER) && !defined(NEEDPARAM) + +#define PROLOGUE \ + .text ;\ + .global REALNAME ;\ + .func REALNAME ;\ +REALNAME: + +#define EPILOGUE + +#define PROFCODE + +#endif + + +#define SEEK_ADDRESS + +#ifndef PAGESIZE +#define PAGESIZE ( 4 << 10) +#endif +#define HUGE_PAGESIZE ( 4 << 20) + +#define BUFFER_SIZE (16 << 20) + + +#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) + +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif + +#endif diff --git a/ctest.c b/ctest.c index 184416339..86dc226d4 100644 --- a/ctest.c +++ b/ctest.c @@ -129,4 +129,7 @@ BINARY_64 ARCH_ARM #endif +#if defined(__aarch64__) +ARCH_ARM64 +#endif diff --git a/getarch.c b/getarch.c index 4407e3d9b..7975c9468 100644 --- 
a/getarch.c +++ b/getarch.c @@ -709,6 +709,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_ARMV8 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "ARMV8" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DARMV8 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ + "-DHAVE_VFP -DHAVE_VFPV3 -DHAVE_VFPV4" +#define LIBNAME "armv8" +#define CORENAME "ARMV8" +#else +#endif + + #ifndef FORCE diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index f543cd08d..b9b4bef1e 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -18,6 +18,10 @@ ifeq ($(ARCH), arm) USE_TRMM = 1 endif +ifeq ($(ARCH), arm64) +USE_TRMM = 1 +endif + ifeq ($(TARGET), LOONGSON3B) USE_TRMM = 1 endif diff --git a/kernel/arm64/KERNEL b/kernel/arm64/KERNEL new file mode 100644 index 000000000..aeccfbf4c --- /dev/null +++ b/kernel/arm64/KERNEL @@ -0,0 +1,46 @@ +ifndef SNRM2KERNEL +SNRM2KERNEL = nrm2.c +endif + +ifndef DNRM2KERNEL +DNRM2KERNEL = nrm2.c +endif + +ifndef CNRM2KERNEL +CNRM2KERNEL = znrm2.c +endif + +ifndef ZNRM2KERNEL +ZNRM2KERNEL = znrm2.c +endif + +ifndef SCABS_KERNEL +SCABS_KERNEL = ../generic/cabs.c +endif + +ifndef DCABS_KERNEL +DCABS_KERNEL = ../generic/cabs.c +endif + +ifndef QCABS_KERNEL +QCABS_KERNEL = ../generic/cabs.c +endif + +ifndef LSAME_KERNEL +LSAME_KERNEL = ../generic/lsame.c +endif + +ifndef SGEMM_BETA +SGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = ../generic/zgemm_beta.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = ../generic/zgemm_beta.c +endif + + diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 new file mode 100644 index 000000000..ecf278cf9 --- /dev/null +++ b/kernel/arm64/KERNEL.ARMV8 @@ -0,0 +1,134 @@ +SAMAXKERNEL = ../arm/amax.c +DAMAXKERNEL = ../arm/amax.c +CAMAXKERNEL = 
../arm/zamax.c +ZAMAXKERNEL = ../arm/zamax.c + +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMAXKERNEL = ../arm/iamax.c +IDAMAXKERNEL = ../arm/iamax.c +ICAMAXKERNEL = ../arm/izamax.c +IZAMAXKERNEL = ../arm/izamax.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +SASUMKERNEL = ../arm/asum.c +DASUMKERNEL = ../arm/asum.c +CASUMKERNEL = ../arm/zasum.c +ZASUMKERNEL = ../arm/zasum.c + +SAXPYKERNEL = ../arm/axpy.c +DAXPYKERNEL = ../arm/axpy.c +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c + +SCOPYKERNEL = ../arm/copy.c +DCOPYKERNEL = ../arm/copy.c +CCOPYKERNEL = ../arm/zcopy.c +ZCOPYKERNEL = ../arm/zcopy.c + +SDOTKERNEL = ../arm/dot.c +DDOTKERNEL = ../arm/dot.c +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c + +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c + +SROTKERNEL = ../arm/rot.c +DROTKERNEL = ../arm/rot.c +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c + +SSCALKERNEL = ../arm/scal.c +DSCALKERNEL = ../arm/scal.c +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c + +SSWAPKERNEL = ../arm/swap.c +DSWAPKERNEL = ../arm/swap.c +CSWAPKERNEL = ../arm/zswap.c +ZSWAPKERNEL = ../arm/zswap.c + +SGEMVNKERNEL = ../arm/gemv_n.c +DGEMVNKERNEL = ../arm/gemv_n.c +CGEMVNKERNEL = ../arm/zgemv_n.c +ZGEMVNKERNEL = ../arm/zgemv_n.c + +SGEMVTKERNEL = ../arm/gemv_t.c +DGEMVTKERNEL = ../arm/gemv_t.c +CGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = ../arm/zgemv_t.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c 
+ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + + + diff --git a/kernel/arm64/Makefile b/kernel/arm64/Makefile new file mode 100644 index 000000000..efae70d7b --- /dev/null +++ b/kernel/arm64/Makefile @@ -0,0 +1,2 @@ +clean :: + diff --git a/lapack/laswp/arm64/Makefile b/lapack/laswp/arm64/Makefile new file mode 100644 index 000000000..434c82a84 --- /dev/null +++ b/lapack/laswp/arm64/Makefile @@ -0,0 +1,33 @@ +TOPDIR = ../../.. 
+include ../../../Makefile.system + +ifeq ($(CORE), CORE2) +LASWP = ../generic/laswp_k_2.c +ZLASWP = ../generic/zlaswp_k_2.c +endif + +ifeq ($(CORE), OPTERON) +LASWP = ../generic/laswp_k_1.c +ZLASWP = ../generic/zlaswp_k_1.c +endif + +ifeq ($(CORE), PRESCOTT) +LASWP = ../generic/laswp_k_1.c +ZLASWP = ../generic/zlaswp_k_1.c +endif + +ifeq ($(DYNAMIC_ARCH), 1) +LASWP = ../generic/laswp_k_4.c +ZLASWP = ../generic/zlaswp_k_4.c +endif + +ifndef LASWP +LASWP = ../generic/laswp_k.c +endif + +ifndef ZLASWP +ZLASWP = ../generic/zlaswp_k.c +endif + +include ../generic/Makefile + diff --git a/param.h b/param.h index ec1767d20..0628a1972 100644 --- a/param.h +++ b/param.h @@ -1874,6 +1874,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif +#if defined(ARMV8) +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 2 + +#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_N 2 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + + +#define SYMV_P 16 +#endif + + #ifdef GENERIC From fd1d9fdb22955385c77a5743e5103543f5b43f96 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 24 Nov 2013 16:19:01 +0100 Subject: [PATCH 61/81] changed default optimization from -O2 to -O3 for ARMV8 --- Makefile.system | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/Makefile.system b/Makefile.system index 
aceadf2b6..5545de1b1 100644 --- a/Makefile.system +++ b/Makefile.system @@ -840,11 +840,19 @@ endif ifndef COMMON_OPT ifeq ($(ARCH), arm) COMMON_OPT = -O3 -else -COMMON_OPT = -O2 endif endif +ifndef COMMON_OPT +ifeq ($(ARCH), arm64) +COMMON_OPT = -O3 +endif +endif + +ifndef COMMON_OPT +COMMON_OPT = -O2 +endif + override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) From d2b20c5c51b1f174d40dd8879b8a7387ce5a5416 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 25 Nov 2013 12:25:58 +0100 Subject: [PATCH 62/81] add optimized axpy kernel --- kernel/arm/KERNEL.ARMV6 | 8 +- kernel/arm/KERNEL.ARMV7 | 8 +- kernel/arm/axpy_vfp.S | 503 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 511 insertions(+), 8 deletions(-) create mode 100644 kernel/arm/axpy_vfp.S diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 9dad061d9..79fb5f9c9 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -35,10 +35,10 @@ DASUMKERNEL = asum_vfp.S CASUMKERNEL = asum_vfp.S ZASUMKERNEL = asum_vfp.S -SAXPYKERNEL = axpy.c -DAXPYKERNEL = axpy.c -CAXPYKERNEL = zaxpy.c -ZAXPYKERNEL = zaxpy.c +SAXPYKERNEL = axpy_vfp.S +DAXPYKERNEL = axpy_vfp.S +CAXPYKERNEL = axpy_vfp.S +ZAXPYKERNEL = axpy_vfp.S SCOPYKERNEL = scopy_vfp.S DCOPYKERNEL = dcopy_vfp.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 07d38d189..143327074 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -40,10 +40,10 @@ DASUMKERNEL = asum_vfp.S CASUMKERNEL = asum_vfp.S ZASUMKERNEL = asum_vfp.S -SAXPYKERNEL = ../arm/axpy.c -DAXPYKERNEL = ../arm/axpy.c -CAXPYKERNEL = ../arm/zaxpy.c -ZAXPYKERNEL = ../arm/zaxpy.c +SAXPYKERNEL = axpy_vfp.S +DAXPYKERNEL = axpy_vfp.S +CAXPYKERNEL = axpy_vfp.S +ZAXPYKERNEL = axpy_vfp.S SCOPYKERNEL = scopy_vfp.S DCOPYKERNEL = dcopy_vfp.S diff --git a/kernel/arm/axpy_vfp.S b/kernel/arm/axpy_vfp.S new file mode 100644 index 000000000..acc575707 --- 
/dev/null +++ b/kernel/arm/axpy_vfp.S @@ -0,0 +1,503 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/14 Saar +* BLASTEST : xOK +* CTEST : xOK +* TEST : xOK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_INC_X [fp, #0 ] +#define OLD_Y [fp, #4 ] +#define OLD_INC_Y [fp, #8 ] + + +#define N r0 +#define Y r1 +#define INC_X r2 +#define X r3 +#define INC_Y r4 + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +/*****************************************************************************************/ + +#if !defined(CONJ) + +#if defined(DOUBLE) + +#define FMAC_R1 fmacd +#define FMAC_R2 fnmacd +#define FMAC_I1 fmacd +#define FMAC_I2 fmacd + +#else + +#define FMAC_R1 fmacs +#define FMAC_R2 fnmacs +#define FMAC_I1 fmacs +#define FMAC_I2 fmacs + +#endif + +#else // CONJ + +#if defined(DOUBLE) + +#define FMAC_R1 fmacd +#define FMAC_R2 fmacd +#define FMAC_I1 fnmacd +#define FMAC_I2 fmacd + +#else + +#define FMAC_R1 fmacs +#define FMAC_R2 fmacs +#define FMAC_I1 fnmacs +#define FMAC_I2 fmacs + +#endif + +#endif + + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + fldmiad X!, { d4 - d7 } + pld [ Y, #X_PRE ] + fldmiad Y , { d8 - d11 } + fmacd d8 , d0, d4 + fstmiad Y!, { d8 } + fmacd d9 , d0, d5 + fstmiad Y!, { d9 } + fmacd d10, d0, d6 + fstmiad Y!, { d10 } + fmacd d11, d0, d7 + fstmiad Y!, { d11 } + + +.endm + + +.macro KERNEL_F1 + + fldmiad X!, { d4 } + fldmiad Y , { d8 } + fmacd d8 , d0, d4 + fstmiad Y!, { d8 } + +.endm + +.macro KERNEL_S1 + + fldmiad X , { d4 } + fldmiad Y , { d8 } + fmacd d8 , d0, d4 + fstmiad Y , { d8 } + add X, X, INC_X + add Y, 
Y, INC_Y + +.endm + +#else + +.macro KERNEL_F4 + + fldmias X!, { s4 - s7 } + fldmias Y , { s8 - s11 } + fmacs s8 , s0, s4 + fstmias Y!, { s8 } + fmacs s9 , s0, s5 + fstmias Y!, { s9 } + fmacs s10, s0, s6 + fstmias Y!, { s10 } + fmacs s11, s0, s7 + fstmias Y!, { s11 } + + +.endm + + +.macro KERNEL_F1 + + fldmias X!, { s4 } + fldmias Y , { s8 } + fmacs s8 , s0, s4 + fstmias Y!, { s8 } + +.endm + +.macro KERNEL_S1 + + fldmias X , { s4 } + fldmias Y , { s8 } + fmacs s8 , s0, s4 + fstmias Y , { s8 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +#endif + +#else + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + fldmiad X!, { d4 - d7 } + pld [ Y, #X_PRE ] + fldmiad Y , { d8 - d11 } + + FMAC_R1 d8 , d0, d4 + FMAC_R2 d8 , d1, d5 + FMAC_I1 d9 , d0, d5 + FMAC_I2 d9 , d1, d4 + fstmiad Y!, { d8 } + fstmiad Y!, { d9 } + + FMAC_R1 d10, d0, d6 + FMAC_R2 d10, d1, d7 + FMAC_I1 d11, d0, d7 + FMAC_I2 d11, d1, d6 + fstmiad Y!, { d10 } + fstmiad Y!, { d11 } + + pld [ X, #X_PRE ] + fldmiad X!, { d4 - d7 } + pld [ Y, #X_PRE ] + fldmiad Y , { d8 - d11 } + + FMAC_R1 d8 , d0, d4 + FMAC_R2 d8 , d1, d5 + FMAC_I1 d9 , d0, d5 + FMAC_I2 d9 , d1, d4 + fstmiad Y!, { d8 } + fstmiad Y!, { d9 } + + FMAC_R1 d10, d0, d6 + FMAC_R2 d10, d1, d7 + FMAC_I1 d11, d0, d7 + FMAC_I2 d11, d1, d6 + fstmiad Y!, { d10 } + fstmiad Y!, { d11 } + + + + + +.endm + + +.macro KERNEL_F1 + + fldmiad X!, { d4 - d5 } + fldmiad Y , { d8 - d9 } + + FMAC_R1 d8 , d0, d4 + FMAC_R2 d8 , d1, d5 + FMAC_I1 d9 , d0, d5 + FMAC_I2 d9 , d1, d4 + fstmiad Y!, { d8 } + fstmiad Y!, { d9 } + + + +.endm + +.macro KERNEL_S1 + + fldmiad X , { d4 - d5 } + fldmiad Y , { d8 - d9 } + + FMAC_R1 d8 , d0, d4 + FMAC_R2 d8 , d1, d5 + FMAC_I1 d9 , d0, d5 + FMAC_I2 d9 , d1, d4 + fstmiad Y , { d8 - d9 } + + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +#else + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + fldmias X!, { s4 - s7 } + pld [ Y, #X_PRE ] + fldmias Y , { s8 - s11 } + + FMAC_R1 s8 , s0, s4 + FMAC_R2 s8 , s1, s5 + FMAC_I1 s9 , s0, 
s5
+	FMAC_I2	s9 , s1, s4
+	fstmias	Y!, { s8 }
+	fstmias	Y!, { s9 }
+
+	FMAC_R1	s10, s0, s6
+	FMAC_R2	s10, s1, s7
+	FMAC_I1	s11, s0, s7
+	FMAC_I2	s11, s1, s6
+	fstmias	Y!, { s10 }
+	fstmias	Y!, { s11 }
+
+	fldmias	X!, { s4 - s7 }
+	fldmias	Y , { s8 - s11 }
+
+	FMAC_R1	s8 , s0, s4
+	FMAC_R2	s8 , s1, s5
+	FMAC_I1	s9 , s0, s5
+	FMAC_I2	s9 , s1, s4
+	fstmias	Y!, { s8 }
+	fstmias	Y!, { s9 }
+
+	FMAC_R1	s10, s0, s6
+	FMAC_R2	s10, s1, s7
+	FMAC_I1	s11, s0, s7
+	FMAC_I2	s11, s1, s6
+	fstmias	Y!, { s10 }
+	fstmias	Y!, { s11 }
+
+
+
+
+
+.endm
+
+
+.macro KERNEL_F1
+// Unit-stride step for one (s4, s5) / (s8, s9) register pair: X and Y are post-incremented. FMAC_* and the s0/s1 setup are defined outside this view (presumably complex y += alpha * x - TODO confirm).
+	fldmias	X!, { s4 - s5 }
+	fldmias	Y , { s8 - s9 }
+
+	FMAC_R1	s8 , s0, s4
+	FMAC_R2	s8 , s1, s5
+	FMAC_I1	s9 , s0, s5
+	FMAC_I2	s9 , s1, s4
+	fstmias	Y!, { s8 }
+	fstmias	Y!, { s9 }
+
+
+
+.endm
+
+.macro KERNEL_S1
+// Strided step: same update as KERNEL_F1, then X and Y advance by INC_X / INC_Y (byte strides once the driver below has scaled them).
+	fldmias	X , { s4 - s5 }
+	fldmias	Y , { s8 - s9 }
+
+	FMAC_R1	s8 , s0, s4
+	FMAC_R2	s8 , s1, s5
+	FMAC_I1	s9 , s0, s5
+	FMAC_I2	s9 , s1, s4
+	fstmias	Y , { s8 - s9 }
+
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+.endm
+
+
+#endif
+
+#endif
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+// axpy driver: applies the KERNEL_* update to N elements of Y using X and returns 0 in r0.
+	PROLOGUE
+// N, X, Y, INC_X, INC_Y, I, the OLD_* operands and STACKSIZE are #defines from earlier in the file (outside this view).
+	.align	5
+	push	{r4 , fp}
+	add	fp, sp, #8	// fp = sp at entry (two words were pushed)
+	sub	sp, sp, #STACKSIZE				// reserve stack
+
+	ldr	INC_X , OLD_INC_X
+	ldr	Y, OLD_Y
+	ldr	INC_Y , OLD_INC_Y
+
+	sub	r12, fp, #128	// r12 = start of the FP register save area
+
+#if	defined(DOUBLE)
+	vstm	r12, { d8 - d15}				// store floating point registers
+#else
+	vstm	r12, { s8 - s15}				// store floating point registers
+#endif
+// Early exits (N <= 0 or a zero increment) go through L999 so the registers saved above are restored.
+	cmp	N, #0
+	ble	axpy_kernel_L999
+
+	cmp	INC_X, #0
+	beq	axpy_kernel_L999
+
+	cmp	INC_Y, #0
+	beq	axpy_kernel_L999
+// Any increment other than 1 selects the strided path.
+	cmp	INC_X, #1
+	bne	axpy_kernel_S_BEGIN
+
+	cmp	INC_Y, #1
+	bne	axpy_kernel_S_BEGIN
+
+
+axpy_kernel_F_BEGIN:
+// Unit-stride path: KERNEL_F4 is issued N / 4 times, KERNEL_F1 handles the N % 4 leftovers.
+
+	asrs	I, N, #2					// I = N / 4
+	ble	axpy_kernel_F1
+
+	.align	5
+
+axpy_kernel_F4:
+// Loop body is unrolled twice; the ble in the middle leaves as soon as the count runs out after the first copy.
+#if !defined(COMPLEX) && !defined(DOUBLE)
+	pld	[ X, #X_PRE ]
+	pld	[ Y, #X_PRE ]
+#endif
+
+	KERNEL_F4
+	subs	I, I, #1
+	ble	axpy_kernel_F1
+
+	KERNEL_F4
+
+	subs	I, I, #1
+	bne	axpy_kernel_F4
+
+axpy_kernel_F1:
+
+	ands	I, N, #3	// I = N % 4
+	ble	axpy_kernel_L999	// NOTE(review): ands does not write V, so this acts as beq only while V is still clear from the cmp/subs above
+
+axpy_kernel_F10:
+
+	KERNEL_F1
+
+	subs	I, I, #1
+	bne	axpy_kernel_F10
+
+	b	axpy_kernel_L999
+
+axpy_kernel_S_BEGIN:
+// Strided path: first turn the element increments into byte strides.
+#if defined(COMPLEX)
+
+#if defined(DOUBLE)
+	lsl	INC_X, INC_X, #4				// INC_X * SIZE * 2
+	lsl	INC_Y, INC_Y, #4				// INC_Y * SIZE * 2
+#else
+	lsl	INC_X, INC_X, #3				// INC_X * SIZE * 2
+	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE * 2
+#endif
+
+#else
+
+#if defined(DOUBLE)
+	lsl	INC_X, INC_X, #3				// INC_X * SIZE
+	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE
+#else
+	lsl	INC_X, INC_X, #2				// INC_X * SIZE
+	lsl	INC_Y, INC_Y, #2				// INC_Y * SIZE
+#endif
+
+#endif
+
+
+	asrs	I, N, #2					// I = N / 4
+	ble	axpy_kernel_S1
+
+	.align	5
+
+axpy_kernel_S4:
+// Four single-element steps per trip.
+	KERNEL_S1
+	KERNEL_S1
+	KERNEL_S1
+	KERNEL_S1
+
+	subs	I, I, #1
+	bne	axpy_kernel_S4
+
+axpy_kernel_S1:
+
+	ands	I, N, #3	// I = N % 4
+	ble	axpy_kernel_L999
+
+axpy_kernel_S10:
+
+	KERNEL_S1
+
+	subs	I, I, #1
+	bne	axpy_kernel_S10
+
+
+axpy_kernel_L999:
+// Common exit: restore the FP registers saved in the prologue, then unwind the frame.
+	sub	r3, fp, #128
+
+#if	defined(DOUBLE)
+	vldm	r3, { d8 - d15 }				// restore floating point registers
+#else
+	vldm	r3, { s8 - s15 }				// restore floating point registers
+#endif
+
+	mov	r0, #0						// set return value
+
+	sub	sp, fp, #8	// sp -> the saved {r4, fp}
+	pop	{r4,fp}
+	bx	lr
+
+	EPILOGUE
+
From 36b0f7fe1dfc89e9d08a8afc508229833a1f11e9 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 25 Nov 2013 19:31:27 +0100 Subject: [PATCH 63/81] added optimized gemv_t kernel for ARMV6 --- kernel/arm/KERNEL.ARMV6 | 4 +- kernel/arm/gemv_t_vfp.S | 750 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 752 insertions(+), 2 deletions(-) create mode 100644 kernel/arm/gemv_t_vfp.S diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 79fb5f9c9..1ab2683cf 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -75,8 +75,8 @@ DGEMVNKERNEL = gemv_n_vfp.S CGEMVNKERNEL = zgemv_n.c ZGEMVNKERNEL = zgemv_n.c -SGEMVTKERNEL = gemv_t.c -DGEMVTKERNEL = 
gemv_t.c +SGEMVTKERNEL = gemv_t_vfp.S +DGEMVTKERNEL = gemv_t_vfp.S CGEMVTKERNEL = zgemv_t.c ZGEMVTKERNEL = zgemv_t.c diff --git a/kernel/arm/gemv_t_vfp.S b/kernel/arm/gemv_t_vfp.S new file mode 100644 index 000000000..6a56ae9d1 --- /dev/null +++ b/kernel/arm/gemv_t_vfp.S @@ -0,0 +1,750 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/25 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#define OLD_A r3 +#define OLD_N r1 + +#define M r0 +#define AO1 r1 +#define J r2 + +#define AO2 r4 +#define XO r5 +#define YO r6 +#define LDA r7 +#define INC_X r8 +#define INC_Y r9 + +#define I r12 + +#define N [fp, #-252 ] +#define A [fp, #-256 ] + + +#define X_PRE 512 +#define A_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + + +#if defined(DOUBLE) + +.macro INIT_F2 + + vsub.f64 d2 , d2 , d2 + vsub.f64 d3 , d3 , d3 + +.endm + +.macro KERNEL_F2X4 + + pld [ XO , #X_PRE ] + fldmiad XO! , { d12 - d15 } + pld [ AO1 , #A_PRE ] + fldmiad AO1!, { d8 - d9 } + pld [ AO2 , #A_PRE ] + fldmiad AO2!, { d4 - d5 } + fldmiad AO1!, { d10 - d11 } + fldmiad AO2!, { d6 - d7 } + + vmla.f64 d2 , d12 , d8 + vmla.f64 d3 , d12 , d4 + vmla.f64 d2 , d13 , d9 + vmla.f64 d3 , d13 , d5 + vmla.f64 d2 , d14, d10 + vmla.f64 d3 , d14, d6 + vmla.f64 d2 , d15, d11 + vmla.f64 d3 , d15, d7 + +.endm + +.macro KERNEL_F2X1 + + fldmiad XO! , { d1 } + fldmiad AO1!, { d8 } + fldmiad AO2!, { d4 } + vmla.f64 d2 , d1 , d8 + vmla.f64 d3 , d1 , d4 + +.endm + +.macro SAVE_F2 + + fldmiad YO, { d4 - d5 } + vmla.f64 d4, d0, d2 + vmla.f64 d5, d0, d3 + fstmiad YO!, { d4 - d5 } + +.endm + +.macro INIT_F1 + + vsub.f64 d2 , d2 , d2 + +.endm + +.macro KERNEL_F1X4 + + pld [ XO , #X_PRE ] + fldmiad XO! 
, { d12 - d15 } + pld [ AO1 , #A_PRE ] + fldmiad AO1!, { d8 - d9 } + fldmiad AO1!, { d10 - d11 } + vmla.f64 d2 , d12 , d8 + vmla.f64 d2 , d13 , d9 + vmla.f64 d2 , d14, d10 + vmla.f64 d2 , d15, d11 + +.endm + +.macro KERNEL_F1X1 + + fldmiad XO! , { d1 } + fldmiad AO1!, { d8 } + vmla.f64 d2 , d1 , d8 + +.endm + +.macro SAVE_F1 + + fldmiad YO, { d4 } + vmla.f64 d4, d0, d2 + fstmiad YO!, { d4 } + +.endm + + +.macro INIT_S2 + + vsub.f64 d2 , d2 , d2 + vsub.f64 d3 , d3 , d3 + +.endm + +.macro KERNEL_S2X4 + + fldmiad XO , { d12 } + add XO, XO, INC_X + + pld [ AO1 , #A_PRE ] + fldmiad AO1!, { d8 - d9 } + pld [ AO2 , #A_PRE ] + fldmiad AO2!, { d4 - d5 } + + fldmiad XO , { d13 } + add XO, XO, INC_X + fldmiad AO1!, { d10 - d11 } + fldmiad AO2!, { d6 - d7 } + + fldmiad XO , { d14 } + add XO, XO, INC_X + + fldmiad XO , { d15 } + add XO, XO, INC_X + + vmla.f64 d2 , d12 , d8 + vmla.f64 d3 , d12 , d4 + vmla.f64 d2 , d13 , d9 + vmla.f64 d3 , d13 , d5 + vmla.f64 d2 , d14, d10 + vmla.f64 d3 , d14, d6 + vmla.f64 d2 , d15, d11 + vmla.f64 d3 , d15, d7 + +.endm + +.macro KERNEL_S2X1 + + fldmiad XO , { d1 } + fldmiad AO1!, { d8 } + fldmiad AO2!, { d4 } + vmla.f64 d2 , d1 , d8 + add XO, XO, INC_X + vmla.f64 d3 , d1 , d4 + +.endm + +.macro SAVE_S2 + + fldmiad YO, { d4 } + vmla.f64 d4, d0, d2 + fstmiad YO, { d4 } + add YO, YO, INC_Y + + fldmiad YO, { d5 } + vmla.f64 d5, d0, d3 + fstmiad YO, { d5 } + add YO, YO, INC_Y + +.endm + +.macro INIT_S1 + + vsub.f64 d2 , d2 , d2 + +.endm + +.macro KERNEL_S1X4 + + fldmiad XO , { d12 } + add XO, XO, INC_X + + pld [ AO1 , #A_PRE ] + fldmiad AO1!, { d8 - d9 } + + fldmiad XO , { d13 } + add XO, XO, INC_X + fldmiad AO1!, { d10 - d11 } + + fldmiad XO , { d14 } + add XO, XO, INC_X + + fldmiad XO , { d15 } + add XO, XO, INC_X + + vmla.f64 d2 , d12 , d8 + vmla.f64 d2 , d13 , d9 + vmla.f64 d2 , d14, d10 + vmla.f64 d2 , d15, d11 + +.endm + +.macro KERNEL_S1X1 + + fldmiad XO , { d1 } + fldmiad AO1!, { d8 } + vmla.f64 d2 , d1 , d8 + add XO, XO, INC_X + +.endm + 
+.macro SAVE_S1 + + fldmiad YO, { d4 } + vmla.f64 d4, d0, d2 + fstmiad YO, { d4 } + add YO, YO, INC_Y + +.endm + + +#else /************************* SINGLE PRECISION *****************************************/ + +.macro INIT_F2 + + vsub.f32 s2 , s2 , s2 + vsub.f32 s3 , s3 , s3 + +.endm + +.macro KERNEL_F2X4 + + fldmias XO! , { s12 - s15 } + fldmias AO1!, { s8 - s9 } + fldmias AO2!, { s4 - s5 } + fldmias AO1!, { s10 - s11 } + fldmias AO2!, { s6 - s7 } + + vmla.f32 s2 , s12 , s8 + vmla.f32 s3 , s12 , s4 + vmla.f32 s2 , s13 , s9 + vmla.f32 s3 , s13 , s5 + vmla.f32 s2 , s14, s10 + vmla.f32 s3 , s14, s6 + vmla.f32 s2 , s15, s11 + vmla.f32 s3 , s15, s7 + +.endm + +.macro KERNEL_F2X1 + + fldmias XO! , { s1 } + fldmias AO1!, { s8 } + fldmias AO2!, { s4 } + vmla.f32 s2 , s1 , s8 + vmla.f32 s3 , s1 , s4 + +.endm + +.macro SAVE_F2 + + fldmias YO, { s4 - s5 } + vmla.f32 s4, s0, s2 + vmla.f32 s5, s0, s3 + fstmias YO!, { s4 - s5 } + +.endm + +.macro INIT_F1 + + vsub.f32 s2 , s2 , s2 + +.endm + +.macro KERNEL_F1X4 + + fldmias XO! , { s12 - s15 } + fldmias AO1!, { s8 - s9 } + fldmias AO1!, { s10 - s11 } + vmla.f32 s2 , s12 , s8 + vmla.f32 s2 , s13 , s9 + vmla.f32 s2 , s14, s10 + vmla.f32 s2 , s15, s11 + +.endm + +.macro KERNEL_F1X1 + + fldmias XO! 
, { s1 } + fldmias AO1!, { s8 } + vmla.f32 s2 , s1 , s8 + +.endm + +.macro SAVE_F1 + + fldmias YO, { s4 } + vmla.f32 s4, s0, s2 + fstmias YO!, { s4 } + +.endm + + +.macro INIT_S2 + + vsub.f32 s2 , s2 , s2 + vsub.f32 s3 , s3 , s3 + +.endm + +.macro KERNEL_S2X4 + + fldmias XO , { s12 } + add XO, XO, INC_X + + fldmias AO1!, { s8 - s9 } + fldmias AO2!, { s4 - s5 } + + fldmias XO , { s13 } + add XO, XO, INC_X + fldmias AO1!, { s10 - s11 } + fldmias AO2!, { s6 - s7 } + + fldmias XO , { s14 } + add XO, XO, INC_X + + fldmias XO , { s15 } + add XO, XO, INC_X + + vmla.f32 s2 , s12 , s8 + vmla.f32 s3 , s12 , s4 + vmla.f32 s2 , s13 , s9 + vmla.f32 s3 , s13 , s5 + vmla.f32 s2 , s14, s10 + vmla.f32 s3 , s14, s6 + vmla.f32 s2 , s15, s11 + vmla.f32 s3 , s15, s7 + +.endm + +.macro KERNEL_S2X1 + + fldmias XO , { s1 } + fldmias AO1!, { s8 } + fldmias AO2!, { s4 } + vmla.f32 s2 , s1 , s8 + add XO, XO, INC_X + vmla.f32 s3 , s1 , s4 + +.endm + +.macro SAVE_S2 + + fldmias YO, { s4 } + vmla.f32 s4, s0, s2 + fstmias YO, { s4 } + add YO, YO, INC_Y + + fldmias YO, { s5 } + vmla.f32 s5, s0, s3 + fstmias YO, { s5 } + add YO, YO, INC_Y + +.endm + +.macro INIT_S1 + + vsub.f32 s2 , s2 , s2 + +.endm + +.macro KERNEL_S1X4 + + fldmias XO , { s12 } + add XO, XO, INC_X + + pld [ AO1 , #A_PRE ] + fldmias AO1!, { s8 - s9 } + + fldmias XO , { s13 } + add XO, XO, INC_X + fldmias AO1!, { s10 - s11 } + + fldmias XO , { s14 } + add XO, XO, INC_X + + fldmias XO , { s15 } + add XO, XO, INC_X + + vmla.f32 s2 , s12 , s8 + vmla.f32 s2 , s13 , s9 + vmla.f32 s2 , s14, s10 + vmla.f32 s2 , s15, s11 + +.endm + +.macro KERNEL_S1X1 + + fldmias XO , { s1 } + fldmias AO1!, { s8 } + vmla.f32 s2 , s1 , s8 + add XO, XO, INC_X + +.endm + +.macro SAVE_S1 + + fldmias YO, { s4 } + vmla.f32 s4, s0, s2 + fstmias YO, { s4 } + add YO, YO, INC_Y + +.endm + + + +#endif + +/************************************************************************************** +* End of macro definitions 
+**************************************************************************************/
+// gemv_t driver: for each of the N LDA-strided vectors of A, accumulate the dot product of its M elements with X, then add d0/s0 times that sum into the next element of Y. Returns 0 in r0.
+	PROLOGUE
+// Entry registers per the #defines above: r0 = M, r1 = N, r3 = A. d0/s0 is only read (by the SAVE_* macros) - presumably alpha, TODO confirm against the caller.
+	.align	5
+	push	{r4 - r9 , fp}
+	add	fp, sp, #28	// fp = sp at entry (7 words pushed), so [fp, #0] is the first stack argument
+	sub	sp, sp, #STACKSIZE				// reserve stack
+
+	sub	r12, fp, #192	// r12 = start of the FP register save area
+
+#if	defined(DOUBLE)
+	vstm	r12, { d8 - d15 }				// store floating point registers
+#else
+	vstm	r12, { s8 - s15 }				// store floating point registers
+#endif
+// Early exits for empty dimensions or zero increments; L999 restores the registers saved above.
+	cmp	M, #0
+	ble	gemvt_kernel_L999
+
+	cmp	OLD_N, #0
+	ble	gemvt_kernel_L999
+
+	str	OLD_A, A	// spill A and N to the frame: r1 is reused as AO1 and r3 as scratch
+	str	OLD_N, N
+
+	ldr	INC_X , OLD_INC_X
+	ldr	INC_Y , OLD_INC_Y
+
+	cmp	INC_X, #0
+	beq	gemvt_kernel_L999
+
+	cmp	INC_Y, #0
+	beq	gemvt_kernel_L999
+
+	ldr	LDA, OLD_LDA
+
+
+#if defined(DOUBLE)
+	lsl	LDA, LDA, #3				// LDA * SIZE
+#else
+	lsl	LDA, LDA, #2				// LDA * SIZE
+#endif
+// Any increment other than 1 selects the strided path.
+	cmp	INC_X, #1
+	bne	gemvt_kernel_S2_BEGIN
+
+	cmp	INC_Y, #1
+	bne	gemvt_kernel_S2_BEGIN
+
+
+gemvt_kernel_F2_BEGIN:
+// Unit-stride path: two vectors of A per outer iteration.
+	ldr	YO , Y
+
+	ldr	J, N
+	asrs	J, J, #1					// J = N / 2
+	ble	gemvt_kernel_F1_BEGIN
+
+gemvt_kernel_F2X4:
+// AO1 / AO2 point at the two current vectors; the stored A is advanced past both for the next iteration.
+	ldr	AO1, A
+	add	AO2, AO1, LDA
+	add	r3 , AO2, LDA
+	str	r3 , A
+
+	ldr	XO , X	// restart X for this pair
+
+	INIT_F2
+
+	asrs	I, M, #2					// I = M / 4
+	ble	gemvt_kernel_F2X1
+
+
+gemvt_kernel_F2X4_10:
+
+	KERNEL_F2X4
+
+	subs	I, I, #1
+	bne	gemvt_kernel_F2X4_10
+
+
+gemvt_kernel_F2X1:
+
+	ands	I, M , #3	// I = M % 4
+	ble	gemvt_kernel_F2_END	// NOTE(review): ands does not write V, so this acts as beq only while V is still clear from the cmp/subs above
+
+gemvt_kernel_F2X1_10:
+
+	KERNEL_F2X1
+
+	subs	I, I, #1
+	bne	gemvt_kernel_F2X1_10
+
+
+gemvt_kernel_F2_END:
+
+	SAVE_F2
+
+	subs	J , J , #1
+	bne	gemvt_kernel_F2X4
+
+
+gemvt_kernel_F1_BEGIN:
+// Odd N: one remaining vector of A.
+	ldr	J, N
+	ands	J, J, #1
+	ble	gemvt_kernel_L999
+
+gemvt_kernel_F1X4:
+
+	ldr	AO1, A
+
+	ldr	XO , X
+
+	INIT_F1
+
+	asrs	I, M, #2					// I = M / 4
+	ble	gemvt_kernel_F1X1
+
+
+gemvt_kernel_F1X4_10:
+
+	KERNEL_F1X4
+
+	subs	I, I, #1
+	bne	gemvt_kernel_F1X4_10
+
+
+gemvt_kernel_F1X1:
+
+	ands	I, M , #3	// I = M % 4
+	ble	gemvt_kernel_F1_END
+
+gemvt_kernel_F1X1_10:
+
+	KERNEL_F1X1
+
+	subs	I, I, #1
+	bne	gemvt_kernel_F1X1_10
+
+
+gemvt_kernel_F1_END:
+
+	SAVE_F1
+
+	b	gemvt_kernel_L999
+
+
+
+/*************************************************************************************************************/
+// Strided path: same loop structure as above; the *_S* macros step X and Y by INC_X / INC_Y, which are scaled to bytes here.
+gemvt_kernel_S2_BEGIN:
+
+#if defined(DOUBLE)
+	lsl	INC_X, INC_X, #3				// INC_X * SIZE
+	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE
+#else
+	lsl	INC_X, INC_X, #2				// INC_X * SIZE
+	lsl	INC_Y, INC_Y, #2				// INC_Y * SIZE
+#endif
+
+	ldr	YO , Y
+
+	ldr	J, N
+	asrs	J, J, #1					// J = N / 2
+	ble	gemvt_kernel_S1_BEGIN
+
+gemvt_kernel_S2X4:
+// Two vectors of A per outer iteration, as in the unit-stride path.
+	ldr	AO1, A
+	add	AO2, AO1, LDA
+	add	r3 , AO2, LDA
+	str	r3 , A
+
+	ldr	XO , X
+
+	INIT_S2
+
+	asrs	I, M, #2					// I = M / 4
+	ble	gemvt_kernel_S2X1
+
+
+gemvt_kernel_S2X4_10:
+
+	KERNEL_S2X4
+
+	subs	I, I, #1
+	bne	gemvt_kernel_S2X4_10
+
+
+gemvt_kernel_S2X1:
+
+	ands	I, M , #3	// I = M % 4
+	ble	gemvt_kernel_S2_END
+
+gemvt_kernel_S2X1_10:
+
+	KERNEL_S2X1
+
+	subs	I, I, #1
+	bne	gemvt_kernel_S2X1_10
+
+
+gemvt_kernel_S2_END:
+
+	SAVE_S2
+
+	subs	J , J , #1
+	bne	gemvt_kernel_S2X4
+
+
+gemvt_kernel_S1_BEGIN:
+// Odd N: one remaining vector of A.
+	ldr	J, N
+	ands	J, J, #1
+	ble	gemvt_kernel_L999
+
+gemvt_kernel_S1X4:
+
+	ldr	AO1, A
+
+	ldr	XO , X
+
+	INIT_S1
+
+	asrs	I, M, #2					// I = M / 4
+	ble	gemvt_kernel_S1X1
+
+
+gemvt_kernel_S1X4_10:
+
+	KERNEL_S1X4
+
+	subs	I, I, #1
+	bne	gemvt_kernel_S1X4_10
+
+
+gemvt_kernel_S1X1:
+
+	ands	I, M , #3	// I = M % 4
+	ble	gemvt_kernel_S1_END
+
+gemvt_kernel_S1X1_10:
+
+	KERNEL_S1X1
+
+	subs	I, I, #1
+	bne	gemvt_kernel_S1X1_10
+
+
+gemvt_kernel_S1_END:
+
+	SAVE_S1
+
+
+
+/*************************************************************************************************************/
+// Common exit: restore the FP registers saved in the prologue, then unwind the frame.
+gemvt_kernel_L999:
+
+	sub	r3, fp, #192
+
+#if	defined(DOUBLE)
+	vldm	r3, { d8 - d15 }				// restore floating point registers
+#else
+	vldm	r3, { s8 - s15 }				// restore floating point registers
+#endif
+
+	mov	r0, #0						// set return value
+
+	sub	sp, fp, #28	// sp -> the saved {r4-r9, fp}
+	pop	{r4 -r9 ,fp}
+	bx	lr
+
+	EPILOGUE
+
From 697e198e8a530be338b7114d2e24d2d1bb7e3392 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 27 Nov 2013 16:15:06 +0100 Subject: [PATCH 64/81] added zgemm_kernel for ARMV6 --- 
kernel/arm/KERNEL.ARMV6 | 2 +- kernel/arm/zgemm_kernel_2x2_vfp.S | 1311 +++++++++++++++++++++++++++++ 2 files changed, 1312 insertions(+), 1 deletion(-) create mode 100644 kernel/arm/zgemm_kernel_2x2_vfp.S diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 1ab2683cf..58e4d2702 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -111,7 +111,7 @@ CGEMMOTCOPY = cgemm_tcopy_2_vfp.S CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o -ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMKERNEL = zgemm_kernel_2x2_vfp.S ZGEMMONCOPY = zgemm_ncopy_2_vfp.S ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S ZGEMMONCOPYOBJ = zgemm_oncopy.o diff --git a/kernel/arm/zgemm_kernel_2x2_vfp.S b/kernel/arm/zgemm_kernel_2x2_vfp.S new file mode 100644 index 000000000..7f7664981 --- /dev/null +++ b/kernel/arm/zgemm_kernel_2x2_vfp.S @@ -0,0 +1,1311 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/05 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* 2013/11/02 Saar +* UNROLL_N 2 +* UNROLL_M 2 +* ZGEMM_P 64 +* ZGEMM_Q 120 +* ZGEMM_R 4096 +* A_PRE 96 +* B_PRE 96 +* C_PRE 64 +* +* Performance on Odroid U2: +* +* 1 Core: 1.62 GFLOPS ATLAS: 1.39 GFLOPS +* 2 Cores: 3.20 GFLOPS ATLAS: 2.54 GFLOPS +* 3 Cores: 4.72 GFLOPS ATLAS: 3.76 GFLOPS +* 4 Cores: 5.93 GFLOPS ATLAS: 4.88 GFLOPS +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R d0 +#define OLD_ALPHA_I d1 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define A [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define ALPHA_R [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 
96 +#define B_PRE 96 +#define C_PRE 64 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define KMAC_R fnmacd + #define KMAC_I fmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fmacd + +#elif defined(CN) || defined(CT) + + #define KMAC_R fmacd + #define KMAC_I fnmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fmacd + +#elif defined(NC) || defined(TC) + + #define KMAC_R fmacd + #define KMAC_I fnmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fmacd + +#else + + #define KMAC_R fnmacd + #define KMAC_I fmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fmacd + +#endif + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT2x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + vmov.f64 d14, d8 + vmov.f64 d15, d8 + +.endm + +.macro KERNEL2x2_I + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmuld d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmuld d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmuld d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmuld d11 , d2, d5 + KMAC_I d11 , d3, d4 + + fmuld d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmuld d13 , d0, d7 + KMAC_I d13 , d1, d6 + + fmuld d14 , d2, d6 + KMAC_R d14 , d3, d7 + fmuld d15 , d2, d7 + KMAC_I d15 , d3, d6 + + add BO , BO, #32 + add AO , AO, #32 + +.endm + + + +.macro KERNEL2x2_M1 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + 
fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + fmacd d14 , d2, d6 + KMAC_R d14 , d3, d7 + fmacd d15 , d2, d7 + KMAC_I d15 , d3, d6 + + add BO , BO, #32 + add AO , AO, #32 + +.endm + +.macro KERNEL2x2_M2 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + fmacd d14 , d2, d6 + KMAC_R d14 , d3, d7 + fmacd d15 , d2, d7 + KMAC_I d15 , d3, d6 + + add BO , BO, #32 + add AO , AO, #32 + + +.endm + + +.macro KERNEL2x2_E + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + fmacd d14 , d2, d6 + KMAC_R d14 , d3, d7 + fmacd d15 , d2, d7 + KMAC_I d15 , d3, d6 + + add BO , BO, #32 + add AO , AO, #32 + +.endm + +.macro KERNEL2x2_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + fmacd d12 , 
d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + fmacd d14 , d2, d6 + KMAC_R d14 , d3, d7 + fmacd d15 , d2, d7 + KMAC_I d15 , d3, d6 + + add BO , BO, #32 + add AO , AO, #32 + + +.endm + + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - d7 } + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + FMAC_R1 d6 , d0 , d10 + FMAC_I1 d7 , d0 , d11 + FMAC_R2 d6 , d1 , d11 + FMAC_I2 d7 , d1 , d10 + + fstmiad CO1, { d4 - d7 } + + fldmiad CO2, { d4 - d7 } + + FMAC_R1 d4 , d0 , d12 + FMAC_I1 d5 , d0 , d13 + FMAC_R2 d4 , d1 , d13 + FMAC_I2 d5 , d1 , d12 + + FMAC_R1 d6 , d0 , d14 + FMAC_I1 d7 , d0 , d15 + FMAC_R2 d6 , d1 , d15 + FMAC_I2 d7 , d1 , d14 + + fstmiad CO2, { d4 - d7 } + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + +.endm + +.macro KERNEL1x2_I + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmuld d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmuld d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmuld d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmuld d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + +.endm + + + +.macro KERNEL1x2_M1 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + +.endm + +.macro KERNEL1x2_M2 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , 
d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + + +.endm + + +.macro KERNEL1x2_E + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + +.endm + +.macro KERNEL1x2_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + + +.endm + + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - d5 } + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + fstmiad CO1, { d4 - d5 } + + fldmiad CO2, { d4 - d5 } + + FMAC_R1 d4 , d0 , d12 + FMAC_I1 d5 , d0 , d13 + FMAC_R2 d4 , d1 , d13 + FMAC_I2 d5 , d1 , d12 + + fstmiad CO2, { d4 - d5 } + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + +.endm + +.macro KERNEL2x1_I + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmuld d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmuld d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmuld d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmuld d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + +.endm + + + +.macro KERNEL2x1_M1 + + fldd 
d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + +.endm + +.macro KERNEL2x1_M2 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + + +.endm + + +.macro KERNEL2x1_E + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + +.endm + +.macro KERNEL2x1_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + + +.endm + + +.macro SAVE2x1 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - d7 } + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + FMAC_R1 d6 , d0 , d10 + FMAC_I1 d7 , d0 , d11 + FMAC_R2 d6 , d1 , d11 + FMAC_I2 d7 , d1 , d10 + + fstmiad CO1, { d4 - d7 } + + add CO1, CO1, #32 + +.endm + + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + +.endm + 
+.macro KERNEL1x1_I
+// First step of a 1x1 tile: fmuld overwrites d8/d9, so no INIT1x1 is needed before it. d8 = d0*d4 (KMAC_R) d1*d5, d9 = d0*d5 (KMAC_I) d1*d4; KMAC_* add or subtract per the conjugation variant chosen at the top of the file.
+	fldd	d0 , [ AO ]
+	fldd	d1 , [ AO, #8 ]
+
+	fldd	d4 , [ BO ]
+	fldd	d5 , [ BO, #8 ]
+
+	fmuld	d8 , d0, d4
+	KMAC_R	d8 , d1, d5
+	fmuld	d9 , d0, d5
+	KMAC_I	d9 , d1, d4
+
+	add	BO , BO, #16
+	add	AO , AO, #16
+
+.endm
+
+
+
+.macro KERNEL1x1_M1
+// Accumulating step (fmacd into d8/d9). KERNEL1x1_M1, _M2, _E and _SUB have identical bodies in this file.
+	fldd	d0 , [ AO ]
+	fldd	d1 , [ AO, #8 ]
+
+	fldd	d4 , [ BO ]
+	fldd	d5 , [ BO, #8 ]
+
+	fmacd	d8 , d0, d4
+	KMAC_R	d8 , d1, d5
+	fmacd	d9 , d0, d5
+	KMAC_I	d9 , d1, d4
+
+	add	BO , BO, #16
+	add	AO , AO, #16
+
+.endm
+
+.macro KERNEL1x1_M2
+// Accumulating step, same body as KERNEL1x1_M1.
+	fldd	d0 , [ AO ]
+	fldd	d1 , [ AO, #8 ]
+
+	fldd	d4 , [ BO ]
+	fldd	d5 , [ BO, #8 ]
+
+	fmacd	d8 , d0, d4
+	KMAC_R	d8 , d1, d5
+	fmacd	d9 , d0, d5
+	KMAC_I	d9 , d1, d4
+
+	add	BO , BO, #16
+	add	AO , AO, #16
+
+
+.endm
+
+
+.macro KERNEL1x1_E
+// Accumulating step, same body as KERNEL1x1_M1.
+	fldd	d0 , [ AO ]
+	fldd	d1 , [ AO, #8 ]
+
+	fldd	d4 , [ BO ]
+	fldd	d5 , [ BO, #8 ]
+
+	fmacd	d8 , d0, d4
+	KMAC_R	d8 , d1, d5
+	fmacd	d9 , d0, d5
+	KMAC_I	d9 , d1, d4
+
+	add	BO , BO, #16
+	add	AO , AO, #16
+
+.endm
+
+.macro KERNEL1x1_SUB
+// Accumulating step used by the 1x1 loops of the driver below; d8/d9 must have been cleared by INIT1x1 first.
+	fldd	d0 , [ AO ]
+	fldd	d1 , [ AO, #8 ]
+
+	fldd	d4 , [ BO ]
+	fldd	d5 , [ BO, #8 ]
+
+	fmacd	d8 , d0, d4
+	KMAC_R	d8 , d1, d5
+	fmacd	d9 , d0, d5
+	KMAC_I	d9 , d1, d4
+
+	add	BO , BO, #16
+	add	AO , AO, #16
+
+
+.endm
+
+
+.macro SAVE1x1
+// Updates the complex double at CO1 in place with the FMAC_* combination of alpha (d0, d1) and the accumulators (d8, d9), then advances CO1 by 16 bytes.
+	fldd	d0, ALPHA_R
+	fldd	d1, ALPHA_I
+
+	fldmiad	CO1, { d4 - d5 }
+
+	FMAC_R1	d4 , d0 , d8
+	FMAC_I1	d5 , d0 , d9
+	FMAC_R2	d4 , d1 , d9
+	FMAC_I2	d5 , d1 , d8
+
+	fstmiad	CO1, { d4 - d5 }
+
+	add	CO1, CO1, #16
+
+.endm
+
+
+/******************************************************************************/
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+// zgemm micro-kernel driver: walks C in 2x2 tiles (with 1x2, 2x1 and 1x1 edge tiles), accumulates over K with the KERNEL* macros and updates C through the SAVE* macros. Returns 0 in r0.
+	PROLOGUE
+// Entry, per the #defines above: r0 = M, r1 = N, r2 = K, r3 = A, d0/d1 = alpha; B, C and LDC are read from the caller stack.
+	.align	5
+// The register arguments are spilled to the frame because r0-r3 and d0/d1 are reused as I, J, L, scratch and data registers.
+	push	{r4 - r9, fp}
+	add	fp, sp, #24	// fp = address of the saved fp, 4 bytes below the entry sp
+	sub	sp, sp, #STACKSIZE				// reserve stack
+
+	str	OLD_M, M
+	str	OLD_N, N
+	str	OLD_K, K
+	str	OLD_A, A
+	vstr	OLD_ALPHA_R, ALPHA_R
+	vstr	OLD_ALPHA_I, ALPHA_I
+
+	sub	r3, fp, #128
+	vstm	r3, { d8 - d15} 				// store floating point registers
+
+	ldr	r3, OLD_LDC
+	lsl	r3, r3, #4					// ldc = ldc * 8 * 2
+	str	r3, LDC
+
+	ldr	K1, K
+	ldr	BC, B
+	ldr	J, N
+	asr	J, J, #1					// J = N / 2
+	cmp	J, #0	// Flags are undefined at entry and asrs/ands/tst never write V; this cmp clears V so that every ble below really tests the result for <= 0.
+	ble	zgemm_kernel_L1_BEGIN
+
+zgemm_kernel_L2_BEGIN:
+// One pair of C columns per iteration: CO1 -> current pair, the stored C moves on by 2 * LDC.
+	ldr	CO1, C						// CO1 = C
+	ldr	r4 , LDC
+	lsl	r4 , r4 , #1					// LDC * 2
+	add	r3 , r4, CO1
+	str	r3 , C						// store C
+
+	ldr	AO, A						// AO = A
+        pld     [AO , #A_PRE-64]
+        pld     [AO , #A_PRE-32]
+
+
+
+zgemm_kernel_L2_M2_BEGIN:
+
+	ldr	I, M
+	asrs	I, I, #1					// I = I / 2
+	ble	zgemm_kernel_L2_M1_BEGIN
+
+zgemm_kernel_L2_M2_20:
+// K loop in blocks of 8 steps (L = K / 8). L >= 3: peeled first block, steady loop, peeled last block.
+// L == 2 or 1: fully unrolled at _30 / _32. L == 0: only clear the accumulators at _40. The K % 8 tail runs at _44.
+	mov	BO, BC
+	asrs	L , K1, #3					// L = K / 8
+	cmp	L , #3
+	blt	zgemm_kernel_L2_M2_30
+	.align 5
+
+
+// First block: KERNEL2x2_I initialises the accumulators.
+	KERNEL2x2_I
+	KERNEL2x2_M2
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+
+	sub	L, L, #2	// the first and last block are outside the loop
+
+zgemm_kernel_L2_M2_22:
+
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+
+	subs	L, L, #1
+	bgt	zgemm_kernel_L2_M2_22
+// Last block, closed by KERNEL2x2_E.
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+	KERNEL2x2_M1
+	KERNEL2x2_E
+
+	b	 zgemm_kernel_L2_M2_44
+
+
+zgemm_kernel_L2_M2_30:
+	tst	L, #3	// L is 0, 1 or 2 here
+	ble	zgemm_kernel_L2_M2_40	// L == 0: no full block
+
+	tst	L, #2
+	ble	zgemm_kernel_L2_M2_32	// L == 1
+// L == 2: two blocks, fully unrolled.
+	KERNEL2x2_I
+	KERNEL2x2_M2
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+
+
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+	KERNEL2x2_M1
+	KERNEL2x2_E
+
+	b	 zgemm_kernel_L2_M2_44
+
+zgemm_kernel_L2_M2_32:
+
+	tst	L, #1
+	ble	zgemm_kernel_L2_M2_40
+// L == 1: one block.
+	KERNEL2x2_I
+	KERNEL2x2_M2
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+	KERNEL2x2_M1
+	KERNEL2x2_E
+
+	b	 zgemm_kernel_L2_M2_44
+
+
+zgemm_kernel_L2_M2_40:
+// No full block was run: clear the accumulators for the tail loop.
+	INIT2x2
+
+
+zgemm_kernel_L2_M2_44:
+
+	ands	L , K1, #7					// L = K % 8
+	ble	zgemm_kernel_L2_M2_100
+
+zgemm_kernel_L2_M2_46:
+
+	KERNEL2x2_SUB
+
+	subs	L, L, #1
+	bne	zgemm_kernel_L2_M2_46
+
+zgemm_kernel_L2_M2_100:
+
+	SAVE2x2
+
+zgemm_kernel_L2_M2_END:
+
+	subs	I, I, #1
+	bne	zgemm_kernel_L2_M2_20	// next tile along M: AO keeps advancing, BO restarts at BC
+
+
+zgemm_kernel_L2_M1_BEGIN:
+// Odd M: one 1x2 tile for this column pair.
+	ldr	I, M
+	tst	I, #1					// is M odd? (I itself is not modified)
+	ble	zgemm_kernel_L2_END
+
+zgemm_kernel_L2_M1_20:
+
+	INIT1x2
+
+	mov	BO, BC
+	asrs	L , K1, #3					// L = K / 8
+	ble	zgemm_kernel_L2_M1_40
+
+zgemm_kernel_L2_M1_22:
+
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+
+	subs	L, L, #1
+	bgt	zgemm_kernel_L2_M1_22
+
+
+zgemm_kernel_L2_M1_40:
+
+	ands	L , K1, #7					// L = K % 8
+	ble	zgemm_kernel_L2_M1_100
+
+zgemm_kernel_L2_M1_42:
+
+	KERNEL1x2_SUB
+
+	subs	L, L, #1
+	bgt	zgemm_kernel_L2_M1_42
+
+zgemm_kernel_L2_M1_100:
+
+	SAVE1x2
+
+
+zgemm_kernel_L2_END:
+// Advance BC past the B data consumed by this column pair.
+	mov	r3, BC
+	mov	r4, K1
+	lsl	r4, r4, #5					// k * 2 * 8 * 2
+	add	r3, r3, r4					// B = B + K * 4 * 8
+	mov	BC, r3
+
+	subs	J , #1						// j--
+	bgt	zgemm_kernel_L2_BEGIN
+
+
+
+/*********************************************************************************************/
+// Odd N: one last single column of C, same structure with the 2x1 and 1x1 macros.
+zgemm_kernel_L1_BEGIN:
+
+	ldr	J , N
+	tst	J , #1
+	ble	zgemm_kernel_L999
+
+
+	ldr	CO1, C						// CO1 = C
+	ldr	r4 , LDC
+	add	r3 , r4, CO1
+	str	r3 , C						// store C
+
+	ldr	AO, A						// AO = A
+
+zgemm_kernel_L1_M2_BEGIN:
+
+	ldr	I, M
+	asrs	I, I, #1					// I = I / 2
+	ble	zgemm_kernel_L1_M1_BEGIN
+
+zgemm_kernel_L1_M2_20:
+// K loop in blocks of 8, with the same L >= 3 / 2 / 1 / 0 cases as the 2x2 tile above.
+
+	mov	BO, BC
+	asrs	L , K1, #3					// L = K / 8
+	cmp	L , #3
+	blt	zgemm_kernel_L1_M2_30
+	.align 5
+
+
+
+	KERNEL2x1_I
+	KERNEL2x1_M2
+	KERNEL2x1_M1
+	KERNEL2x1_M2
+
+	KERNEL2x1_M1
+	KERNEL2x1_M2
+	KERNEL2x1_M1
+	KERNEL2x1_M2
+
+	sub	L, L, #2
+
+zgemm_kernel_L1_M2_22:
+
+	KERNEL2x1_M1
+	KERNEL2x1_M2
+	KERNEL2x1_M1
+	KERNEL2x1_M2
+
+	KERNEL2x1_M1
+	KERNEL2x1_M2
+	KERNEL2x1_M1
+	KERNEL2x1_M2
+
+	subs	L, L, #1
+	bgt	zgemm_kernel_L1_M2_22
+
+	KERNEL2x1_M1
+	KERNEL2x1_M2
+	KERNEL2x1_M1
+	KERNEL2x1_M2
+
+	KERNEL2x1_M1
+	KERNEL2x1_M2
+	KERNEL2x1_M1
+	KERNEL2x1_E
+
+	b	 zgemm_kernel_L1_M2_44
+
+
+zgemm_kernel_L1_M2_30:
+	tst	L, #3
+	ble	zgemm_kernel_L1_M2_40
+
+	tst	L, #2
+	ble	zgemm_kernel_L1_M2_32
+
+	KERNEL2x1_I
+	KERNEL2x1_M2
+	KERNEL2x1_M1
+	KERNEL2x1_M2
+
+	KERNEL2x1_M1
+	KERNEL2x1_M2
+	KERNEL2x1_M1
+	KERNEL2x1_M2
+
+	KERNEL2x1_M1
+	KERNEL2x1_M2
+	KERNEL2x1_M1
+	KERNEL2x1_M2
+
+
+	KERNEL2x1_M1
+	KERNEL2x1_M2
+	KERNEL2x1_M1
+	KERNEL2x1_E
+
+	b	 zgemm_kernel_L1_M2_44
+
+zgemm_kernel_L1_M2_32:
+
+	tst	L, #1
+	ble	zgemm_kernel_L1_M2_40
+
+	KERNEL2x1_I
+	KERNEL2x1_M2
+	KERNEL2x1_M1
+	KERNEL2x1_M2
+
+	KERNEL2x1_M1
+	KERNEL2x1_M2
+	KERNEL2x1_M1
+	KERNEL2x1_E
+
+	b	 zgemm_kernel_L1_M2_44
+
+
+zgemm_kernel_L1_M2_40:
+
+	INIT2x1
+
+
+zgemm_kernel_L1_M2_44:
+
+	ands	L , K1, #7					// L = K % 8
+	ble	zgemm_kernel_L1_M2_100
+
+zgemm_kernel_L1_M2_46:
+
+	KERNEL2x1_SUB
+
+	subs	L, L, #1
+	bne	zgemm_kernel_L1_M2_46
+
+zgemm_kernel_L1_M2_100:
+
+	SAVE2x1
+
+zgemm_kernel_L1_M2_END:
+
+	subs	I, I, #1
+	bne	zgemm_kernel_L1_M2_20
+
+
+zgemm_kernel_L1_M1_BEGIN:
+// Odd M and odd N: the final 1x1 tile.
+	ldr	I, M
+	tst	I, #1					// is M odd? (I itself is not modified)
+	ble	zgemm_kernel_L1_END
+
+zgemm_kernel_L1_M1_20:
+
+	INIT1x1
+
+	mov	BO, BC
+	asrs	L , K1, #3					// L = K / 8
+	ble	zgemm_kernel_L1_M1_40
+
+zgemm_kernel_L1_M1_22:
+
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+
+	subs	L, L, #1
+	bgt	zgemm_kernel_L1_M1_22
+
+
+zgemm_kernel_L1_M1_40:
+
+	ands	L , K1, #7					// L = K % 8
+	ble	zgemm_kernel_L1_M1_100
+
+zgemm_kernel_L1_M1_42:
+
+	KERNEL1x1_SUB
+
+	subs	L, L, #1
+	bgt	zgemm_kernel_L1_M1_42
+
+zgemm_kernel_L1_M1_100:
+
+	SAVE1x1
+
+
+zgemm_kernel_L1_END:
+
+
+
+zgemm_kernel_L999:
+// Common exit: restore d8-d15 saved in the prologue, then unwind the frame.
+	sub	r3, fp, #128
+	vldm	r3, { d8 - d15}					// restore floating point registers
+
+	movs	r0, #0						// set return value
+	sub	sp, fp, #24	// sp -> the saved {r4-r9, fp}
+	pop	{r4 - r9, fp}
+	bx	lr
+
+	EPILOGUE
+
From a9bd12da2c4e38449a10ea00918849e1a9c76b4f Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 27 Nov 2013 17:37:38 +0100 Subject: [PATCH 65/81] optimized dgemm kernel for ARMV6 --- kernel/arm/dgemm_kernel_4x2_vfp.S | 45 
++++++++++++++++++------------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/kernel/arm/dgemm_kernel_4x2_vfp.S b/kernel/arm/dgemm_kernel_4x2_vfp.S index 56fd81513..55409a5ef 100644 --- a/kernel/arm/dgemm_kernel_4x2_vfp.S +++ b/kernel/arm/dgemm_kernel_4x2_vfp.S @@ -26,10 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/11/23 Saar -* BLASTEST : xOK -* CTEST : xOK -* TEST : xOK +* 2013/11/27 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK * **************************************************************************************/ @@ -77,7 +77,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define A_PRE 96 #define B_PRE 96 -#define C_PRE 64 +#define C_PRE 32 /************************************************************************************** * Macro definitions @@ -100,26 +100,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x2_SUB + pld [ AO, #A_PRE ] fldd d4 , [ BO ] - fldd d5 , [ BO, #8 ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] - fldd d2 , [ AO, #16 ] - fldd d3 , [ AO, #24 ] fmacd d8 , d0, d4 + fldd d2 , [ AO, #16 ] fmacd d9 , d1, d4 + fldd d3 , [ AO, #24 ] fmacd d10 , d2, d4 + fldd d5 , [ BO, #8 ] fmacd d11 , d3, d4 fmacd d12 , d0, d5 fmacd d13 , d1, d5 + add AO , AO, #32 fmacd d14 , d2, d5 + add BO , BO, #16 fmacd d15 , d3, d5 - add AO , AO, #32 - add BO , BO, #16 .endm @@ -130,37 +131,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fldd d0, ALPHA + fldd d4 , [CO1] fldd d5 , [CO1, #8 ] - fldd d6 , [CO1, #16 ] - fldd d7 , [CO1, #24 ] + pld [ CO1, #C_PRE ] fmacd d4 , d0 , d8 + fldd d6 , [CO1, #16 ] fmacd d5 , d0 , d9 + fldd d7 , [CO1, #24 ] fmacd d6 , d0 , d10 + fstd d4 , [CO1] fmacd d7 , d0 , d11 - fstd d4 , [CO1] fstd d5 , [CO1, #8 ] fstd d6 , [CO1, #16 ] fstd d7 , [CO1, #24 ] fldd d4 , [CO2] fldd d5 , [CO2, #8 ] - fldd d6 , [CO2, #16 ] - fldd d7 , [CO2, #24 ] + pld [ CO2, #C_PRE ] fmacd d4 , d0 , d12 + fldd d6 , [CO2, #16 ] fmacd d5 , d0 , d13 + fldd d7 , [CO2, #24 ] fmacd d6 , d0 , d14 - fmacd d7 , d0 , d15 - fstd d4 , [CO2] + fmacd d7 , d0 , d15 + add CO1, CO1, #32 + fstd d5 , [CO2, #8 ] fstd d6 , [CO2, #16 ] fstd d7 , [CO2, #24 ] - add CO1, CO1, #32 .endm @@ -469,13 +473,18 @@ dgemm_kernel_L2_M4_20: .align 5 dgemm_kernel_L2_M4_22: + + pld [ BO, #B_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB + pld [ BO, #B_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB + pld [ BO, #B_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB + pld [ BO, #B_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB From 3d5e792c72cd6d9894c4583c527e058481816657 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 27 Nov 2013 18:38:32 +0100 Subject: [PATCH 66/81] optimized sgemm kernel for ARMV6 --- kernel/arm/sgemm_kernel_4x2_vfp.S | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/kernel/arm/sgemm_kernel_4x2_vfp.S b/kernel/arm/sgemm_kernel_4x2_vfp.S index 3e20f86f0..e074e744c 100644 --- a/kernel/arm/sgemm_kernel_4x2_vfp.S +++ b/kernel/arm/sgemm_kernel_4x2_vfp.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/11/23 Saar +* 2013/11/27 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -101,16 +101,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL4x2_SUB flds s4 , [ BO ] - flds s5 , [ BO, #4 ] flds s0 , [ AO ] flds s1 , [ AO, #4 ] - flds s2 , [ AO, #8 ] - flds s3 , [ AO, #12 ] fmacs s8 , s0, s4 + flds s2 , [ AO, #8 ] fmacs s9 , s1, s4 + flds s3 , [ AO, #12 ] fmacs s10 , s2, s4 + flds s5 , [ BO, #4 ] fmacs s11 , s3, s4 fmacs s12 , s0, s5 @@ -469,13 +469,20 @@ sgemm_kernel_L2_M4_20: .align 5 sgemm_kernel_L2_M4_22: + + pld [ AO, #A_PRE ] + pld [ BO, #B_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB + pld [ AO, #A_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB + pld [ AO, #A_PRE ] + pld [ BO, #B_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB + pld [ AO, #A_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB From b42145834f2f9d6eb3dab4a206d48c9a1db2bbe4 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 28 Nov 2013 08:08:08 +0100 Subject: [PATCH 67/81] optimized sgemm kernel for ARMV6 --- kernel/arm/sgemm_kernel_4x2_vfp.S | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/kernel/arm/sgemm_kernel_4x2_vfp.S b/kernel/arm/sgemm_kernel_4x2_vfp.S index e074e744c..0e2061d77 100644 --- a/kernel/arm/sgemm_kernel_4x2_vfp.S +++ b/kernel/arm/sgemm_kernel_4x2_vfp.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/11/27 Saar +* 2013/11/28 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -100,17 +100,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x2_SUB - flds s4 , [ BO ] - - flds s0 , [ AO ] - flds s1 , [ AO, #4 ] + fldmias AO! , { s0 - s3 } + fldmias BO! , { s4 - s5 } fmacs s8 , s0, s4 - flds s2 , [ AO, #8 ] fmacs s9 , s1, s4 - flds s3 , [ AO, #12 ] fmacs s10 , s2, s4 - flds s5 , [ BO, #4 ] fmacs s11 , s3, s4 fmacs s12 , s0, s5 @@ -118,9 +113,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fmacs s14 , s2, s5 fmacs s15 , s3, s5 - add AO , AO, #16 - add BO , BO, #8 - .endm .macro SAVE4x2 From a537d7d8d7a5936a66436ec1f184a43fb39507b5 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 28 Nov 2013 08:33:44 +0100 Subject: [PATCH 68/81] optimized zgemm_kernel_2x2_vfp.S --- kernel/arm/zgemm_kernel_2x2_vfp.S | 54 +++++++++++++------------------ 1 file changed, 22 insertions(+), 32 deletions(-) diff --git a/kernel/arm/zgemm_kernel_2x2_vfp.S b/kernel/arm/zgemm_kernel_2x2_vfp.S index 7f7664981..ad6b56ac0 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfp.S +++ b/kernel/arm/zgemm_kernel_2x2_vfp.S @@ -26,28 +26,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/11/05 Saar +* 2013/11/28 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * -* 2013/11/02 Saar -* UNROLL_N 2 -* UNROLL_M 2 -* ZGEMM_P 64 -* ZGEMM_Q 120 -* ZGEMM_R 4096 -* A_PRE 96 -* B_PRE 96 -* C_PRE 64 -* -* Performance on Odroid U2: -* -* 1 Core: 1.62 GFLOPS ATLAS: 1.39 GFLOPS -* 2 Cores: 3.20 GFLOPS ATLAS: 2.54 GFLOPS -* 3 Cores: 4.72 GFLOPS ATLAS: 3.76 GFLOPS -* 4 Cores: 5.93 GFLOPS ATLAS: 4.88 GFLOPS -**************************************************************************************/ +***************************************************************************************/ #define ASSEMBLER #include "common.h" @@ -159,6 +143,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL2x2_I + pld [ AO, #A_PRE ] + pld [ BO, #B_PRE ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] @@ -201,22 +187,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL2x2_M1 + pld [ AO, #A_PRE ] + pld [ BO, #B_PRE ] + fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] - fldd d2 , [ AO, #16 ] - fldd d3 , [ AO, #24 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] - fldd d6 , [ BO, #16 ] - fldd d7 , [ BO, #24 ] fmacd d8 , d0, d4 KMAC_R d8 , d1, d5 + fldd d2 , [ AO, #16 ] fmacd d9 , d0, d5 + fldd d3 , [ AO, #24 ] KMAC_I d9 , d1, d4 + fldd d6 , [ BO, #16 ] fmacd d10 , d2, d4 + fldd d7 , [ BO, #24 ] KMAC_R d10 , d3, d5 fmacd d11 , d2, d5 KMAC_I d11 , d3, d4 @@ -228,32 +217,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmacd d14 , d2, d6 KMAC_R d14 , d3, d7 + add BO , BO, #32 fmacd d15 , d2, d7 + add AO , AO, #32 KMAC_I d15 , d3, d6 - add BO , BO, #32 - add AO , AO, #32 .endm .macro KERNEL2x2_M2 + pld [ AO, #A_PRE ] + pld [ BO, #B_PRE ] + fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] - fldd d2 , [ AO, #16 ] - fldd d3 , [ AO, #24 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] - fldd d6 , [ BO, #16 ] - fldd d7 , [ BO, #24 ] fmacd d8 , d0, d4 KMAC_R d8 , d1, d5 + fldd d2 , [ AO, #16 ] fmacd d9 , d0, d5 + fldd d3 , [ AO, #24 ] KMAC_I d9 , d1, d4 + fldd d6 , [ BO, #16 ] fmacd d10 , d2, d4 + fldd d7 , [ BO, #24 ] KMAC_R d10 , d3, d5 fmacd d11 , d2, d5 KMAC_I d11 , d3, d4 @@ -265,12 +257,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fmacd d14 , d2, d6 KMAC_R d14 , d3, d7 - fmacd d15 , d2, d7 - KMAC_I d15 , d3, d6 - add BO , BO, #32 + fmacd d15 , d2, d7 add AO , AO, #32 - + KMAC_I d15 , d3, d6 .endm From 5007a534c48c9c966c3bfd3de6b2cec1a696ccbf Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 28 Nov 2013 10:04:43 +0100 Subject: [PATCH 69/81] optimized zgemm kernel for ARMV6 --- kernel/arm/zgemm_kernel_2x2_vfp.S | 64 +++++++++++++++---------------- 1 file changed, 31 insertions(+), 33 deletions(-) diff --git a/kernel/arm/zgemm_kernel_2x2_vfp.S b/kernel/arm/zgemm_kernel_2x2_vfp.S index ad6b56ac0..8a5401858 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfp.S +++ b/kernel/arm/zgemm_kernel_2x2_vfp.S @@ -187,38 +187,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_M1 - pld [ AO, #A_PRE ] - pld [ BO, #B_PRE ] fldd d0 , [ AO ] - fldd d1 , [ AO, #8 ] - fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fmacd d8 , d0, d4 - KMAC_R d8 , d1, d5 - fldd d2 , [ AO, #16 ] + fldd d1 , [ AO, #8 ] fmacd d9 , d0, d5 + fldd d2 , [ AO, #16 ] + KMAC_R d8 , d1, d5 fldd d3 , [ AO, #24 ] KMAC_I d9 , d1, d4 fldd d6 , [ BO, #16 ] fmacd d10 , d2, d4 fldd d7 , [ BO, #24 ] - KMAC_R d10 , d3, d5 fmacd d11 , d2, d5 + KMAC_R d10 , d3, d5 + pld [ AO, #A_PRE ] KMAC_I d11 , d3, d4 + pld [ BO, #B_PRE ] fmacd d12 , d0, d6 - KMAC_R d12 , d1, d7 fmacd d13 , d0, d7 + KMAC_R d12 , d1, d7 KMAC_I d13 , d1, d6 fmacd d14 , d2, d6 - KMAC_R d14 , d3, d7 - add BO , BO, #32 fmacd d15 , d2, d7 + add BO , BO, #32 + KMAC_R d14 , d3, d7 add AO , AO, #32 KMAC_I d15 , d3, d6 @@ -227,41 +226,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL2x2_M2 - pld [ AO, #A_PRE ] - pld [ BO, #B_PRE ] - fldd d0 , [ AO ] - fldd d1 , [ AO, #8 ] - fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fmacd d8 , d0, d4 - KMAC_R d8 , d1, d5 - fldd d2 , [ AO, #16 ] + fldd d1 , [ AO, #8 ] fmacd d9 , d0, d5 + fldd d2 , [ AO, #16 ] + KMAC_R d8 , d1, d5 fldd d3 , [ AO, #24 ] KMAC_I d9 , d1, d4 fldd d6 , [ BO, #16 ] fmacd d10 , d2, d4 fldd d7 , [ BO, #24 ] - KMAC_R d10 , d3, d5 fmacd d11 , d2, d5 + KMAC_R d10 , d3, d5 + pld [ AO, #A_PRE ] KMAC_I d11 , d3, d4 + pld [ BO, #B_PRE ] fmacd d12 , d0, d6 - KMAC_R d12 , d1, d7 fmacd d13 , d0, d7 + KMAC_R d12 , d1, d7 KMAC_I d13 , d1, d6 fmacd d14 , d2, d6 - KMAC_R d14 , d3, d7 - add BO , BO, #32 fmacd d15 , d2, d7 + add BO , BO, #32 + KMAC_R d14 , d3, d7 add AO , AO, #32 KMAC_I d15 , d3, d6 + .endm @@ -305,37 +303,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_SUB fldd d0 , [ AO ] - fldd d1 , [ AO, #8 ] - fldd d2 , [ AO, #16 ] - fldd d3 , [ AO, #24 ] - fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] - fldd d6 , [ BO, #16 ] - fldd d7 , [ BO, #24 ] fmacd d8 , d0, d4 - KMAC_R d8 , d1, d5 + fldd d1 , [ AO, #8 ] fmacd d9 , d0, d5 + fldd d2 , [ AO, #16 ] + KMAC_R d8 , d1, d5 + fldd d3 , [ AO, #24 ] KMAC_I d9 , d1, d4 + fldd d6 , [ BO, #16 ] fmacd d10 , d2, d4 - KMAC_R d10 , d3, d5 + fldd d7 , [ BO, #24 ] fmacd d11 , d2, d5 + KMAC_R d10 , d3, d5 + pld [ AO, #A_PRE ] KMAC_I d11 , d3, d4 + pld [ BO, #B_PRE ] fmacd d12 , d0, d6 - KMAC_R d12 , d1, d7 fmacd d13 , d0, d7 + KMAC_R d12 , d1, d7 KMAC_I d13 , d1, d6 fmacd d14 , d2, d6 - KMAC_R d14 , d3, d7 fmacd d15 , d2, d7 - KMAC_I d15 , d3, d6 - add BO , BO, #32 + KMAC_R d14 , d3, d7 add AO , AO, #32 + KMAC_I d15 , d3, d6 .endm From 274304bd032990ba57d3b2e375e7bf916f1a1fe2 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 28 Nov 2013 11:54:38 +0100 Subject: [PATCH 70/81] add optimized cgemm kernel for ARMV6 --- kernel/arm/KERNEL.ARMV6 | 2 +- kernel/arm/cgemm_kernel_2x2_vfp.S | 1252 
+++++++++++++++++++++++++++++ 2 files changed, 1253 insertions(+), 1 deletion(-) create mode 100644 kernel/arm/cgemm_kernel_2x2_vfp.S diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 58e4d2702..3d3847f13 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -105,7 +105,7 @@ DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMKERNEL = cgemm_kernel_2x2_vfp.S CGEMMONCOPY = cgemm_ncopy_2_vfp.S CGEMMOTCOPY = cgemm_tcopy_2_vfp.S CGEMMONCOPYOBJ = cgemm_oncopy.o diff --git a/kernel/arm/cgemm_kernel_2x2_vfp.S b/kernel/arm/cgemm_kernel_2x2_vfp.S new file mode 100644 index 000000000..75fbf097b --- /dev/null +++ b/kernel/arm/cgemm_kernel_2x2_vfp.S @@ -0,0 +1,1252 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 /* bytes subtracted from sp in the prologue for the local slots below */ + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R s0 +#define OLD_ALPHA_I s1 /* OLD_*: registers holding the incoming arguments; the prologue spills them to the frame slots */ + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define A [fp, #-248 ] /* frame slots at negative fp offsets, written by the prologue */ +#define LDC [fp, #-252 ] /* holds OLD_LDC shifted left by 3, i.e. the C stride in bytes */ +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define ALPHA_R [fp, #-280] + +#define B [fp, #4 ] /* B, C and OLD_LDC are read at positive fp offsets, above the registers pushed by the prologue */ +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 /* I, J, L are loop counters; they reuse r0-r2 after the arguments have been spilled */ +#define J r1 +#define L r2 + +#define AO r5 /* AO, BO: running read pointers for A and B, advanced by the KERNEL macros */ +#define BO r6 + +#define CO1 r8 /* CO1: current write position in C; CO2 is set to CO1 plus LDC by the SAVE macros that use it */ +#define CO2 r9 + +#define K1 r7 /* K1 caches K; BC is the start of the current B block (BO is reset from it) */ +#define BC r12 + +#define A_PRE 96 /* A_PRE, B_PRE: byte offsets used with pld on AO and BO */ +#define B_PRE 96 +#define C_PRE 64 + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) /* each build variant picks add (fmacs) or subtract (fnmacs) for the KMAC_ and FMAC_ steps */ + + #define KMAC_R
fnmacs + #define KMAC_I fmacs /* fmacs adds the product to the accumulator, fnmacs subtracts it; KMAC_* are used in the KERNEL macros */ + + #define FMAC_R1 fmacs /* FMAC_* are used in the SAVE macros when alpha is applied */ + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#elif defined(CN) || defined(CT) + + #define KMAC_R fmacs + #define KMAC_I fnmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#elif defined(NC) || defined(TC) + + #define KMAC_R fmacs + #define KMAC_I fnmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#else + + #define KMAC_R fnmacs + #define KMAC_I fmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#endif + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT2x2 /* clear the eight accumulators s8-s15 used by the 2x2 kernels */ + + mov r3 , #0 /* r3 is scratch at the call site (every later use reloads it); integer 0 is the bit pattern of +0.0f */ + vmov s8 , s9 , r3 , r3 /* unlike x - x this gives 0 even when the old register value is NaN or Inf */ + vmov.f32 s10, s8 + vmov.f32 s11, s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + vmov.f32 s14, s8 + vmov.f32 s15, s8 + +.endm + +.macro KERNEL2x2_I /* first step of a run: fmuls overwrites s8-s15, so no INIT2x2 is needed before it */ + + pld [ AO, #A_PRE ] + fldmias AO!, { s0 - s3 } /* 4 floats from A, AO advances 16 bytes */ + pld [ BO, #B_PRE ] + fldmias BO!, { s4 - s7 } /* 4 floats from B, BO advances 16 bytes */ + + + fmuls s8 , s0, s4 + fmuls s9 , s0, s5 + fmuls s10 , s2, s4 + fmuls s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmuls s12 , s0, s6 + fmuls s13 , s0, s7 + fmuls s14 , s2, s6 + fmuls s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + +.endm + + + +.macro KERNEL2x2_M1 /* accumulate step with pld prefetch of AO and BO */ + + pld [ AO, #A_PRE ] + fldmias AO!, { s0 - s3 } + pld [ BO, #B_PRE ] + fldmias BO!, { s4 - s7 } + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmacs s12 , s0, s6 + fmacs s13 , s0, s7 + fmacs s14 , s2, s6 + fmacs s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 ,
s3, s6 + +.endm + +.macro KERNEL2x2_M2 /* accumulate step without prefetch: 4 floats from AO and 4 from BO, both post-incremented */ + + fldmias AO!, { s0 - s3 } + fldmias BO!, { s4 - s7 } + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmacs s12 , s0, s6 + fmacs s13 , s0, s7 + fmacs s14 , s2, s6 + fmacs s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + + +.endm + + +.macro KERNEL2x2_E /* closes an unrolled run; same instructions as KERNEL2x2_M2 */ + + fldmias AO!, { s0 - s3 } + fldmias BO!, { s4 - s7 } + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmacs s12 , s0, s6 + fmacs s13 , s0, s7 + fmacs s14 , s2, s6 + fmacs s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + + +.endm + +.macro KERNEL2x2_SUB /* single step used by the K % 8 remainder loop; same instructions as KERNEL2x2_M2 */ + + fldmias AO!, { s0 - s3 } + fldmias BO!, { s4 - s7 } + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmacs s12 , s0, s6 + fmacs s13 , s0, s7 + fmacs s14 , s2, s6 + fmacs s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + + +.endm + + +.macro SAVE2x2 /* adds alpha times the accumulators s8-s15 to four floats at CO1 and four at CO2; clobbers r3 */ + + ldr r3 , LDC + add CO2 , CO1, r3 /* CO2 is CO1 plus the byte stride LDC */ + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s7 } /* no writeback: CO1 is advanced once at the end */ + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + FMAC_R1 s6 , s0 , s10 + FMAC_I1 s7 , s0 , s11 + FMAC_R2 s6 , s1 , s11 + FMAC_I2 s7 , s1 , s10 + + fstmias CO1, { s4 - s7 } + + fldmias CO2, { s4 - s7 } + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + FMAC_R1 s6 , s0 , s14 + FMAC_I1 s7 , s0 , s15 + FMAC_R2 s6 , s1 , s15 + FMAC_I2 s7 , s1 , s14 + + fstmias CO2, { s4 - s7 } + + add CO1, CO1, #16 /* step past the four floats just written */ + +.endm +
+/******************************************************************************/ + +.macro INIT1x2 /* clear the accumulators s8, s9, s12, s13 used by the 1x2 kernels */ + + mov r3 , #0 /* r3 is scratch at the call site (SAVE1x2 reloads it); integer 0 is the bit pattern of +0.0f */ + vmov s8 , s9 , r3 , r3 /* unlike x - x this gives 0 even when the old register value is NaN or Inf */ + vmov.f32 s12, s8 + vmov.f32 s13, s8 + +.endm + +.macro KERNEL1x2_I /* starts s8, s9, s12, s13 with fmuls from 2 floats of A and 4 floats of B */ + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmuls s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmuls s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmuls s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmuls s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + +.endm + + + +.macro KERNEL1x2_M1 /* accumulate step: 2 floats from AO, 4 from BO; AO advances 8 bytes, BO 16 */ + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmacs s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + +.endm + +.macro KERNEL1x2_M2 /* same instructions as KERNEL1x2_M1 */ + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmacs s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + + +.endm + + +.macro KERNEL1x2_E /* same instructions as KERNEL1x2_M1 */ + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmacs s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + +.endm + +.macro KERNEL1x2_SUB /* accumulate step used by the 1x2 loops; same instructions as KERNEL1x2_M1 */ + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s12 , s0, s6 + KMAC_R s12 , s1, s7
+ fmacs s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + + +.endm + + +.macro SAVE1x2 /* adds alpha times s8, s9 to two floats at CO1 and alpha times s12, s13 to two floats at CO2; clobbers r3 */ + + ldr r3 , LDC + add CO2 , CO1, r3 /* CO2 is CO1 plus the byte stride LDC */ + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s5 } + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + fstmias CO1, { s4 - s5 } + + fldmias CO2, { s4 - s5 } + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + fstmias CO2, { s4 - s5 } + + add CO1, CO1, #8 /* step past the two floats just written */ + +.endm + + +/******************************************************************************/ + +.macro INIT2x1 /* clear the accumulators s8-s11 used by the 2x1 kernels */ + + mov r3 , #0 /* r3 is scratch at the call site; integer 0 is the bit pattern of +0.0f */ + vmov s8 , s9 , r3 , r3 /* unlike x - x this gives 0 even when the old register value is NaN or Inf */ + vmov.f32 s10, s8 + vmov.f32 s11, s8 + +.endm + +.macro KERNEL2x1_I /* starts s8-s11 with fmuls from 4 floats of A and 2 floats of B */ + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmuls s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmuls s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmuls s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmuls s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + +.endm + + + +.macro KERNEL2x1_M1 /* accumulate step: 4 floats from AO, 2 from BO; AO advances 16 bytes, BO 8 */ + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmacs s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + +.endm + +.macro KERNEL2x1_M2 /* same instructions as KERNEL2x1_M1 */ + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmacs s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + + +.endm + + +.macro KERNEL2x1_E /* closes an unrolled 2x1 run */ + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO
] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmacs s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + +.endm + +.macro KERNEL2x1_SUB /* single accumulate step used by the K % 8 remainder loop: 4 floats from AO, 2 from BO */ + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmacs s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + + +.endm + + +.macro SAVE2x1 /* adds alpha times the accumulators s8-s11 to four floats at CO1, then advances CO1 by 16 bytes */ + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s7 } + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + FMAC_R1 s6 , s0 , s10 + FMAC_I1 s7 , s0 , s11 + FMAC_R2 s6 , s1 , s11 + FMAC_I2 s7 , s1 , s10 + + fstmias CO1, { s4 - s7 } + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT1x1 /* clear the accumulators s8, s9 used by the 1x1 kernel */ + + mov r3 , #0 /* NOTE(review): assumes r3 is dead where INIT1x1 is expanded, as at the other INIT call sites - confirm */ + vmov s8 , s9 , r3 , r3 /* integer 0 is +0.0f; unlike x - x this gives 0 even when the old register value is NaN or Inf */ + +.endm + +.macro KERNEL1x1_I /* starts s8, s9 with fmuls from 2 floats of A and 2 floats of B */ + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmuls s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmuls s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + +.endm + + + +.macro KERNEL1x1_M1 /* accumulate step: 2 floats from AO, 2 from BO; both pointers advance 8 bytes */ + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + +.endm + +.macro KERNEL1x1_M2 /* same instructions as KERNEL1x1_M1 */ + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + + +.endm + + +.macro KERNEL1x1_E /* closes an unrolled 1x1 run */ + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R
s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + +.endm + +.macro KERNEL1x1_SUB /* single accumulate step into s8, s9: 2 floats from AO, 2 from BO; both pointers advance 8 bytes */ + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + + +.endm + + +.macro SAVE1x1 /* adds alpha times the accumulators s8, s9 to two floats at CO1, then advances CO1 by 8 bytes */ + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s5 } /* no writeback: CO1 is advanced once at the end */ + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + fstmias CO1, { s4 - s5 } + + add CO1, CO1, #8 + +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, ALPHA_I + + sub r3, fp, #128 + vstm r3, { s8 - s15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 4 * 2 + str r3, LDC + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble cgemm_kernel_L1_BEGIN + +cgemm_kernel_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +cgemm_kernel_L2_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble cgemm_kernel_L2_M1_BEGIN + +cgemm_kernel_L2_M2_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt cgemm_kernel_L2_M2_30 + .align 5 + + + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +cgemm_kernel_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 +
KERNEL2x2_M2 + + subs L, L, #1 + bgt cgemm_kernel_L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b cgemm_kernel_L2_M2_44 + + +cgemm_kernel_L2_M2_30: + tst L, #3 + ble cgemm_kernel_L2_M2_40 + + tst L, #2 + ble cgemm_kernel_L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b cgemm_kernel_L2_M2_44 + +cgemm_kernel_L2_M2_32: + + tst L, #1 + ble cgemm_kernel_L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b cgemm_kernel_L2_M2_44 + + +cgemm_kernel_L2_M2_40: + + INIT2x2 + + +cgemm_kernel_L2_M2_44: + + ands L , K1, #7 // L = L % 8 + ble cgemm_kernel_L2_M2_100 + +cgemm_kernel_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne cgemm_kernel_L2_M2_46 + +cgemm_kernel_L2_M2_100: + + SAVE2x2 + +cgemm_kernel_L2_M2_END: + + subs I, I, #1 + bne cgemm_kernel_L2_M2_20 + + +cgemm_kernel_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble cgemm_kernel_L2_END + +cgemm_kernel_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble cgemm_kernel_L2_M1_40 + +cgemm_kernel_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt cgemm_kernel_L2_M1_22 + + +cgemm_kernel_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble cgemm_kernel_L2_M1_100 + +cgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt cgemm_kernel_L2_M1_42 + +cgemm_kernel_L2_M1_100: + + SAVE1x2 + + +cgemm_kernel_L2_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #4 // k * 2 * 4 * 2 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + + subs J , #1 // j-- + bgt cgemm_kernel_L2_BEGIN + + + 
+/*********************************************************************************************/ + +cgemm_kernel_L1_BEGIN: + + ldr J , N + tst J , #1 + ble cgemm_kernel_L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + +cgemm_kernel_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble cgemm_kernel_L1_M1_BEGIN + +cgemm_kernel_L1_M2_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt cgemm_kernel_L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + +cgemm_kernel_L1_M2_22: + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, L, #1 + bgt cgemm_kernel_L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b cgemm_kernel_L1_M2_44 + + +cgemm_kernel_L1_M2_30: + tst L, #3 + ble cgemm_kernel_L1_M2_40 + + tst L, #2 + ble cgemm_kernel_L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b cgemm_kernel_L1_M2_44 + +cgemm_kernel_L1_M2_32: + + tst L, #1 + ble cgemm_kernel_L1_M2_40 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b cgemm_kernel_L1_M2_44 + + +cgemm_kernel_L1_M2_40: + + INIT2x1 + + +cgemm_kernel_L1_M2_44: + + ands L , K1, #7 // L = L % 8 + ble cgemm_kernel_L1_M2_100 + +cgemm_kernel_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, #1 + bne cgemm_kernel_L1_M2_46 + +cgemm_kernel_L1_M2_100: + + SAVE2x1 + +cgemm_kernel_L1_M2_END: + + subs I, I, #1 + bne cgemm_kernel_L1_M2_20 + + +cgemm_kernel_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I 
= I % 2 + ble cgemm_kernel_L1_END + +cgemm_kernel_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble cgemm_kernel_L1_M1_40 + +cgemm_kernel_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt cgemm_kernel_L1_M1_22 + + +cgemm_kernel_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble cgemm_kernel_L1_M1_100 + +cgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt cgemm_kernel_L1_M1_42 + +cgemm_kernel_L1_M1_100: + + SAVE1x1 + + +cgemm_kernel_L1_END: + + + +cgemm_kernel_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + From dec7ad0dfd2738ac708aae408d16e280ee06b5cc Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 28 Nov 2013 12:32:12 +0100 Subject: [PATCH 71/81] optimized dtrmm kernel for ARMV7 --- kernel/arm/dtrmm_kernel_4x2_vfp.S | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/kernel/arm/dtrmm_kernel_4x2_vfp.S b/kernel/arm/dtrmm_kernel_4x2_vfp.S index 55a017a97..762b9c580 100644 --- a/kernel/arm/dtrmm_kernel_4x2_vfp.S +++ b/kernel/arm/dtrmm_kernel_4x2_vfp.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/11/23 Saar +* 2013/11/28 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -106,25 +106,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL4x2_SUB fldd d4 , [ BO ] - fldd d5 , [ BO, #8 ] - fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] - fldd d2 , [ AO, #16 ] - fldd d3 , [ AO, #24 ] + pld [ AO , #A_PRE ] fmacd d8 , d0, d4 + fldd d2 , [ AO, #16 ] fmacd d9 , d1, d4 + fldd d3 , [ AO, #24 ] fmacd d10 , d2, d4 + fldd d5 , [ BO, #8 ] fmacd d11 , d3, d4 fmacd d12 , d0, d5 fmacd d13 , d1, d5 + add AO , AO, #32 fmacd d14 , d2, d5 + add BO , BO, #16 fmacd d15 , d3, d5 - add AO , AO, #32 - add BO , BO, #16 .endm @@ -490,13 +491,18 @@ _L2_M4_20: .align 5 _L2_M4_22: + + pld [ BO , #B_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB + pld [ BO , #B_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB + pld [ BO , #B_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB + pld [ BO , #B_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB From 5bc322a66c1058eb1d600511ee7ee36e40633172 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 28 Nov 2013 12:45:38 +0100 Subject: [PATCH 72/81] optimized strmm kernel for ARMV6 --- kernel/arm/strmm_kernel_4x2_vfp.S | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/kernel/arm/strmm_kernel_4x2_vfp.S b/kernel/arm/strmm_kernel_4x2_vfp.S index 5394a6444..ab5ff7fa2 100644 --- a/kernel/arm/strmm_kernel_4x2_vfp.S +++ b/kernel/arm/strmm_kernel_4x2_vfp.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/11/23 Saar +* 2013/11/28 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -105,13 +105,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x2_SUB - flds s4 , [ BO ] - flds s5 , [ BO, #4 ] - - flds s0 , [ AO ] - flds s1 , [ AO, #4 ] - flds s2 , [ AO, #8 ] - flds s3 , [ AO, #12 ] + fldmias AO!, { s0 - s3 } + fldmias BO!, { s4 - s5 } fmacs s8 , s0, s4 fmacs s9 , s1, s4 @@ -123,9 +118,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fmacs s14 , s2, s5 fmacs s15 , s3, s5 - add AO , AO, #16 - add BO , BO, #8 - .endm .macro SAVE4x2 @@ -490,13 +482,19 @@ _L2_M4_20: .align 5 _L2_M4_22: + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB + pld [ AO , #A_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB + pld [ AO , #A_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB From 42a4dff0568d679c942cac8722cf01185ae97c21 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 28 Nov 2013 13:41:06 +0100 Subject: [PATCH 73/81] added optimized ztrmm kernel for ARMV6 --- kernel/arm/KERNEL.ARMV6 | 2 +- kernel/arm/ztrmm_kernel_2x2_vfp.S | 1537 +++++++++++++++++++++++++++++ 2 files changed, 1538 insertions(+), 1 deletion(-) create mode 100644 kernel/arm/ztrmm_kernel_2x2_vfp.S diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 3d3847f13..9ee6070fd 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -83,7 +83,7 @@ ZGEMVTKERNEL = zgemv_t.c STRMMKERNEL = strmm_kernel_4x2_vfp.S DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S SGEMMKERNEL = sgemm_kernel_4x2_vfp.S SGEMMINCOPY = sgemm_ncopy_4_vfp.S diff --git a/kernel/arm/ztrmm_kernel_2x2_vfp.S b/kernel/arm/ztrmm_kernel_2x2_vfp.S new file mode 100644 index 000000000..59039c32f --- /dev/null +++ b/kernel/arm/ztrmm_kernel_2x2_vfp.S @@ -0,0 +1,1537 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R d0 +#define OLD_ALPHA_I d1 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define KKK [fp, #-240] +#define KK [fp, #-244 ] +#define A [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define ALPHA_R [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define KMAC_R fnmacd + #define KMAC_I fmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fmacd + +#elif defined(CN) || defined(CT) + + #define KMAC_R fmacd + #define KMAC_I fnmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fmacd + +#elif defined(NC) || defined(TC) + + #define KMAC_R fmacd + #define KMAC_I fnmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fmacd + 
#define FMAC_I1 fnmacd + #define FMAC_I2 fmacd + +#else + + #define KMAC_R fnmacd + #define KMAC_I fmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fmacd + +#endif + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT2x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + vmov.f64 d14, d8 + vmov.f64 d15, d8 + +.endm + +.macro KERNEL2x2_I + pld [ AO, #A_PRE ] + pld [ BO, #B_PRE ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmuld d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmuld d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmuld d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmuld d11 , d2, d5 + KMAC_I d11 , d3, d4 + + fmuld d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmuld d13 , d0, d7 + KMAC_I d13 , d1, d6 + + fmuld d14 , d2, d6 + KMAC_R d14 , d3, d7 + fmuld d15 , d2, d7 + KMAC_I d15 , d3, d6 + + add BO , BO, #32 + add AO , AO, #32 + +.endm + + + +.macro KERNEL2x2_M1 + + + fldd d0 , [ AO ] + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + fldd d1 , [ AO, #8 ] + fmacd d9 , d0, d5 + fldd d2 , [ AO, #16 ] + KMAC_R d8 , d1, d5 + fldd d3 , [ AO, #24 ] + KMAC_I d9 , d1, d4 + + fldd d6 , [ BO, #16 ] + fmacd d10 , d2, d4 + fldd d7 , [ BO, #24 ] + fmacd d11 , d2, d5 + KMAC_R d10 , d3, d5 + pld [ AO, #A_PRE ] + KMAC_I d11 , d3, d4 + + pld [ BO, #B_PRE ] + fmacd d12 , d0, d6 + fmacd d13 , d0, d7 + KMAC_R d12 , d1, d7 + KMAC_I d13 , d1, d6 + + fmacd d14 , d2, d6 + fmacd d15 , d2, d7 + add BO , BO, #32 + KMAC_R d14 , d3, d7 + add AO , AO, #32 + KMAC_I d15 , d3, d6 + + +.endm + +.macro KERNEL2x2_M2 + + fldd d0 , [ AO ] + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd 
d8 , d0, d4 + fldd d1 , [ AO, #8 ] + fmacd d9 , d0, d5 + fldd d2 , [ AO, #16 ] + KMAC_R d8 , d1, d5 + fldd d3 , [ AO, #24 ] + KMAC_I d9 , d1, d4 + + fldd d6 , [ BO, #16 ] + fmacd d10 , d2, d4 + fldd d7 , [ BO, #24 ] + fmacd d11 , d2, d5 + KMAC_R d10 , d3, d5 + pld [ AO, #A_PRE ] + KMAC_I d11 , d3, d4 + + pld [ BO, #B_PRE ] + fmacd d12 , d0, d6 + fmacd d13 , d0, d7 + KMAC_R d12 , d1, d7 + KMAC_I d13 , d1, d6 + + fmacd d14 , d2, d6 + fmacd d15 , d2, d7 + add BO , BO, #32 + KMAC_R d14 , d3, d7 + add AO , AO, #32 + KMAC_I d15 , d3, d6 + + +.endm + + +.macro KERNEL2x2_E + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + fmacd d14 , d2, d6 + KMAC_R d14 , d3, d7 + fmacd d15 , d2, d7 + KMAC_I d15 , d3, d6 + + add BO , BO, #32 + add AO , AO, #32 + +.endm + +.macro KERNEL2x2_SUB + + fldd d0 , [ AO ] + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + fldd d1 , [ AO, #8 ] + fmacd d9 , d0, d5 + fldd d2 , [ AO, #16 ] + KMAC_R d8 , d1, d5 + fldd d3 , [ AO, #24 ] + KMAC_I d9 , d1, d4 + + fldd d6 , [ BO, #16 ] + fmacd d10 , d2, d4 + fldd d7 , [ BO, #24 ] + fmacd d11 , d2, d5 + KMAC_R d10 , d3, d5 + pld [ AO, #A_PRE ] + KMAC_I d11 , d3, d4 + + pld [ BO, #B_PRE ] + fmacd d12 , d0, d6 + fmacd d13 , d0, d7 + KMAC_R d12 , d1, d7 + KMAC_I d13 , d1, d6 + + fmacd d14 , d2, d6 + fmacd d15 , d2, d7 + add BO , BO, #32 + KMAC_R d14 , d3, d7 + add AO , AO, #32 + KMAC_I d15 , d3, d6 + + +.endm + + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + vsub.f64 d4, d4 , d4 + vsub.f64 d5, d5 , d5 + vsub.f64 d6, d6 , d6 + vsub.f64 d7, d7 , d7 + + 
FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + FMAC_R1 d6 , d0 , d10 + FMAC_I1 d7 , d0 , d11 + FMAC_R2 d6 , d1 , d11 + FMAC_I2 d7 , d1 , d10 + + fstmiad CO1, { d4 - d7 } + + vsub.f64 d4, d4 , d4 + vsub.f64 d5, d5 , d5 + vsub.f64 d6, d6 , d6 + vsub.f64 d7, d7 , d7 + + FMAC_R1 d4 , d0 , d12 + FMAC_I1 d5 , d0 , d13 + FMAC_R2 d4 , d1 , d13 + FMAC_I2 d5 , d1 , d12 + + FMAC_R1 d6 , d0 , d14 + FMAC_I1 d7 , d0 , d15 + FMAC_R2 d6 , d1 , d15 + FMAC_I2 d7 , d1 , d14 + + fstmiad CO2, { d4 - d7 } + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + +.endm + +.macro KERNEL1x2_I + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmuld d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmuld d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmuld d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmuld d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + +.endm + + + +.macro KERNEL1x2_M1 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + +.endm + +.macro KERNEL1x2_M2 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + + +.endm + + +.macro KERNEL1x2_E + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + 
fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + +.endm + +.macro KERNEL1x2_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + + +.endm + + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + vsub.f64 d4, d4 , d4 + vsub.f64 d5, d5 , d5 + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + fstmiad CO1, { d4 - d5 } + + vsub.f64 d4, d4 , d4 + vsub.f64 d5, d5 , d5 + + FMAC_R1 d4 , d0 , d12 + FMAC_I1 d5 , d0 , d13 + FMAC_R2 d4 , d1 , d13 + FMAC_I2 d5 , d1 , d12 + + fstmiad CO2, { d4 - d5 } + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + +.endm + +.macro KERNEL2x1_I + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmuld d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmuld d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmuld d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmuld d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + +.endm + + + +.macro KERNEL2x1_M1 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + 
KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + +.endm + +.macro KERNEL2x1_M2 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + + +.endm + + +.macro KERNEL2x1_E + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + +.endm + +.macro KERNEL2x1_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + + +.endm + + +.macro SAVE2x1 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + vsub.f64 d4, d4 , d4 + vsub.f64 d5, d5 , d5 + vsub.f64 d6, d6 , d6 + vsub.f64 d7, d7 , d7 + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + FMAC_R1 d6 , d0 , d10 + FMAC_I1 d7 , d0 , d11 + FMAC_R2 d6 , d1 , d11 + FMAC_I2 d7 , d1 , d10 + + fstmiad CO1, { d4 - d7 } + + add CO1, CO1, #32 + +.endm + + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + +.endm + +.macro KERNEL1x1_I + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmuld d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmuld d9 , d0, d5 + 
KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + +.endm + + + +.macro KERNEL1x1_M1 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + +.endm + +.macro KERNEL1x1_M2 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + + +.endm + + +.macro KERNEL1x1_E + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + +.endm + +.macro KERNEL1x1_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + + +.endm + + +.macro SAVE1x1 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + vsub.f64 d4, d4 , d4 + vsub.f64 d5, d5 , d5 + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + fstmiad CO1, { d4 - d5 } + + add CO1, CO1, #16 + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, ALPHA_I + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #4 // ldc = ldc * 8 * 2 + str r3, LDC + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif + str r3 , KK + + ldr BC, B + + ldr J, 
N + asrs J, J, #1 // J = J / 2 + ble _L1_BEGIN + +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L2_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L2_M2_30 + .align 5 + + + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt _L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_30: + tst L, #3 + ble _L2_M2_40 + + tst L, #2 + ble _L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + +_L2_M2_32: + + tst L, #1 + ble _L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 
+ KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_40: + + INIT2x2 + + +_L2_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne _L2_M2_46 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + + +_L2_M2_END: + + subs I, I, #1 + bne _L2_M2_20 + + +_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 
double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #5 // k * 2 * 8 * 2 + add r3, r3, r4 // B = B + K * 4 * 8 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in BO + str r3 , KK +#endif + + + subs J , #1 // j-- + bgt _L2_BEGIN + + + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + + ldr AO, A // AO = A + +_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + +_L1_M2_22: + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, L, #1 + bgt _L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_30: + tst L, #3 + 
ble _L1_M2_40 + + tst L, #2 + ble _L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + +_L1_M2_32: + + tst L, #1 + ble _L1_M2_40 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_40: + + INIT2x1 + + +_L1_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, #1 + bne _L1_M2_46 + +_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L1_M2_END: + + subs I, I, #1 + bne _L1_M2_20 + + +_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + 
KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L1_END: + + + +_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + From 86afb47e8365f012f1c0fbd8fa6c1a639aa7a8fb Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 28 Nov 2013 14:35:07 +0100 Subject: [PATCH 74/81] added optimized ctrmm kernel for ARMV6 --- kernel/arm/KERNEL.ARMV6 | 2 +- kernel/arm/ctrmm_kernel_2x2_vfp.S | 1455 +++++++++++++++++++++++++++++ 2 files changed, 1456 insertions(+), 1 deletion(-) create mode 100644 kernel/arm/ctrmm_kernel_2x2_vfp.S diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 9ee6070fd..e79ea65d1 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -82,7 +82,7 @@ ZGEMVTKERNEL = zgemv_t.c STRMMKERNEL = strmm_kernel_4x2_vfp.S DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S -CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S SGEMMKERNEL = sgemm_kernel_4x2_vfp.S diff --git a/kernel/arm/ctrmm_kernel_2x2_vfp.S b/kernel/arm/ctrmm_kernel_2x2_vfp.S new file mode 100644 index 000000000..a68434f97 --- /dev/null +++ b/kernel/arm/ctrmm_kernel_2x2_vfp.S @@ -0,0 +1,1455 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/10/16 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R s0 +#define OLD_ALPHA_I s1 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define KKK [fp, #-240] +#define KK [fp, #-244 ] +#define A [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define ALPHA_R [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define KMAC_R fnmacs + #define KMAC_I fmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#elif defined(CN) || defined(CT) + + #define KMAC_R fmacs + #define KMAC_I fnmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#elif defined(NC) || defined(TC) + + #define KMAC_R fmacs + #define KMAC_I fnmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + 
#define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#else + + #define KMAC_R fnmacs + #define KMAC_I fmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#endif + + +.macro INIT2x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + vmov.f32 s14, s8 + vmov.f32 s15, s8 + +.endm + +.macro KERNEL2x2_I + + pld [ AO, #A_PRE ] + fldmias AO!, { s0 - s3 } + pld [ BO, #B_PRE ] + fldmias BO!, { s4 - s7 } + + + fmuls s8 , s0, s4 + fmuls s9 , s0, s5 + fmuls s10 , s2, s4 + fmuls s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmuls s12 , s0, s6 + fmuls s13 , s0, s7 + fmuls s14 , s2, s6 + fmuls s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + +.endm + + + +.macro KERNEL2x2_M1 + + pld [ AO, #A_PRE ] + fldmias AO!, { s0 - s3 } + pld [ BO, #B_PRE ] + fldmias BO!, { s4 - s7 } + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmacs s12 , s0, s6 + fmacs s13 , s0, s7 + fmacs s14 , s2, s6 + fmacs s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + +.endm + +.macro KERNEL2x2_M2 + + fldmias AO!, { s0 - s3 } + fldmias BO!, { s4 - s7 } + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmacs s12 , s0, s6 + fmacs s13 , s0, s7 + fmacs s14 , s2, s6 + fmacs s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + + +.endm + + +.macro KERNEL2x2_E + + fldmias AO!, { s0 - s3 } + fldmias BO!, { s4 - s7 } + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 
, s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmacs s12 , s0, s6 + fmacs s13 , s0, s7 + fmacs s14 , s2, s6 + fmacs s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + + +.endm + +.macro KERNEL2x2_SUB + + fldmias AO!, { s0 - s3 } + fldmias BO!, { s4 - s7 } + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmacs s12 , s0, s6 + fmacs s13 , s0, s7 + fmacs s14 , s2, s6 + fmacs s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + vsub.f32 s4, s4, s4 + vsub.f32 s5, s5, s5 + vsub.f32 s6, s6, s6 + vsub.f32 s7, s7, s7 + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + FMAC_R1 s6 , s0 , s10 + FMAC_I1 s7 , s0 , s11 + FMAC_R2 s6 , s1 , s11 + FMAC_I2 s7 , s1 , s10 + + fstmias CO1, { s4 - s7 } + + vsub.f32 s4, s4, s4 + vsub.f32 s5, s5, s5 + vsub.f32 s6, s6, s6 + vsub.f32 s7, s7, s7 + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + FMAC_R1 s6 , s0 , s14 + FMAC_I1 s7 , s0 , s15 + FMAC_R2 s6 , s1 , s15 + FMAC_I2 s7 , s1 , s14 + + fstmias CO2, { s4 - s7 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + +.endm + +.macro KERNEL1x2_I + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmuls s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmuls s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmuls s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmuls s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + 
+.endm + + + +.macro KERNEL1x2_M1 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmacs s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + +.endm + +.macro KERNEL1x2_M2 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmacs s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + + +.endm + + +.macro KERNEL1x2_E + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmacs s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + +.endm + +.macro KERNEL1x2_SUB + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmacs s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + + +.endm + + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + vsub.f32 s4, s4, s4 + vsub.f32 s5, s5, s5 + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + fstmias CO1, { s4 - s5 } + + vsub.f32 s4, s4, s4 + vsub.f32 s5, s5, s5 + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + fstmias CO2, { s4 - s5 } + + add CO1, CO1, #8 + +.endm + 
+ +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + +.endm + +.macro KERNEL2x1_I + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmuls s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmuls s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmuls s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmuls s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + +.endm + + + +.macro KERNEL2x1_M1 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmacs s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + +.endm + +.macro KERNEL2x1_M2 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmacs s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + + +.endm + + +.macro KERNEL2x1_E + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmacs s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + +.endm + +.macro KERNEL2x1_SUB + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s10 , s2, s4 + KMAC_R s10 , s3, 
s5 + fmacs s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + + +.endm + + +.macro SAVE2x1 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + vsub.f32 s4, s4, s4 + vsub.f32 s5, s5, s5 + vsub.f32 s6, s6, s6 + vsub.f32 s7, s7, s7 + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + FMAC_R1 s6 , s0 , s10 + FMAC_I1 s7 , s0 , s11 + FMAC_R2 s6 , s1 , s11 + FMAC_I2 s7 , s1 , s10 + + fstmias CO1, { s4 - s7 } + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + +.endm + +.macro KERNEL1x1_I + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmuls s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmuls s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + +.endm + + + +.macro KERNEL1x1_M1 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + +.endm + +.macro KERNEL1x1_M2 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + + +.endm + + +.macro KERNEL1x1_E + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + +.endm + +.macro KERNEL1x1_SUB + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + + +.endm + + +.macro SAVE1x1 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + vsub.f32 s4, s4, s4 + vsub.f32 s5, s5, s5 + + FMAC_R1 s4 , 
s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + fstmias CO1, { s4 - s5 } + + add CO1, CO1, #8 + +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, ALPHA_I + + sub r3, fp, #128 + vstm r3, { s8 - s15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 4 * 2 + str r3, LDC + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif + str r3 , KK + + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble _L1_BEGIN + +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L2_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L2_M2_30 + .align 5 + + + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + 
KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt _L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_30: + tst L, #3 + ble _L2_M2_40 + + tst L, #2 + ble _L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + +_L2_M2_32: + + tst L, #1 + ble _L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_40: + + INIT2x2 + + +_L2_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne _L2_M2_46 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + +_L2_M2_END: + + subs I, I, #1 + bne _L2_M2_20 + + +_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK 
+ sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #4 // k * 2 * 4 * 2 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in BO + str r3 , KK +#endif + + subs J , #1 // j-- + bgt _L2_BEGIN + + + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + +_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || 
(!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + +_L1_M2_22: + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, L, #1 + bgt _L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_30: + tst L, #3 + ble _L1_M2_40 + + tst L, #2 + ble _L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + +_L1_M2_32: + + tst L, #1 + ble _L1_M2_40 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_40: + + INIT2x1 + + +_L1_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, #1 + bne _L1_M2_46 + +_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L1_M2_END: + + subs I, I, #1 + bne _L1_M2_20 + + +_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + 
INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +_L1_END: + + + +_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + From d54a06171351f545cebca73737a102c88cb1ff74 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 28 Nov 2013 17:40:21 +0100 Subject: [PATCH 75/81] optimized gemv_n_vfp.S --- kernel/arm/gemv_n_vfp.S | 159 ++++++++++++++++++++++++++++------------ 1 file changed, 112 insertions(+), 47 deletions(-) diff --git a/kernel/arm/gemv_n_vfp.S b/kernel/arm/gemv_n_vfp.S index 47265994c..f1cf9a05e 100644 --- a/kernel/arm/gemv_n_vfp.S +++ b/kernel/arm/gemv_n_vfp.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** -* 2013/11/24 Saar +* 2013/11/28 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -74,44 +74,75 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) -.macro INIT_F4 +.macro INIT_F8 pld [ YO , #Y_PRE ] + pld [ YO , #Y_PRE+32 ] - vsub.f64 d12 , d12 , d12 - vmov.f64 d13 , d12 - vmov.f64 d14 , d12 - vmov.f64 d15 , d12 + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + vmov.f64 d10 , d8 + vmov.f64 d11 , d8 + vmov.f64 d12 , d8 + vmov.f64 d13 , d8 + vmov.f64 d14 , d8 + vmov.f64 d15 , d8 .endm -.macro KERNEL_F4X4 +.macro KERNEL_F8X8 pld [ XO , #X_PRE ] - KERNEL_F4X1 - KERNEL_F4X1 - KERNEL_F4X1 - KERNEL_F4X1 + KERNEL_F8X1 + KERNEL_F8X1 + KERNEL_F8X1 + KERNEL_F8X1 + + pld [ XO , #X_PRE ] + KERNEL_F8X1 + KERNEL_F8X1 + KERNEL_F8X1 + KERNEL_F8X1 .endm -.macro KERNEL_F4X1 +.macro KERNEL_F8X1 - fldmiad XO! , { d2 } - fldmiad AO1 , { d8 - d11 } - - vmla.f64 d12 , d2 , d8 pld [ AO2 , #A_PRE ] - vmla.f64 d13 , d2 , d9 + fldmiad XO! , { d2 } + fldmiad AO1 , { d4 - d7 } + + vmla.f64 d8 , d2 , d4 + pld [ AO2 , #4*SIZE ] + vmla.f64 d9 , d2 , d5 + add r3, AO1, #4*SIZE + vmla.f64 d10 , d2 , d6 + vmla.f64 d11 , d2 , d7 + + + fldmiad r3 , { d4 - d7 } + + vmla.f64 d12 , d2 , d4 + vmla.f64 d13 , d2 , d5 add AO1, AO1, LDA - vmla.f64 d14 , d2 , d10 - vmla.f64 d15 , d2 , d11 + vmla.f64 d14 , d2 , d6 add AO2, AO2, LDA + vmla.f64 d15 , d2 , d7 + .endm -.macro SAVE_F4 +.macro SAVE_F8 + + fldmiad YO, { d4 - d7 } + + vmla.f64 d4 , d0, d8 + vmla.f64 d5 , d0, d9 + vmla.f64 d6 , d0, d10 + vmla.f64 d7 , d0, d11 + + fstmiad YO!, { d4 - d7 } fldmiad YO, { d4 - d7 } @@ -244,43 +275,73 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else /************************* SINGLE PRECISION *****************************************/ -.macro INIT_F4 +.macro INIT_F8 pld [ YO , #Y_PRE ] - vsub.f32 s12 , s12 , s12 - vmov.f32 s13 , s12 - vmov.f32 s14 , s12 - vmov.f32 s15 , s12 + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + vmov.f32 s10 , s8 + vmov.f32 s11 , s8 + vmov.f32 s12 , s8 + vmov.f32 s13 , s8 + vmov.f32 s14 , s8 + vmov.f32 s15 , s8 .endm -.macro KERNEL_F4X4 +.macro KERNEL_F8X8 pld [ XO , #X_PRE ] - KERNEL_F4X1 - KERNEL_F4X1 - KERNEL_F4X1 - KERNEL_F4X1 + KERNEL_F8X1 + KERNEL_F8X1 + KERNEL_F8X1 + KERNEL_F8X1 + + KERNEL_F8X1 + KERNEL_F8X1 + KERNEL_F8X1 + KERNEL_F8X1 .endm -.macro KERNEL_F4X1 +.macro KERNEL_F8X1 + pld [ AO2, #A_PRE ] fldmias XO! , { s2 } - fldmias AO1 , { s8 - s11 } + fldmias AO1 , { s4 - s7 } + + vmla.f32 s8 , s2 , s4 + vmla.f32 s9 , s2 , s5 + vmla.f32 s10 , s2 , s6 + vmla.f32 s11 , s2 , s7 + + add r3, AO1, #4*SIZE + + fldmias r3 , { s4 - s7 } + + vmla.f32 s12 , s2 , s4 + vmla.f32 s13 , s2 , s5 + vmla.f32 s14 , s2 , s6 + vmla.f32 s15 , s2 , s7 - vmla.f32 s12 , s2 , s8 - vmla.f32 s13 , s2 , s9 - vmla.f32 s14 , s2 , s10 - vmla.f32 s15 , s2 , s11 add AO1, AO1, LDA add AO2, AO2, LDA .endm -.macro SAVE_F4 +.macro SAVE_F8 + + fldmias YO, { s4 - s7 } + + vmla.f32 s4 , s0, s8 + vmla.f32 s5 , s0, s9 + vmla.f32 s6 , s0, s10 + vmla.f32 s7 , s0, s11 + + fstmias YO!, { s4 - s7 } + fldmias YO, { s4 - s7 } @@ -332,8 +393,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S4X4 + pld [ AO2 , #A_PRE ] KERNEL_S4X1 KERNEL_S4X1 + pld [ AO2 , #A_PRE ] KERNEL_S4X1 KERNEL_S4X1 @@ -342,7 +405,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_S4X1 - pld [ AO2 , #A_PRE ] fldmias XO , { s2 } fldmias AO1 , { s8 - s11 } @@ -471,27 +533,30 @@ gemvn_kernel_F4_BEGIN: ldr YO , Y ldr I, M - asrs I, I, #2 // I = M / 4 + asrs I, I, #3 // I = M / 8 ble gemvn_kernel_F1_BEGIN gemvn_kernel_F4X4: ldr AO1, A add AO2, AO1, LDA - add r3 , AO1, #4*SIZE + add r3 , AO1, #8*SIZE str r3 , A + add AO2, AO2, LDA + add AO2, AO2, LDA + ldr XO , X - INIT_F4 + INIT_F8 - asrs J, N, #2 // J = N / 4 + asrs J, N, #3 // J = N / 8 ble gemvn_kernel_F4X1 gemvn_kernel_F4X4_10: - KERNEL_F4X4 + KERNEL_F8X8 subs J, J, #1 bne gemvn_kernel_F4X4_10 @@ -499,12 +564,12 @@ gemvn_kernel_F4X4_10: gemvn_kernel_F4X1: - ands J, N , #3 + ands J, N , #7 ble gemvn_kernel_F4_END gemvn_kernel_F4X1_10: - KERNEL_F4X1 + KERNEL_F8X1 subs J, J, #1 bne gemvn_kernel_F4X1_10 @@ -512,7 +577,7 @@ gemvn_kernel_F4X1_10: gemvn_kernel_F4_END: - SAVE_F4 + SAVE_F8 subs I , I , #1 bne gemvn_kernel_F4X4 @@ -521,7 +586,7 @@ gemvn_kernel_F4_END: gemvn_kernel_F1_BEGIN: ldr I, M - ands I, I , #3 + ands I, I , #7 ble gemvn_kernel_L999 gemvn_kernel_F1X1: From 2d3c88429403f3302531352ce293583f37871bfc Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 29 Nov 2013 17:06:33 +0100 Subject: [PATCH 76/81] added complex gemv kernels for ARMV6 and ARMV7 --- kernel/arm/KERNEL.ARMV6 | 8 +- kernel/arm/KERNEL.ARMV7 | 16 +- kernel/arm/cgemv_n_vfp.S | 697 ++++++++++++++++++++++++++++++++++++++ kernel/arm/cgemv_t_vfp.S | 607 +++++++++++++++++++++++++++++++++ kernel/arm/zgemv_n_vfp.S | 699 +++++++++++++++++++++++++++++++++++++++ kernel/arm/zgemv_t_vfp.S | 608 ++++++++++++++++++++++++++++++++++ 6 files changed, 2623 insertions(+), 12 deletions(-) create mode 100644 kernel/arm/cgemv_n_vfp.S create mode 100644 kernel/arm/cgemv_t_vfp.S create mode 100644 kernel/arm/zgemv_n_vfp.S create mode 100644 kernel/arm/zgemv_t_vfp.S diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index e79ea65d1..f47a843f3 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -72,13 
+72,13 @@ ZSWAPKERNEL = swap_vfp.S SGEMVNKERNEL = gemv_n_vfp.S DGEMVNKERNEL = gemv_n_vfp.S -CGEMVNKERNEL = zgemv_n.c -ZGEMVNKERNEL = zgemv_n.c +CGEMVNKERNEL = cgemv_n_vfp.S +ZGEMVNKERNEL = zgemv_n_vfp.S SGEMVTKERNEL = gemv_t_vfp.S DGEMVTKERNEL = gemv_t_vfp.S -CGEMVTKERNEL = zgemv_t.c -ZGEMVTKERNEL = zgemv_t.c +CGEMVTKERNEL = cgemv_t_vfp.S +ZGEMVTKERNEL = zgemv_t_vfp.S STRMMKERNEL = strmm_kernel_4x2_vfp.S DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 143327074..507f9813c 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -70,15 +70,15 @@ DSCALKERNEL = scal_vfp.S CSCALKERNEL = scal_vfp.S ZSCALKERNEL = scal_vfp.S -SGEMVNKERNEL = gemv_n_vfpv3.S -DGEMVNKERNEL = gemv_n_vfpv3.S -CGEMVNKERNEL = zgemv_n.c -ZGEMVNKERNEL = zgemv_n.c +SGEMVNKERNEL = gemv_n_vfp.S +DGEMVNKERNEL = gemv_n_vfp.S +CGEMVNKERNEL = cgemv_n_vfp.S +ZGEMVNKERNEL = zgemv_n_vfp.S -SGEMVTKERNEL = gemv_t_vfpv3.S -DGEMVTKERNEL = gemv_t_vfpv3.S -CGEMVTKERNEL = zgemv_t.c -ZGEMVTKERNEL = zgemv_t.c +SGEMVTKERNEL = gemv_t_vfp.S +DGEMVTKERNEL = gemv_t_vfp.S +CGEMVTKERNEL = cgemv_t_vfp.S +ZGEMVTKERNEL = zgemv_t_vfp.S STRMMKERNEL = strmm_kernel_4x4_vfpv3.S DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S diff --git a/kernel/arm/cgemv_n_vfp.S b/kernel/arm/cgemv_n_vfp.S new file mode 100644 index 000000000..522c4c764 --- /dev/null +++ b/kernel/arm/cgemv_n_vfp.S @@ -0,0 +1,697 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/29 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#define OLD_A r3 +#define OLD_M r0 + +#define AO1 r0 +#define N r1 +#define J r2 + +#define AO2 r4 +#define XO r5 +#define YO r6 +#define LDA r7 +#define INC_X r8 +#define INC_Y r9 + +#define I r12 + +#define ALPHA_I [fp, #-236] +#define ALPHA_R [fp, #-244] + +#define M [fp, #-252 ] +#define A [fp, #-256 ] + + +#define X_PRE 64 +#define Y_PRE 0 +#define A_PRE 0 + +/**************************************************************************************/ + +#if !defined(CONJ) && !defined(XCONJ) + + #define KMAC_R fnmacs + #define KMAC_I fmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#elif defined(CONJ) && !defined(XCONJ) + + #define KMAC_R fmacs + #define KMAC_I fnmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#elif !defined(CONJ) && defined(XCONJ) + + #define KMAC_R fmacs + #define KMAC_I fnmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#else + + #define KMAC_R fnmacs + #define KMAC_I fmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#endif + +.macro INIT_F4 + + pld [ YO, #Y_PRE ] + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + vmov.f32 s14, s8 + vmov.f32 s15, s8 + +.endm + +.macro KERNEL_F4X4 + + pld [ XO, #X_PRE ] + KERNEL_F4X1 + KERNEL_F4X1 + 
KERNEL_F4X1 + KERNEL_F4X1 + +.endm + +.macro KERNEL_F4X1 + + pld [ AO2, #A_PRE ] + flds s0 , [ AO1 ] + flds s1 , [ AO1, #4 ] + flds s2 , [ AO1, #8 ] + flds s3 , [ AO1, #12 ] + + flds s4 , [ XO ] + flds s5 , [ XO, #4 ] + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + flds s0 , [ AO1, #16 ] + flds s1 , [ AO1, #20 ] + flds s2 , [ AO1, #24 ] + flds s3 , [ AO1, #28 ] + + fmacs s12 , s0, s4 + fmacs s13 , s0, s5 + fmacs s14 , s2, s4 + fmacs s15 , s2, s5 + + KMAC_R s12 , s1, s5 + KMAC_I s13 , s1, s4 + KMAC_R s14 , s3, s5 + KMAC_I s15 , s3, s4 + + add XO , XO, #8 + add AO1 , AO1, LDA + add AO2 , AO2, LDA + +.endm + +.macro SAVE_F4 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias YO, { s4 - s7 } + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + FMAC_R1 s6 , s0 , s10 + FMAC_I1 s7 , s0 , s11 + FMAC_R2 s6 , s1 , s11 + FMAC_I2 s7 , s1 , s10 + + fstmias YO!, { s4 - s7 } + + fldmias YO, { s4 - s7 } + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + FMAC_R1 s6 , s0 , s14 + FMAC_I1 s7 , s0 , s15 + FMAC_R2 s6 , s1 , s15 + FMAC_I2 s7 , s1 , s14 + + fstmias YO!, { s4 - s7 } + +.endm + + + + +.macro INIT_F1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + +.endm + +.macro KERNEL_F1X1 + + flds s0 , [ AO1 ] + flds s1 , [ AO1, #4 ] + + flds s4 , [ XO ] + flds s5 , [ XO, #4 ] + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + + add XO , XO, #8 + add AO1 , AO1, LDA + + +.endm + +.macro SAVE_F1 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias YO, { s4 - s5 } + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + fstmias YO, { s4 - s5 } + + add YO, YO, #8 + +.endm + +/****************************************************************************************/ + +.macro INIT_S4 + + vsub.f32 s8 
, s8 , s8 + vmov.f32 s9 , s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + vmov.f32 s14, s8 + vmov.f32 s15, s8 + +.endm + +.macro KERNEL_S4X4 + + KERNEL_S4X1 + KERNEL_S4X1 + KERNEL_S4X1 + KERNEL_S4X1 + +.endm + +.macro KERNEL_S4X1 + + flds s0 , [ AO1 ] + flds s1 , [ AO1, #4 ] + flds s2 , [ AO1, #8 ] + flds s3 , [ AO1, #12 ] + + flds s4 , [ XO ] + flds s5 , [ XO, #4 ] + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + flds s0 , [ AO1, #16 ] + flds s1 , [ AO1, #20 ] + flds s2 , [ AO1, #24 ] + flds s3 , [ AO1, #28 ] + + fmacs s12 , s0, s4 + fmacs s13 , s0, s5 + fmacs s14 , s2, s4 + fmacs s15 , s2, s5 + + KMAC_R s12 , s1, s5 + KMAC_I s13 , s1, s4 + KMAC_R s14 , s3, s5 + KMAC_I s15 , s3, s4 + + add XO , XO, INC_X + add AO1 , AO1, LDA + add AO2 , AO2, LDA + +.endm + +.macro SAVE_S4 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias YO, { s4 - s5 } + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + fstmias YO, { s4 - s5 } + + add YO, YO, INC_Y + + fldmias YO, { s6 - s7 } + + FMAC_R1 s6 , s0 , s10 + FMAC_I1 s7 , s0 , s11 + FMAC_R2 s6 , s1 , s11 + FMAC_I2 s7 , s1 , s10 + + fstmias YO, { s6 - s7 } + + add YO, YO, INC_Y + + fldmias YO, { s4 - s5 } + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + fstmias YO, { s4 - s5 } + + add YO, YO, INC_Y + + fldmias YO, { s6 - s7 } + + FMAC_R1 s6 , s0 , s14 + FMAC_I1 s7 , s0 , s15 + FMAC_R2 s6 , s1 , s15 + FMAC_I2 s7 , s1 , s14 + + fstmias YO, { s6 - s7 } + + add YO, YO, INC_Y + +.endm + + + + +.macro INIT_S1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + +.endm + +.macro KERNEL_S1X1 + + flds s0 , [ AO1 ] + flds s1 , [ AO1, #4 ] + + flds s4 , [ XO ] + flds s5 , [ XO, #4 ] + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + + add XO , XO, INC_X + add 
AO1 , AO1, LDA + + +.endm + +.macro SAVE_S1 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias YO, { s4 - s5 } + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + fstmias YO, { s4 - s5 } + + add YO, YO, INC_Y + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4 - r9 , fp} + add fp, sp, #28 + sub sp, sp, #STACKSIZE // reserve stack + + sub r12, fp, #192 + +#if defined(DOUBLE) + vstm r12, { d8 - d15 } // store floating point registers +#else + vstm r12, { s8 - s15 } // store floating point registers +#endif + + cmp OLD_M, #0 + ble cgemvn_kernel_L999 + + cmp N, #0 + ble cgemvn_kernel_L999 + + str OLD_A, A + str OLD_M, M + vstr s0 , ALPHA_R + vstr s1 , ALPHA_I + + + ldr INC_X , OLD_INC_X + ldr INC_Y , OLD_INC_Y + + cmp INC_X, #0 + beq cgemvn_kernel_L999 + + cmp INC_Y, #0 + beq cgemvn_kernel_L999 + + ldr LDA, OLD_LDA + + +#if defined(DOUBLE) + lsl LDA, LDA, #4 // LDA * SIZE * 2 +#else + lsl LDA, LDA, #3 // LDA * SIZE * 2 +#endif + + cmp INC_X, #1 + bne cgemvn_kernel_S4_BEGIN + + cmp INC_Y, #1 + bne cgemvn_kernel_S4_BEGIN + + +cgemvn_kernel_F4_BEGIN: + + ldr YO , Y + + ldr I, M + asrs I, I, #2 // I = M / 4 + ble cgemvn_kernel_F1_BEGIN + +cgemvn_kernel_F4X4: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO1, #32 + str r3 , A + + add AO2, AO2, LDA + add AO2, AO2, LDA + + ldr XO , X + + INIT_F4 + + asrs J, N, #2 // J = N / 4 + ble cgemvn_kernel_F4X1 + + +cgemvn_kernel_F4X4_10: + + KERNEL_F4X4 + + subs J, J, #1 + bne cgemvn_kernel_F4X4_10 + + +cgemvn_kernel_F4X1: + + ands J, N , #3 + ble cgemvn_kernel_F4_END + +cgemvn_kernel_F4X1_10: + + KERNEL_F4X1 + + subs J, J, #1 + bne cgemvn_kernel_F4X1_10 + + +cgemvn_kernel_F4_END: + + SAVE_F4 + + subs I , I , #1 + bne cgemvn_kernel_F4X4 + + +cgemvn_kernel_F1_BEGIN: + + ldr I, M + ands I, 
I , #3 + ble cgemvn_kernel_L999 + +cgemvn_kernel_F1X1: + + ldr AO1, A + add r3, AO1, #8 + str r3, A + + ldr XO , X + + INIT_F1 + + mov J, N + + +cgemvn_kernel_F1X1_10: + + KERNEL_F1X1 + + subs J, J, #1 + bne cgemvn_kernel_F1X1_10 + + +cgemvn_kernel_F1_END: + + SAVE_F1 + + subs I , I , #1 + bne cgemvn_kernel_F1X1 + + b cgemvn_kernel_L999 + + + +/*************************************************************************************************************/ + +cgemvn_kernel_S4_BEGIN: + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 +#endif + + ldr YO , Y + + ldr I, M + asrs I, I, #2 // I = M / 4 + ble cgemvn_kernel_S1_BEGIN + +cgemvn_kernel_S4X4: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO1, #32 + str r3 , A + + ldr XO , X + + INIT_S4 + + asrs J, N, #2 // J = N / 4 + ble cgemvn_kernel_S4X1 + + +cgemvn_kernel_S4X4_10: + + KERNEL_S4X4 + + subs J, J, #1 + bne cgemvn_kernel_S4X4_10 + + +cgemvn_kernel_S4X1: + + ands J, N , #3 + ble cgemvn_kernel_S4_END + +cgemvn_kernel_S4X1_10: + + KERNEL_S4X1 + + subs J, J, #1 + bne cgemvn_kernel_S4X1_10 + + +cgemvn_kernel_S4_END: + + SAVE_S4 + + subs I , I , #1 + bne cgemvn_kernel_S4X4 + + +cgemvn_kernel_S1_BEGIN: + + ldr I, M + ands I, I , #3 + ble cgemvn_kernel_L999 + +cgemvn_kernel_S1X1: + + ldr AO1, A + add r3, AO1, #8 + str r3, A + + ldr XO , X + + INIT_S1 + + mov J, N + + +cgemvn_kernel_S1X1_10: + + KERNEL_S1X1 + + subs J, J, #1 + bne cgemvn_kernel_S1X1_10 + + +cgemvn_kernel_S1_END: + + SAVE_S1 + + subs I , I , #1 + bne cgemvn_kernel_S1X1 + + +/*************************************************************************************************************/ + +cgemvn_kernel_L999: + + sub r3, fp, #192 + +#if defined(DOUBLE) + vldm r3, { d8 - d15 } // restore floating point registers +#else + vldm r3, { s8 - s15 } // restore floating point registers +#endif + + mov r0, #0 // set 
return value + + sub sp, fp, #28 + pop {r4 -r9 ,fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/cgemv_t_vfp.S b/kernel/arm/cgemv_t_vfp.S new file mode 100644 index 000000000..52276a06f --- /dev/null +++ b/kernel/arm/cgemv_t_vfp.S @@ -0,0 +1,607 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/29 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#define OLD_A r3 +#define OLD_N r1 + +#define M r0 +#define AO1 r1 +#define J r2 + +#define AO2 r4 +#define XO r5 +#define YO r6 +#define LDA r7 +#define INC_X r8 +#define INC_Y r9 + +#define I r12 + +#define N [fp, #-252 ] +#define A [fp, #-256 ] + + +#define X_PRE 512 +#define A_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +#if !defined(CONJ) && !defined(XCONJ) + + #define KMAC_R fnmacs + #define KMAC_I fmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#elif defined(CONJ) && !defined(XCONJ) + + #define KMAC_R fmacs + #define KMAC_I fnmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#elif !defined(CONJ) && defined(XCONJ) + + #define KMAC_R fmacs + #define KMAC_I fnmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#else + + #define KMAC_R fnmacs + #define KMAC_I fmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#endif + + + +.macro INIT_F2 + + vsub.f32 s12, s12, s12 + vsub.f32 s13, s13, s13 + vsub.f32 s14, s14, s14 + vsub.f32 s15, s15, s15 + +.endm + +.macro KERNEL_F2X4 + + KERNEL_F2X1 + KERNEL_F2X1 + KERNEL_F2X1 + KERNEL_F2X1 + +.endm + +.macro KERNEL_F2X1 + + 
fldmias XO! , { s2 - s3 } + fldmias AO1!, { s4 - s5 } + fldmias AO2!, { s8 - s9 } + + fmacs s12 , s4 , s2 + fmacs s13 , s4 , s3 + KMAC_R s12 , s5 , s3 + KMAC_I s13 , s5 , s2 + + fmacs s14 , s8 , s2 + fmacs s15 , s8 , s3 + KMAC_R s14 , s9 , s3 + KMAC_I s15 , s9 , s2 + +.endm + +.macro SAVE_F2 + + fldmias YO, { s4 - s7 } + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + FMAC_R1 s6 , s0 , s14 + FMAC_I1 s7 , s0 , s15 + FMAC_R2 s6 , s1 , s15 + FMAC_I2 s7 , s1 , s14 + + fstmias YO!, { s4 - s7 } + +.endm + +/************************************************************************************************/ + +.macro INIT_F1 + + vsub.f32 s12, s12, s12 + vsub.f32 s13, s13, s13 + +.endm + +.macro KERNEL_F1X4 + + KERNEL_F1X1 + KERNEL_F1X1 + KERNEL_F1X1 + KERNEL_F1X1 + +.endm + +.macro KERNEL_F1X1 + + fldmias XO! , { s2 - s3 } + fldmias AO1!, { s4 - s5 } + + fmacs s12 , s4 , s2 + fmacs s13 , s4 , s3 + KMAC_R s12 , s5 , s3 + KMAC_I s13 , s5 , s2 + +.endm + +.macro SAVE_F1 + + fldmias YO, { s4 - s5 } + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + fstmias YO!, { s4 - s5 } + +.endm + +/************************************************************************************************/ + +.macro INIT_S2 + + vsub.f32 s12, s12, s12 + vsub.f32 s13, s13, s13 + vsub.f32 s14, s14, s14 + vsub.f32 s15, s15, s15 + +.endm + +.macro KERNEL_S2X4 + + KERNEL_S2X1 + KERNEL_S2X1 + KERNEL_S2X1 + KERNEL_S2X1 + +.endm + +.macro KERNEL_S2X1 + + fldmias XO , { s2 - s3 } + fldmias AO1!, { s4 - s5 } + fldmias AO2!, { s8 - s9 } + + fmacs s12 , s4 , s2 + fmacs s13 , s4 , s3 + KMAC_R s12 , s5 , s3 + KMAC_I s13 , s5 , s2 + + fmacs s14 , s8 , s2 + fmacs s15 , s8 , s3 + KMAC_R s14 , s9 , s3 + KMAC_I s15 , s9 , s2 + + add XO, XO, INC_X + +.endm + +.macro SAVE_S2 + + fldmias YO, { s4 - s5 } + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + fstmias YO, { s4 - s5 
} + + add YO, YO, INC_Y + + fldmias YO, { s6 - s7 } + + FMAC_R1 s6 , s0 , s14 + FMAC_I1 s7 , s0 , s15 + FMAC_R2 s6 , s1 , s15 + FMAC_I2 s7 , s1 , s14 + + fstmias YO, { s6 - s7 } + + add YO, YO, INC_Y + +.endm + +/************************************************************************************************/ + +.macro INIT_S1 + + vsub.f32 s12, s12, s12 + vsub.f32 s13, s13, s13 + +.endm + +.macro KERNEL_S1X4 + + KERNEL_S1X1 + KERNEL_S1X1 + KERNEL_S1X1 + KERNEL_S1X1 + +.endm + +.macro KERNEL_S1X1 + + fldmias XO , { s2 - s3 } + fldmias AO1!, { s4 - s5 } + + fmacs s12 , s4 , s2 + fmacs s13 , s4 , s3 + KMAC_R s12 , s5 , s3 + KMAC_I s13 , s5 , s2 + + add XO, XO, INC_X + +.endm + +.macro SAVE_S1 + + fldmias YO, { s4 - s5 } + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + fstmias YO, { s4 - s5 } + + add YO, YO, INC_Y + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4 - r9 , fp} + add fp, sp, #28 + sub sp, sp, #STACKSIZE // reserve stack + + sub r12, fp, #192 + +#if defined(DOUBLE) + vstm r12, { d8 - d15 } // store floating point registers +#else + vstm r12, { s8 - s15 } // store floating point registers +#endif + + cmp M, #0 + ble cgemvt_kernel_L999 + + cmp OLD_N, #0 + ble cgemvt_kernel_L999 + + str OLD_A, A + str OLD_N, N + + ldr INC_X , OLD_INC_X + ldr INC_Y , OLD_INC_Y + + cmp INC_X, #0 + beq cgemvt_kernel_L999 + + cmp INC_Y, #0 + beq cgemvt_kernel_L999 + + ldr LDA, OLD_LDA + + +#if defined(DOUBLE) + lsl LDA, LDA, #4 // LDA * SIZE * 2 +#else + lsl LDA, LDA, #3 // LDA * SIZE * 2 +#endif + + cmp INC_X, #1 + bne cgemvt_kernel_S2_BEGIN + + cmp INC_Y, #1 + bne cgemvt_kernel_S2_BEGIN + + +cgemvt_kernel_F2_BEGIN: + + ldr YO , Y + + ldr J, N + asrs J, J, #1 // J = N / 2 + ble cgemvt_kernel_F1_BEGIN + +cgemvt_kernel_F2X4: + + ldr
AO1, A + add AO2, AO1, LDA + add r3 , AO2, LDA + str r3 , A + + ldr XO , X + + INIT_F2 + + asrs I, M, #2 // I = M / 4 + ble cgemvt_kernel_F2X1 + + +cgemvt_kernel_F2X4_10: + + KERNEL_F2X4 + + subs I, I, #1 + bne cgemvt_kernel_F2X4_10 + + +cgemvt_kernel_F2X1: + + ands I, M , #3 + ble cgemvt_kernel_F2_END + +cgemvt_kernel_F2X1_10: + + KERNEL_F2X1 + + subs I, I, #1 + bne cgemvt_kernel_F2X1_10 + + +cgemvt_kernel_F2_END: + + SAVE_F2 + + subs J , J , #1 + bne cgemvt_kernel_F2X4 + + +cgemvt_kernel_F1_BEGIN: + + ldr J, N + ands J, J, #1 + ble cgemvt_kernel_L999 + +cgemvt_kernel_F1X4: + + ldr AO1, A + + ldr XO , X + + INIT_F1 + + asrs I, M, #2 // I = M / 4 + ble cgemvt_kernel_F1X1 + + +cgemvt_kernel_F1X4_10: + + KERNEL_F1X4 + + subs I, I, #1 + bne cgemvt_kernel_F1X4_10 + + +cgemvt_kernel_F1X1: + + ands I, M , #3 + ble cgemvt_kernel_F1_END + +cgemvt_kernel_F1X1_10: + + KERNEL_F1X1 + + subs I, I, #1 + bne cgemvt_kernel_F1X1_10 + + +cgemvt_kernel_F1_END: + + SAVE_F1 + + b cgemvt_kernel_L999 + + + +/*************************************************************************************************************/ + +cgemvt_kernel_S2_BEGIN: + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 +#endif + + ldr YO , Y + + ldr J, N + asrs J, J, #1 // J = N / 2 + ble cgemvt_kernel_S1_BEGIN + +cgemvt_kernel_S2X4: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO2, LDA + str r3 , A + + ldr XO , X + + INIT_S2 + + asrs I, M, #2 // I = M / 4 + ble cgemvt_kernel_S2X1 + + +cgemvt_kernel_S2X4_10: + + KERNEL_S2X4 + + subs I, I, #1 + bne cgemvt_kernel_S2X4_10 + + +cgemvt_kernel_S2X1: + + ands I, M , #3 + ble cgemvt_kernel_S2_END + +cgemvt_kernel_S2X1_10: + + KERNEL_S2X1 + + subs I, I, #1 + bne cgemvt_kernel_S2X1_10 + + +cgemvt_kernel_S2_END: + + SAVE_S2 + + subs J , J , #1 + bne cgemvt_kernel_S2X4 + + +cgemvt_kernel_S1_BEGIN: + + ldr J, N + ands J, J, #1 + ble
cgemvt_kernel_L999 + +cgemvt_kernel_S1X4: + + ldr AO1, A + + ldr XO , X + + INIT_S1 + + asrs I, M, #2 // I = M / 4 + ble cgemvt_kernel_S1X1 + + +cgemvt_kernel_S1X4_10: + + KERNEL_S1X4 + + subs I, I, #1 + bne cgemvt_kernel_S1X4_10 + + +cgemvt_kernel_S1X1: + + ands I, M , #3 + ble cgemvt_kernel_S1_END + +cgemvt_kernel_S1X1_10: + + KERNEL_S1X1 + + subs I, I, #1 + bne cgemvt_kernel_S1X1_10 + + +cgemvt_kernel_S1_END: + + SAVE_S1 + + + +/*************************************************************************************************************/ + +cgemvt_kernel_L999: + + sub r3, fp, #192 + +#if defined(DOUBLE) + vldm r3, { d8 - d15 } // restore floating point registers +#else + vldm r3, { s8 - s15 } // restore floating point registers +#endif + + mov r0, #0 // set return value + + sub sp, fp, #28 + pop {r4 -r9 ,fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/zgemv_n_vfp.S b/kernel/arm/zgemv_n_vfp.S new file mode 100644 index 000000000..3b51d5553 --- /dev/null +++ b/kernel/arm/zgemv_n_vfp.S @@ -0,0 +1,699 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/29 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#define OLD_A r3 +#define OLD_M r0 + +#define AO1 r0 +#define N r1 +#define J r2 + +#define AO2 r4 +#define XO r5 +#define YO r6 +#define LDA r7 +#define INC_X r8 +#define INC_Y r9 + +#define I r12 + +#define ALPHA_I [fp, #-236] +#define ALPHA_R [fp, #-244] + +#define M [fp, #-252 ] +#define A [fp, #-256 ] + + +#define X_PRE 64 +#define Y_PRE 0 +#define A_PRE 0 + +/**************************************************************************************/ + +#if !defined(CONJ) && !defined(XCONJ) + + #define KMAC_R fnmacd + #define KMAC_I fmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fmacd + +#elif defined(CONJ) && !defined(XCONJ) + + #define KMAC_R 
fmacd + #define KMAC_I fnmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fmacd + +#elif !defined(CONJ) && defined(XCONJ) + + #define KMAC_R fmacd + #define KMAC_I fnmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fmacd + +#else + + #define KMAC_R fnmacd + #define KMAC_I fmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fmacd + +#endif + +.macro INIT_F4 + + pld [ YO, #Y_PRE ] + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + vmov.f64 d14, d8 + vmov.f64 d15, d8 + +.endm + +.macro KERNEL_F4X4 + + pld [ XO, #X_PRE ] + KERNEL_F4X1 + KERNEL_F4X1 + pld [ XO, #X_PRE ] + KERNEL_F4X1 + KERNEL_F4X1 + +.endm + +.macro KERNEL_F4X1 + + fldd d0 , [ AO1 ] + + fldd d4 , [ XO ] + fldd d5 , [ XO, #8 ] + + pld [ AO2, #A_PRE ] + + fldd d1 , [ AO1, #8 ] + fmacd d8 , d0, d4 + fldd d2 , [ AO1, #16 ] + fmacd d9 , d0, d5 + fldd d3 , [ AO1, #24 ] + fmacd d10 , d2, d4 + fldd d0 , [ AO1, #32 ] + fmacd d11 , d2, d5 + + KMAC_R d8 , d1, d5 + KMAC_I d9 , d1, d4 + KMAC_R d10 , d3, d5 + fldd d1 , [ AO1, #40 ] + KMAC_I d11 , d3, d4 + + fldd d2 , [ AO1, #48 ] + + fmacd d12 , d0, d4 + fldd d3 , [ AO1, #56 ] + fmacd d13 , d0, d5 + pld [ AO2, #A_PRE+32 ] + fmacd d14 , d2, d4 + fmacd d15 , d2, d5 + + KMAC_R d12 , d1, d5 + add XO , XO, #16 + KMAC_I d13 , d1, d4 + add AO1 , AO1, LDA + KMAC_R d14 , d3, d5 + add AO2 , AO2, LDA + KMAC_I d15 , d3, d4 + +.endm + +.macro SAVE_F4 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad YO, { d4 - d7 } + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + FMAC_R1 d6 , d0 , d10 + FMAC_I1 d7 , d0 , d11 + FMAC_R2 d6 , d1 , d11 + FMAC_I2 d7 , d1 , d10 + + fstmiad YO!, { d4 - d7 } + + fldmiad YO, { d4 - d7 } + + FMAC_R1 d4 , d0 , d12 + FMAC_I1 d5 , d0 , d13 + FMAC_R2 d4 , d1 , d13 + FMAC_I2 d5 , d1 , d12 + + FMAC_R1 
d6 , d0 , d14 + FMAC_I1 d7 , d0 , d15 + FMAC_R2 d6 , d1 , d15 + FMAC_I2 d7 , d1 , d14 + + fstmiad YO!, { d4 - d7 } + +.endm + + + + +.macro INIT_F1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + +.endm + +.macro KERNEL_F1X1 + + fldd d0 , [ AO1 ] + fldd d1 , [ AO1, #8 ] + + fldd d4 , [ XO ] + fldd d5 , [ XO, #8 ] + + fmacd d8 , d0, d4 + fmacd d9 , d0, d5 + + KMAC_R d8 , d1, d5 + KMAC_I d9 , d1, d4 + + add XO , XO, #16 + add AO1 , AO1, LDA + + +.endm + +.macro SAVE_F1 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad YO, { d4 - d5 } + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + fstmiad YO, { d4 - d5 } + + add YO, YO, #16 + +.endm + +/****************************************************************************************/ + +.macro INIT_S4 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + vmov.f64 d14, d8 + vmov.f64 d15, d8 + +.endm + +.macro KERNEL_S4X4 + + KERNEL_S4X1 + KERNEL_S4X1 + KERNEL_S4X1 + KERNEL_S4X1 + +.endm + +.macro KERNEL_S4X1 + + fldd d0 , [ AO1 ] + fldd d1 , [ AO1, #8 ] + fldd d2 , [ AO1, #16 ] + fldd d3 , [ AO1, #24 ] + + fldd d4 , [ XO ] + fldd d5 , [ XO, #8 ] + + fmacd d8 , d0, d4 + fmacd d9 , d0, d5 + fmacd d10 , d2, d4 + fmacd d11 , d2, d5 + + KMAC_R d8 , d1, d5 + KMAC_I d9 , d1, d4 + KMAC_R d10 , d3, d5 + KMAC_I d11 , d3, d4 + + fldd d0 , [ AO1, #32 ] + fldd d1 , [ AO1, #40 ] + fldd d2 , [ AO1, #48 ] + fldd d3 , [ AO1, #56 ] + + fmacd d12 , d0, d4 + fmacd d13 , d0, d5 + fmacd d14 , d2, d4 + fmacd d15 , d2, d5 + + KMAC_R d12 , d1, d5 + KMAC_I d13 , d1, d4 + KMAC_R d14 , d3, d5 + KMAC_I d15 , d3, d4 + + add XO , XO, INC_X + add AO1 , AO1, LDA + add AO2 , AO2, LDA + +.endm + +.macro SAVE_S4 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad YO, { d4 - d5 } + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + fstmiad YO, { d4 - d5 } + + add YO, YO, INC_Y + + fldmiad YO, { d6 - 
d7 } + + FMAC_R1 d6 , d0 , d10 + FMAC_I1 d7 , d0 , d11 + FMAC_R2 d6 , d1 , d11 + FMAC_I2 d7 , d1 , d10 + + fstmiad YO, { d6 - d7 } + + add YO, YO, INC_Y + + fldmiad YO, { d4 - d5 } + + FMAC_R1 d4 , d0 , d12 + FMAC_I1 d5 , d0 , d13 + FMAC_R2 d4 , d1 , d13 + FMAC_I2 d5 , d1 , d12 + + fstmiad YO, { d4 - d5 } + + add YO, YO, INC_Y + + fldmiad YO, { d6 - d7 } + + FMAC_R1 d6 , d0 , d14 + FMAC_I1 d7 , d0 , d15 + FMAC_R2 d6 , d1 , d15 + FMAC_I2 d7 , d1 , d14 + + fstmiad YO, { d6 - d7 } + + add YO, YO, INC_Y + +.endm + + + + +.macro INIT_S1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + +.endm + +.macro KERNEL_S1X1 + + fldd d0 , [ AO1 ] + fldd d1 , [ AO1, #8 ] + + fldd d4 , [ XO ] + fldd d5 , [ XO, #8 ] + + fmacd d8 , d0, d4 + fmacd d9 , d0, d5 + + KMAC_R d8 , d1, d5 + KMAC_I d9 , d1, d4 + + add XO , XO, INC_X + add AO1 , AO1, LDA + + +.endm + +.macro SAVE_S1 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad YO, { d4 - d5 } + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + fstmiad YO, { d4 - d5 } + + add YO, YO, INC_Y + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4 - r9 , fp} + add fp, sp, #28 + sub sp, sp, #STACKSIZE // reserve stack + + sub r12, fp, #192 + +#if defined(DOUBLE) + vstm r12, { d8 - d15 } // store floating point registers +#else + vstm r12, { s8 - s15 } // store floating point registers +#endif + + cmp OLD_M, #0 + ble zgemvn_kernel_L999 + + cmp N, #0 + ble zgemvn_kernel_L999 + + str OLD_A, A + str OLD_M, M + vstr d0 , ALPHA_R + vstr d1 , ALPHA_I + + + ldr INC_X , OLD_INC_X + ldr INC_Y , OLD_INC_Y + + cmp INC_X, #0 + beq zgemvn_kernel_L999 + + cmp INC_Y, #0 + beq zgemvn_kernel_L999 + + ldr LDA, OLD_LDA + + +#if defined(DOUBLE) + lsl LDA, LDA, #4 // LDA * SIZE * 2 +#else + lsl LDA, LDA, #3 // LDA * SIZE * 
2 +#endif + + cmp INC_X, #1 + bne zgemvn_kernel_S4_BEGIN + + cmp INC_Y, #1 + bne zgemvn_kernel_S4_BEGIN + + +zgemvn_kernel_F4_BEGIN: + + ldr YO , Y + + ldr I, M + asrs I, I, #2 // I = M / 4 + ble zgemvn_kernel_F1_BEGIN + +zgemvn_kernel_F4X4: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO1, #64 + str r3 , A + + add AO2, AO2, LDA + add AO2, AO2, LDA + + ldr XO , X + + INIT_F4 + + asrs J, N, #2 // J = N / 4 + ble zgemvn_kernel_F4X1 + + +zgemvn_kernel_F4X4_10: + + KERNEL_F4X4 + + subs J, J, #1 + bne zgemvn_kernel_F4X4_10 + + +zgemvn_kernel_F4X1: + + ands J, N , #3 + ble zgemvn_kernel_F4_END + +zgemvn_kernel_F4X1_10: + + KERNEL_F4X1 + + subs J, J, #1 + bne zgemvn_kernel_F4X1_10 + + +zgemvn_kernel_F4_END: + + SAVE_F4 + + subs I , I , #1 + bne zgemvn_kernel_F4X4 + + +zgemvn_kernel_F1_BEGIN: + + ldr I, M + ands I, I , #3 + ble zgemvn_kernel_L999 + +zgemvn_kernel_F1X1: + + ldr AO1, A + add r3, AO1, #16 + str r3, A + + ldr XO , X + + INIT_F1 + + mov J, N + + +zgemvn_kernel_F1X1_10: + + KERNEL_F1X1 + + subs J, J, #1 + bne zgemvn_kernel_F1X1_10 + + +zgemvn_kernel_F1_END: + + SAVE_F1 + + subs I , I , #1 + bne zgemvn_kernel_F1X1 + + b zgemvn_kernel_L999 + + + +/*************************************************************************************************************/ + +zgemvn_kernel_S4_BEGIN: + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 +#endif + + ldr YO , Y + + ldr I, M + asrs I, I, #2 // I = M / 4 + ble zgemvn_kernel_S1_BEGIN + +zgemvn_kernel_S4X4: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO1, #64 + str r3 , A + + ldr XO , X + + INIT_S4 + + asrs J, N, #2 // J = N / 4 + ble zgemvn_kernel_S4X1 + + +zgemvn_kernel_S4X4_10: + + KERNEL_S4X4 + + subs J, J, #1 + bne zgemvn_kernel_S4X4_10 + + +zgemvn_kernel_S4X1: + + ands J, N , #3 + ble zgemvn_kernel_S4_END + +zgemvn_kernel_S4X1_10: + + KERNEL_S4X1 + + subs J, J, #1 + 
bne zgemvn_kernel_S4X1_10 + + +zgemvn_kernel_S4_END: + + SAVE_S4 + + subs I , I , #1 + bne zgemvn_kernel_S4X4 + + +zgemvn_kernel_S1_BEGIN: + + ldr I, M + ands I, I , #3 + ble zgemvn_kernel_L999 + +zgemvn_kernel_S1X1: + + ldr AO1, A + add r3, AO1, #16 + str r3, A + + ldr XO , X + + INIT_S1 + + mov J, N + + +zgemvn_kernel_S1X1_10: + + KERNEL_S1X1 + + subs J, J, #1 + bne zgemvn_kernel_S1X1_10 + + +zgemvn_kernel_S1_END: + + SAVE_S1 + + subs I , I , #1 + bne zgemvn_kernel_S1X1 + + +/*************************************************************************************************************/ + +zgemvn_kernel_L999: + + sub r3, fp, #192 + +#if defined(DOUBLE) + vldm r3, { d8 - d15 } // restore floating point registers +#else + vldm r3, { s8 - s15 } // restore floating point registers +#endif + + mov r0, #0 // set return value + + sub sp, fp, #28 + pop {r4 -r9 ,fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/zgemv_t_vfp.S b/kernel/arm/zgemv_t_vfp.S new file mode 100644 index 000000000..500a3b608 --- /dev/null +++ b/kernel/arm/zgemv_t_vfp.S @@ -0,0 +1,608 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/29 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#define OLD_A r3 +#define OLD_N r1 + +#define M r0 +#define AO1 r1 +#define J r2 + +#define AO2 r4 +#define XO r5 +#define YO r6 +#define LDA r7 +#define INC_X r8 +#define INC_Y r9 + +#define I r12 + +#define N [fp, #-252 ] +#define A [fp, #-256 ] + + +#define X_PRE 512 +#define A_PRE 512 +#define Y_PRE 32 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +#if !defined(CONJ) && !defined(XCONJ) + + #define KMAC_R fnmacd + #define KMAC_I fmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fmacd + +#elif 
defined(CONJ) && !defined(XCONJ) + + #define KMAC_R fmacd + #define KMAC_I fnmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fmacd + +#elif !defined(CONJ) && defined(XCONJ) + + #define KMAC_R fmacd + #define KMAC_I fnmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fmacd + +#else + + #define KMAC_R fnmacd + #define KMAC_I fmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fmacd + +#endif + + + +.macro INIT_F2 + + vsub.f64 d12, d12, d12 + vsub.f64 d13, d13, d13 + vsub.f64 d14, d14, d14 + vsub.f64 d15, d15, d15 + +.endm + +.macro KERNEL_F2X4 + + KERNEL_F2X1 + KERNEL_F2X1 + KERNEL_F2X1 + KERNEL_F2X1 + +.endm + +.macro KERNEL_F2X1 + + fldmiad XO! , { d2 - d3 } + fldmiad AO1!, { d4 - d5 } + + fmacd d12 , d4 , d2 + fmacd d13 , d4 , d3 + fldmiad AO2!, { d8 - d9 } + KMAC_R d12 , d5 , d3 + KMAC_I d13 , d5 , d2 + + fmacd d14 , d8 , d2 + fmacd d15 , d8 , d3 + KMAC_R d14 , d9 , d3 + KMAC_I d15 , d9 , d2 + +.endm + +.macro SAVE_F2 + + fldmiad YO, { d4 - d7 } + + FMAC_R1 d4 , d0 , d12 + FMAC_I1 d5 , d0 , d13 + FMAC_R2 d4 , d1 , d13 + FMAC_I2 d5 , d1 , d12 + + FMAC_R1 d6 , d0 , d14 + FMAC_I1 d7 , d0 , d15 + FMAC_R2 d6 , d1 , d15 + FMAC_I2 d7 , d1 , d14 + + fstmiad YO!, { d4 - d7 } + +.endm + +/************************************************************************************************/ + +.macro INIT_F1 + + vsub.f64 d12, d12, d12 + vsub.f64 d13, d13, d13 + +.endm + +.macro KERNEL_F1X4 + + KERNEL_F1X1 + KERNEL_F1X1 + KERNEL_F1X1 + KERNEL_F1X1 + +.endm + +.macro KERNEL_F1X1 + + fldmiad XO! 
, { d2 - d3 } + fldmiad AO1!, { d4 - d5 } + + fmacd d12 , d4 , d2 + fmacd d13 , d4 , d3 + KMAC_R d12 , d5 , d3 + KMAC_I d13 , d5 , d2 + +.endm + +.macro SAVE_F1 + + fldmiad YO, { d4 - d5 } + + FMAC_R1 d4 , d0 , d12 + FMAC_I1 d5 , d0 , d13 + FMAC_R2 d4 , d1 , d13 + FMAC_I2 d5 , d1 , d12 + + fstmiad YO!, { d4 - d5 } + +.endm + +/************************************************************************************************/ + +.macro INIT_S2 + + vsub.f64 d12, d12, d12 + vsub.f64 d13, d13, d13 + vsub.f64 d14, d14, d14 + vsub.f64 d15, d15, d15 + +.endm + +.macro KERNEL_S2X4 + + KERNEL_S2X1 + KERNEL_S2X1 + KERNEL_S2X1 + KERNEL_S2X1 + +.endm + +.macro KERNEL_S2X1 + + fldmiad XO , { d2 - d3 } + fldmiad AO1!, { d4 - d5 } + fldmiad AO2!, { d8 - d9 } + + fmacd d12 , d4 , d2 + fmacd d13 , d4 , d3 + KMAC_R d12 , d5 , d3 + KMAC_I d13 , d5 , d2 + + fmacd d14 , d8 , d2 + fmacd d15 , d8 , d3 + KMAC_R d14 , d9 , d3 + KMAC_I d15 , d9 , d2 + + add XO, XO, INC_X + +.endm + +.macro SAVE_S2 + + fldmiad YO, { d4 - d5 } + + FMAC_R1 d4 , d0 , d12 + FMAC_I1 d5 , d0 , d13 + FMAC_R2 d4 , d1 , d13 + FMAC_I2 d5 , d1 , d12 + + fstmiad YO, { d4 - d5 } + + add YO, YO, INC_Y + + fldmiad YO, { d6 - d7 } + + FMAC_R1 d6 , d0 , d14 + FMAC_I1 d7 , d0 , d15 + FMAC_R2 d6 , d1 , d15 + FMAC_I2 d7 , d1 , d14 + + fstmiad YO, { d6 - d7 } + + add YO, YO, INC_Y + +.endm + +/************************************************************************************************/ + +.macro INIT_S1 + + vsub.f64 d12, d12, d12 + vsub.f64 d13, d13, d13 + +.endm + +.macro KERNEL_S1X4 + + KERNEL_S1X1 + KERNEL_S1X1 + KERNEL_S1X1 + KERNEL_S1X1 + +.endm + +.macro KERNEL_S1X1 + + fldmiad XO , { d2 - d3 } + fldmiad AO1!, { d4 - d5 } + + fmacd d12 , d4 , d2 + fmacd d13 , d4 , d3 + KMAC_R d12 , d5 , d3 + KMAC_I d13 , d5 , d2 + + add XO, XO, INC_X + +.endm + +.macro SAVE_S1 + + fldmiad YO, { d4 - d5 } + + FMAC_R1 d4 , d0 , d12 + FMAC_I1 d5 , d0 , d13 + FMAC_R2 d4 , d1 , d13 + FMAC_I2 d5 , d1 , d12 + + fstmiad YO, { d4 - d5 } + + add 
YO, YO, INC_Y + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4 - r9 , fp} + add fp, sp, #28 + sub sp, sp, #STACKSIZE // reserve stack + + sub r12, fp, #192 + +#if defined(DOUBLE) + vstm r12, { d8 - d15 } // store floating point registers +#else + vstm r12, { s8 - s15 } // store floating point registers +#endif + + cmp M, #0 + ble zgemvt_kernel_L999 + + cmp OLD_N, #0 + ble zgemvt_kernel_L999 + + str OLD_A, A + str OLD_N, N + + ldr INC_X , OLD_INC_X + ldr INC_Y , OLD_INC_Y + + cmp INC_X, #0 + beq zgemvt_kernel_L999 + + cmp INC_Y, #0 + beq zgemvt_kernel_L999 + + ldr LDA, OLD_LDA + + +#if defined(DOUBLE) + lsl LDA, LDA, #4 // LDA * SIZE +#else + lsl LDA, LDA, #3 // LDA * SIZE +#endif + + cmp INC_X, #1 + bne zgemvt_kernel_S2_BEGIN + + cmp INC_Y, #1 + bne zgemvt_kernel_S2_BEGIN + + +zgemvt_kernel_F2_BEGIN: + + ldr YO , Y + + ldr J, N + asrs J, J, #1 // J = N / 2 + ble zgemvt_kernel_F1_BEGIN + +zgemvt_kernel_F2X4: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO2, LDA + str r3 , A + + ldr XO , X + + INIT_F2 + + asrs I, M, #2 // I = M / 4 + ble zgemvt_kernel_F2X1 + + +zgemvt_kernel_F2X4_10: + + KERNEL_F2X4 + + subs I, I, #1 + bne zgemvt_kernel_F2X4_10 + + +zgemvt_kernel_F2X1: + + ands I, M , #3 + ble zgemvt_kernel_F2_END + +zgemvt_kernel_F2X1_10: + + KERNEL_F2X1 + + subs I, I, #1 + bne zgemvt_kernel_F2X1_10 + + +zgemvt_kernel_F2_END: + + SAVE_F2 + + subs J , J , #1 + bne zgemvt_kernel_F2X4 + + +zgemvt_kernel_F1_BEGIN: + + ldr J, N + ands J, J, #1 + ble zgemvt_kernel_L999 + +zgemvt_kernel_F1X4: + + ldr AO1, A + + ldr XO , X + + INIT_F1 + + asrs I, M, #2 // I = M / 4 + ble zgemvt_kernel_F1X1 + + +zgemvt_kernel_F1X4_10: + + KERNEL_F1X4 + + subs I, I, #1 + bne zgemvt_kernel_F1X4_10 + + +zgemvt_kernel_F1X1: + + ands I, M , #3 + ble zgemvt_kernel_F1_END + 
+zgemvt_kernel_F1X1_10: + + KERNEL_F1X1 + + subs I, I, #1 + bne zgemvt_kernel_F1X1_10 + + +zgemvt_kernel_F1_END: + + SAVE_F1 + + b zgemvt_kernel_L999 + + + +/*************************************************************************************************************/ + +zgemvt_kernel_S2_BEGIN: + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE + lsl INC_Y, INC_Y, #4 // INC_Y * SIZE +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE +#endif + + ldr YO , Y + + ldr J, N + asrs J, J, #1 // J = N / 2 + ble zgemvt_kernel_S1_BEGIN + +zgemvt_kernel_S2X4: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO2, LDA + str r3 , A + + ldr XO , X + + INIT_S2 + + asrs I, M, #2 // I = M / 4 + ble zgemvt_kernel_S2X1 + + +zgemvt_kernel_S2X4_10: + + KERNEL_S2X4 + + subs I, I, #1 + bne zgemvt_kernel_S2X4_10 + + +zgemvt_kernel_S2X1: + + ands I, M , #3 + ble zgemvt_kernel_S2_END + +zgemvt_kernel_S2X1_10: + + KERNEL_S2X1 + + subs I, I, #1 + bne zgemvt_kernel_S2X1_10 + + +zgemvt_kernel_S2_END: + + SAVE_S2 + + subs J , J , #1 + bne zgemvt_kernel_S2X4 + + +zgemvt_kernel_S1_BEGIN: + + ldr J, N + ands J, J, #1 + ble zgemvt_kernel_L999 + +zgemvt_kernel_S1X4: + + ldr AO1, A + + ldr XO , X + + INIT_S1 + + asrs I, M, #2 // I = M / 4 + ble zgemvt_kernel_S1X1 + + +zgemvt_kernel_S1X4_10: + + KERNEL_S1X4 + + subs I, I, #1 + bne zgemvt_kernel_S1X4_10 + + +zgemvt_kernel_S1X1: + + ands I, M , #3 + ble zgemvt_kernel_S1_END + +zgemvt_kernel_S1X1_10: + + KERNEL_S1X1 + + subs I, I, #1 + bne zgemvt_kernel_S1X1_10 + + +zgemvt_kernel_S1_END: + + SAVE_S1 + + + +/*************************************************************************************************************/ + +zgemvt_kernel_L999: + + sub r3, fp, #192 + +#if defined(DOUBLE) + vldm r3, { d8 - d15 } // restore floating point registers +#else + vldm r3, { s8 - s15 } // restore floating point registers +#endif + + mov r0, #0 // set return value + + sub sp, fp, #28 + pop {r4 -r9 ,fp} + bx lr + + EPILOGUE + From 
be18cd47f6223cbe3d7fc45fcecec1035dd4d1db Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 1 Dec 2013 13:52:48 +0100 Subject: [PATCH 77/81] changed level3.c --- driver/level3/level3.c | 26 +------------ driver/level3/level3_thread.c | 70 +---------------------------------- 2 files changed, 3 insertions(+), 93 deletions(-) diff --git a/driver/level3/level3.c b/driver/level3/level3.c index d87c5f546..5f746642c 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -36,8 +36,6 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -// #define TIMING 1 - /* This file is a template for level 3 operation */ #ifndef BETA_OPERATION @@ -335,24 +333,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) - if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N; - else +#if ( defined(BULLDOZER) || defined(PILEDRIVER) || defined(HASWELL) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; -#elif defined(ARMV7) - if (min_jj >= 32) min_jj = 32; - else - if (min_jj >= 16) min_jj = 16; - else - if (min_jj >= 8) min_jj = 8; - else - if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - #else + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif @@ -412,22 +400,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #ifdef TIMING total = (double)outercost + (double)innercost + (double)kernelcost; -#ifdef ARMV7 - - printf( "Copy A : %5.2f Copy B: %5.2f Kernel : %5.2f\n", - innercost / total * 100., outercost / total * 100., - kernelcost / total * 100.); - - -#else - printf( "Copy A : %5.2f Copy B: %5.2f 
Kernel : %5.2f kernel Effi. : %5.2f Total Effi. : %5.2f\n", innercost / total * 100., outercost / total * 100., kernelcost / total * 100., (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / (double)kernelcost * 100. * (double)COMPSIZE / 2., (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / total * 100. * (double)COMPSIZE / 2.); -#endif #endif return 0; diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 56c4d6eca..ee1a8db7c 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -36,8 +36,6 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -// #define TIMING 1 - #ifndef CACHE_LINE_SIZE #define CACHE_LINE_SIZE 8 #endif @@ -235,21 +233,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, BLASLONG l1stride, l2size; #ifdef TIMING - -#ifdef ARMV7 - - unsigned long long rpcc_counter; - unsigned long long copy_A = 0; - unsigned long long copy_B = 0; - unsigned long long kernel = 0; - unsigned long long waiting1 = 0; - unsigned long long waiting2 = 0; - unsigned long long waiting3 = 0; - unsigned long long waiting6[MAX_CPU_NUMBER]; - unsigned long long ops = 0; - -#else - BLASULONG rpcc_counter; BLASULONG copy_A = 0; BLASULONG copy_B = 0; @@ -260,8 +243,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, BLASULONG waiting6[MAX_CPU_NUMBER]; BLASULONG ops = 0; -#endif - for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0; #endif @@ -339,35 +320,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, min_l = k - ls; -#ifdef ARMV7_1 - if (min_l >= GEMM_Q / 4 * 2) { - min_l = GEMM_Q / 4; - } else { - if (min_l > GEMM_Q / 4) min_l = (min_l + 1) / 2; - } - -#else if (min_l >= GEMM_Q * 2) { min_l = GEMM_Q; } else { if (min_l > GEMM_Q) min_l = (min_l + 1) / 2; } -#endif l1stride = 1; min_i = m_to - m_from; -#ifdef ARMV7_1 - 
if (min_i >= GEMM_P / 4 * 2) { - min_i = GEMM_P / 4; - } else { - if (min_i > GEMM_P / 4) { - min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1); - } else { - if (args -> nthreads == 1) l1stride = 0; - } - } -#else if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else { @@ -378,8 +339,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } } -#endif - START_RPCC(); ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); @@ -408,22 +367,12 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ min_jj = MIN(n_to, xxx + div_n) - jjs; -#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) - if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N; - else +#if ( defined(BULLDOZER) || defined(PILEDRIVER) || defined(HASWELL) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; -#elif defined(ARMV7) - if (min_jj >= 16) min_jj = 16; - else - if (min_jj >= 8) min_jj = 8; - else - if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - - #else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; @@ -555,21 +504,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, STOP_RPCC(waiting3); #ifdef TIMING - -#ifdef ARMV7 - - unsigned long long waiting = waiting1 + waiting2 + waiting3; - unsigned long long total = copy_A + copy_B + kernel + waiting; - - fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2f Copy_B : %6.2f Wait1 : %6.2f Wait2 : %6.2f Wait3 : %6.2f Kernel : %6.2f", - mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100., - (double)waiting1 /(double)total * 100., - (double)waiting2 /(double)total * 100., - (double)waiting3 /(double)total * 100., - (double)kernel /(double)total * 
100.); - -#else - BLASLONG waiting = waiting1 + waiting2 + waiting3; BLASLONG total = copy_A + copy_B + kernel + waiting; @@ -580,8 +514,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, (double)waiting3 /(double)total * 100., (double)ops/(double)kernel / 4. * 100.); -#endif - #if 0 fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2ld Copy_B : %6.2ld Wait : %6.2ld\n", mypos, copy_A, copy_B, waiting); From 759421641235481481ad663ee6cab0e5e49f6fed Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 1 Dec 2013 16:32:54 +0100 Subject: [PATCH 78/81] modified common.h --- common.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/common.h b/common.h index 310fcad93..4e3230d7e 100644 --- a/common.h +++ b/common.h @@ -310,9 +310,12 @@ typedef int blasint; #define YIELDING SwitchToThread() #endif - #if defined(ARMV7) || defined(ARMV6) || defined(ARMV8) -#define YIELDING asm volatile ("nop;nop;nop;nop;nop;nop;nop;nop; \n"); +#define YIELDING asm volatile ("nop;nop;nop;nop;nop;nop;nop;nop; \n"); +#endif + +#ifdef PILEDRIVER +#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); #endif @@ -320,8 +323,6 @@ typedef int blasint; #define YIELDING sched_yield() #endif - - /*** To alloc job_t on heap or statck. please https://github.com/xianyi/OpenBLAS/issues/246 @@ -379,6 +380,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_arm64.h" #endif + #ifdef OS_LINUX #include "common_linux.h" #endif @@ -590,9 +592,10 @@ typedef struct { #include "common_level2.h" #include "common_level3.h" #include "common_lapack.h" + #ifdef CBLAS -/* This header file is generated from "cblas.h" (see Makefile.prebuild). 
*/ -#include "cblas_noconst.h" +# define OPENBLAS_CONST /* see comment in cblas.h */ +# include "cblas.h" #endif #ifndef ASSEMBLER From 51e59835995b71de11e38ec872ed7d2d53ea3fbf Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 1 Dec 2013 16:48:08 +0100 Subject: [PATCH 79/81] modified Makefile.system --- Makefile.system | 43 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/Makefile.system b/Makefile.system index d6c172f3d..ee6a89046 100644 --- a/Makefile.system +++ b/Makefile.system @@ -82,12 +82,19 @@ ifeq ($(HOSTCC), loongcc) GETARCH_FLAGS += -static endif +#if don't use Fortran, it will only compile CBLAS. +ifeq ($(ONLY_CBLAS), 1) +NO_LAPACK = 1 +else +ONLY_CBLAS = 0 +endif + # This operation is expensive, so execution should be once. ifndef GOTOBLAS_MAKEFILE export GOTOBLAS_MAKEFILE = 1 # Generating Makefile.conf and config.h -DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all) +DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) all) ifndef TARGET_CORE include $(TOPDIR)/Makefile.conf @@ -329,16 +336,14 @@ ifeq ($(ARCH), x86) DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO ifneq ($(NO_AVX), 1) -DYNAMIC_CORE += SANDYBRIDGE -#BULLDOZER PILEDRIVER +DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER endif endif ifeq ($(ARCH), x86_64) DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO ifneq ($(NO_AVX), 1) -DYNAMIC_CORE += SANDYBRIDGE -#BULLDOZER PILEDRIVER +DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER endif endif @@ -369,17 +374,18 @@ 
BINARY_DEFINED = 1 endif ifeq ($(ARCH), arm) -NO_BINARY_MODE = 1 -BINARY_DEFINED = 1 +NO_BINARY_MODE = 1 +BINARY_DEFINED = 1 endif ifeq ($(ARCH), arm64) -NO_BINARY_MODE = 1 -BINARY_DEFINED = 1 +NO_BINARY_MODE = 1 +BINARY_DEFINED = 1 endif + # # C Compiler dependent settings # @@ -852,6 +858,7 @@ COMMON_OPT = -O3 endif endif + ifndef COMMON_OPT COMMON_OPT = -O2 endif @@ -921,6 +928,23 @@ LIBZIPNAME = $(LIBNAME:.$(LIBSUFFIX)=.zip) LIBS = $(TOPDIR)/$(LIBNAME) LIBS_P = $(TOPDIR)/$(LIBNAME_P) + +LIB_COMPONENTS = BLAS +ifneq ($(NO_CBLAS), 1) +LIB_COMPONENTS += CBLAS +endif + +ifneq ($(NO_LAPACK), 1) +LIB_COMPONENTS += LAPACK +ifneq ($(NO_LAPACKE), 1) +LIB_COMPONENTS += LAPACKE +endif +endif + +ifeq ($(ONLY_CBLAS), 1) +LIB_COMPONENTS = CBLAS +endif + export OSNAME export ARCH export CORE @@ -947,6 +971,7 @@ export USE_OPENMP export CROSS export CROSS_SUFFIX export NOFORTRAN +export NO_FBLAS export EXTRALIB export CEXTRALIB export FEXTRALIB From d844901062c94343356146bc9e8c5a0f1404bf15 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 1 Dec 2013 17:38:58 +0100 Subject: [PATCH 80/81] modified Makefile.rule --- Makefile.rule | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index c8433288b..e357d5ccc 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -12,33 +12,33 @@ VERSION = 0.2.8 # You can specify the target architecture, otherwise it's # automatically detected. -TARGET = ARMV6 +# TARGET = PENRYN # If you want to support multiple architecture in one binary # DYNAMIC_ARCH = 1 # C compiler including binary type(32bit / 64bit). Default is gcc. # Don't use Intel Compiler or PGI, it won't generate right codes as I expect. -CC = gcc +# CC = gcc # Fortran compiler. Default is g77. -FC = gfortran +# FC = gfortran # Even you can specify cross compiler. Meanwhile, please set HOSTCC. 
-#CC = arm-linux-gnueabihf-gcc -#FC = arm-linux-gnueabihf-gfortran +# CC = x86_64-w64-mingw32-gcc +# FC = x86_64-w64-mingw32-gfortran # If you use the cross compiler, please set this host compiler. -HOSTCC = gcc +# HOSTCC = gcc # If you need 32bit binary, define BINARY=32, otherwise define BINARY=64 -#BINARY=32 +# BINARY=64 # About threaded BLAS. It will be automatically detected if you don't # specify it. # For force setting for single threaded, specify USE_THREAD = 0 # For force setting for multi threaded, specify USE_THREAD = 1 -#USE_THREAD = 0 +# USE_THREAD = 0 # If you're going to use this library with OpenMP, please comment it in. # USE_OPENMP = 1 @@ -46,7 +46,7 @@ HOSTCC = gcc # You can define maximum number of threads. Basically it should be # less than actual number of cores. If you don't specify one, it's # automatically detected by the the script. -NUM_THREADS = 16 +# NUM_THREADS = 24 # if you don't need generate the shared library, please comment it in. # NO_SHARED = 1 @@ -54,12 +54,16 @@ NUM_THREADS = 16 # If you don't need CBLAS interface, please comment it in. # NO_CBLAS = 1 +# If you only want CBLAS interface without installing Fortran compiler, +# please comment it in. +# ONLY_CBLAS = 1 + # If you don't need LAPACK, please comment it in. # If you set NO_LAPACK=1, the library automatically sets NO_LAPACKE=1. -#NO_LAPACK = 1 +# NO_LAPACK = 1 # If you don't need LAPACKE (C Interface to LAPACK), please comment it in. -#NO_LAPACKE = 1 +# NO_LAPACKE = 1 # If you want to use legacy threaded Level 3 implementation. # USE_SIMPLE_THREADED_LEVEL3 = 1 @@ -72,10 +76,10 @@ NUM_THREADS = 16 # Unfortunately most of kernel won't give us high quality buffer. # BLAS tries to find the best region before entering main function, # but it will consume time. If you don't like it, you can disable one. -NO_WARMUP = 1 +# NO_WARMUP = 1 # If you want to disable CPU/Memory affinity on Linux. -NO_AFFINITY = 1 +# NO_AFFINITY = 1 # Don't use AVX kernel on Sandy Bridge. 
It is compatible with old compilers # and OS. However, the performance is low. @@ -123,13 +127,13 @@ NO_AFFINITY = 1 # Common Optimization Flag; # The default -O2 is enough. -#COMMON_OPT = -O3 -marm -mfpu=vfpv3 -mfloat-abi=hard +# COMMON_OPT = -O2 # Profiling flags COMMON_PROF = -pg # Build Debug version -DEBUG = 1 +# DEBUG = 1 # # End of user configuration From 9e38dbb658f71b393411c5189260a4d5a22ed7db Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 1 Dec 2013 17:51:39 +0100 Subject: [PATCH 81/81] modified param.h --- param.h | 131 ++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 117 insertions(+), 14 deletions(-) diff --git a/param.h b/param.h index 0628a1972..b865287be 100644 --- a/param.h +++ b/param.h @@ -304,9 +304,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef PILEDRIVER - -#define SNUMOPT 8 -#define DNUMOPT 4 +#define SNUMOPT 8 +#define DNUMOPT 4 #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 832 @@ -344,39 +343,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMV_UNROLL 8 #endif - #if defined(ARCH_X86_64) #define SGEMM_DEFAULT_P 768 -#define DGEMM_DEFAULT_P 384 +#define DGEMM_DEFAULT_P 768 +#define ZGEMM_DEFAULT_P 384 +#define CGEMM_DEFAULT_P 768 #else #define SGEMM_DEFAULT_P 448 -#define DGEMM_DEFAULT_P 224 +#define DGEMM_DEFAULT_P 480 +#define ZGEMM_DEFAULT_P 112 +#define CGEMM_DEFAULT_P 224 #endif #define QGEMM_DEFAULT_P 112 -#define CGEMM_DEFAULT_P 224 -#define ZGEMM_DEFAULT_P 112 #define XGEMM_DEFAULT_P 56 #if defined(ARCH_X86_64) -#define SGEMM_DEFAULT_Q 168 +#define SGEMM_DEFAULT_Q 192 #define DGEMM_DEFAULT_Q 168 +#define ZGEMM_DEFAULT_Q 168 +#define CGEMM_DEFAULT_Q 168 #else #define SGEMM_DEFAULT_Q 224 #define DGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 224 +#define CGEMM_DEFAULT_Q 224 #endif #define QGEMM_DEFAULT_Q 224 -#define CGEMM_DEFAULT_Q 224 -#define ZGEMM_DEFAULT_Q 224 #define XGEMM_DEFAULT_Q 224 -#define SGEMM_DEFAULT_R sgemm_r +#define SGEMM_DEFAULT_R 12288 #define QGEMM_DEFAULT_R qgemm_r -#define DGEMM_DEFAULT_R dgemm_r +#define DGEMM_DEFAULT_R 12288 #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r #define XGEMM_DEFAULT_R xgemm_r -#define SYMV_P 16 +#define SYMV_P 16 #define HAVE_EXCLUSIVE_CACHE #define GEMM_THREAD gemm_thread_mn @@ -1150,6 +1152,107 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif +#ifdef HASWELL + +#define SNUMOPT 8 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SYMV_P 8 + +#define SWITCH_RATIO 4 + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#else + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#endif + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_P 512 +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_R dgemm_r +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_R 1024 +#define ZGEMM_DEFAULT_P 512 +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define SGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 192 +#define XGEMM_DEFAULT_Q 128 + +#else + +#define SGEMM_DEFAULT_P 768 +#define DGEMM_DEFAULT_P 512 +#define CGEMM_DEFAULT_P 384 +#define ZGEMM_DEFAULT_P 256 + +#define SGEMM_DEFAULT_Q 384 +#define DGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_Q 192 +#define ZGEMM_DEFAULT_Q 128 + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R 13824 +#define CGEMM_DEFAULT_R cgemm_r +#define 
ZGEMM_DEFAULT_R zgemm_r + +#define QGEMM_DEFAULT_Q 128 +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define XGEMM_DEFAULT_Q 128 + +#define CGEMM3M_DEFAULT_UNROLL_N 4 +#define CGEMM3M_DEFAULT_UNROLL_M 8 +#define ZGEMM3M_DEFAULT_UNROLL_N 2 +#define ZGEMM3M_DEFAULT_UNROLL_M 8 +#endif + + +#endif + #ifdef ATOM