arm: Remove unnecessary files/code

Since softfp code has been added to all required vfp kernels,
the code for auto detection of abi is no longer required.

The option to force softfp ABI on make command line by giving
ARM_SOFTFP_ABI=1 is retained. But there is no need to give this option
anymore.

Also the newly added C versions of 4x4/4x2 gemm/trmm kernels are removed.
These are longer required. Moreover these kernels has bugs.
This commit is contained in:
Ashwin Sekhar T K 2017-07-02 03:06:36 +05:30
parent 97d671eb61
commit 37efb5bc1d
5 changed files with 8 additions and 1443 deletions

View File

@ -487,19 +487,14 @@ ifeq ($(ARCH), arm)
NO_BINARY_MODE = 1
BINARY_DEFINED = 1
# If ABI is specified on command line use it. Else use the automatically detected ABI.
ifeq ($(ARM_SOFTFP_ABI),1)
ARM_ABI = softfp
else
ifeq ($(ARM_HARD_ABI),1)
ARM_ABI = hard
else
ARM_ABI=$(ARM_ABI_AUTO)
CCOMMON_OPT += -marm
FCOMMON_OPT += -marm
# If softfp abi is mentioned on the command line, force it.
ifeq ($(ARM_SOFTFP_ABI), 1)
CCOMMON_OPT += -mfloat-abi=softfp
FCOMMON_OPT += -mfloat-abi=softfp
endif
endif
export ARM_ABI_AUTO
CCOMMON_OPT += -marm -mfloat-abi=$(ARM_ABI)
FCOMMON_OPT += -marm -mfloat-abi=$(ARM_ABI)
endif

16
c_check
View File

@ -94,17 +94,7 @@ if ($architecture eq "mips64") {
$defined = 1;
}
if ($architecture eq "arm") {
$defined = 1;
$data = `$compiler_name -dM -E ctest2.c | grep -w __ARM_PCS_VFP`;
if ($data ne "") {
$abi = "hard";
} else {
$abi = "softfp";
}
}
if ($architecture eq "arm64") {
if (($architecture eq "arm") || ($architecture eq "arm64")) {
$defined = 1;
}
@ -297,10 +287,6 @@ print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n";
print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1;
print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1;
if ($architecture eq "arm") {
print MAKEFILE "ARM_ABI_AUTO=$abi\n";
}
$os =~ tr/[a-z]/[A-Z]/;
$architecture =~ tr/[a-z]/[A-Z]/;
$compiler =~ tr/[a-z]/[A-Z]/;

View File

@ -1,317 +0,0 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <stdbool.h>
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc)
{
BLASLONG i,j,k;
FLOAT *C0,*C1,*ptrba,*ptrbb;
FLOAT res0_0;
FLOAT res0_1;
FLOAT res0_2;
FLOAT res0_3;
FLOAT res1_0;
FLOAT res1_1;
FLOAT res1_2;
FLOAT res1_3;
FLOAT a0;
FLOAT a1;
FLOAT b0;
FLOAT b1;
for (j=0; j<(bn/2); j+=2)
{
C0 = C;
C1 = C0+ldc;
ptrba = ba;
for (i=0; i<bm/4; i+=1)
{
ptrbb = bb;
res0_0 = 0;
res0_1 = 0;
res0_2 = 0;
res0_3 = 0;
res1_0 = 0;
res1_1 = 0;
res1_2 = 0;
res1_3 = 0;
for (k=0; k<bk; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
a0 = ptrba[0];
res0_0 += a0*b0;
res1_0 += a0*b1;
a1 = ptrba[1];
res0_1 += a1*b0;
res1_1 += a1*b1;
a0 = ptrba[2];
res0_2 += a0*b0;
res1_2 += a0*b1;
a1 = ptrba[3];
res0_3 += a1*b0;
res1_3 += a1*b1;
ptrba = ptrba+4;
ptrbb = ptrbb+2;
}
res0_0 *= alpha;
res0_1 *= alpha;
res0_2 *= alpha;
res0_3 *= alpha;
res1_0 *= alpha;
res1_1 *= alpha;
res1_2 *= alpha;
res1_3 *= alpha;
C0[0] += res0_0;
C0[1] += res0_1;
C0[2] += res0_2;
C0[3] += res0_3;
C1[0] += res1_0;
C1[1] += res1_1;
C1[2] += res1_2;
C1[3] += res1_3;
C0 = C0+4;
C1 = C1+4;
}
if ( bm & 2 )
{
ptrbb = bb;
res0_0 = 0;
res0_1 = 0;
res1_0 = 0;
res1_1 = 0;
for (k=0; k<bk; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
a0 = ptrba[0];
res0_0 += a0*b0;
res1_0 += a0*b1;
a1 = ptrba[1];
res0_1 += a1*b0;
res1_1 += a1*b1;
ptrba = ptrba+2;
ptrbb = ptrbb+2;
}
res0_0 *= alpha;
res0_1 *= alpha;
res1_0 *= alpha;
res1_1 *= alpha;
C0[0] += res0_0;
C0[1] += res0_1;
C1[0] += res1_0;
C1[1] += res1_1;
C0 = C0+2;
C1 = C1+2;
}
if ( bm & 1 )
{
ptrbb = bb;
res0_0 = 0;
res1_0 = 0;
for (k=0; k<bk; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
a0 = ptrba[0];
res0_0 += a0*b0;
res1_0 += a0*b1;
ptrba = ptrba+1;
ptrbb = ptrbb+2;
}
res0_0 *= alpha;
res1_0 *= alpha;
C0[0] += res0_0;
C1[0] += res1_0;
C0 += C0+1;
C1 += C1+1;
}
k = (bk<<1);
bb = bb+k;
i = (ldc<<1);
C = C+i;
}
for (j=0; j<(bn&1); j+=1) // do the Mx1 loops
{
C0 = C;
ptrba = ba;
for (i=0; i<bm/4; i+=1) // do blocks of 4x1 loops
{
ptrbb = bb;
res0_0 = 0;
res0_1 = 0;
res0_2 = 0;
res0_3 = 0;
for (k=0; k<bk; k++)
{
b0 = ptrbb[0];
a0 = ptrba[0];
res0_0 += a0*b0;
a1 = ptrba[1];
res0_1 += a1*b0;
a0 = ptrba[2];
res0_2 += a0*b0;
a1 = ptrba[3];
res0_3 += a1*b0;
ptrba = ptrba+4;
ptrbb = ptrbb+1;
}
res0_0 *= alpha;
res0_1 *= alpha;
res0_2 *= alpha;
res0_3 *= alpha;
C0[0] += res0_0;
C0[1] += res0_1;
C0[2] += res0_2;
C0[3] += res0_3;
C0 = C0+4;
}
if ( bm & 2 ) // do any 2x1 loop
{
ptrbb = bb;
res0_0 = 0;
res0_1 = 0;
for (k=0; k<bk; k++)
{
b0 = ptrbb[0];
a0 = ptrba[0];
res0_0 += a0*b0;
a1 = ptrba[1];
res0_1 += a1*b0;
ptrba = ptrba+2;
ptrbb = ptrbb+1;
}
res0_0 *= alpha;
res0_1 *= alpha;
C0[0] += res0_0;
C0[1] += res0_1;
C0 = C0+2;
}
if ( bm & 1 ) // do any 1x1 loop
{
ptrbb = bb;
res0_0 = 0;
for (k=0; k<bk; k++)
{
b0 = ptrbb[0];
a0 = ptrba[0];
res0_0 += a0*b0;
ptrba = ptrba+1;
ptrbb = ptrbb+1;
}
res0_0 *= alpha;
C0[0] = res0_0;
C0 += C0+1;
}
k = (bk<<0);
bb = bb+k;
C = C+ldc;
}
return 0;
}

View File

@ -1,571 +0,0 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <stdbool.h>
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc)
{
BLASLONG i,j,k;
FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb;
FLOAT res0_0;
FLOAT res0_1;
FLOAT res0_2;
FLOAT res0_3;
FLOAT res1_0;
FLOAT res1_1;
FLOAT res1_2;
FLOAT res1_3;
FLOAT res2_0;
FLOAT res2_1;
FLOAT res2_2;
FLOAT res2_3;
FLOAT res3_0;
FLOAT res3_1;
FLOAT res3_2;
FLOAT res3_3;
FLOAT a0;
FLOAT a1;
FLOAT b0;
FLOAT b1;
FLOAT b2;
FLOAT b3;
for (j=0; j<bn/4; j+=1) // do blocks of the Mx4 loops
{
C0 = C;
C1 = C0+ldc;
C2 = C1+ldc;
C3 = C2+ldc;
ptrba = ba;
for (i=0; i<bm/4; i+=1) // do blocks of 4x4
{
ptrbb = bb;
res0_0 = 0;
res0_1 = 0;
res0_2 = 0;
res0_3 = 0;
res1_0 = 0;
res1_1 = 0;
res1_2 = 0;
res1_3 = 0;
res2_0 = 0;
res2_1 = 0;
res2_2 = 0;
res2_3 = 0;
res3_0 = 0;
res3_1 = 0;
res3_2 = 0;
res3_3 = 0;
for (k=0; k<bk; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
b2 = ptrbb[2];
b3 = ptrbb[3];
a0 = ptrba[0];
res0_0 += a0*b0;
res1_0 += a0*b1;
res2_0 += a0*b2;
res3_0 += a0*b3;
a1 = ptrba[1];
res0_1 += a1*b0;
res1_1 += a1*b1;
res2_1 += a1*b2;
res3_1 += a1*b3;
a0 = ptrba[2];
res0_2 += a0*b0;
res1_2 += a0*b1;
res2_2 += a0*b2;
res3_2 += a0*b3;
a1 = ptrba[3];
res0_3 += a1*b0;
res1_3 += a1*b1;
res2_3 += a1*b2;
res3_3 += a1*b3;
ptrba = ptrba+4;
ptrbb = ptrbb+4;
}
res0_0 *= alpha;
res0_1 *= alpha;
res0_2 *= alpha;
res0_3 *= alpha;
res1_0 *= alpha;
res1_1 *= alpha;
res1_2 *= alpha;
res1_3 *= alpha;
res2_0 *= alpha;
res2_1 *= alpha;
res2_2 *= alpha;
res2_3 *= alpha;
res3_0 *= alpha;
res3_1 *= alpha;
res3_2 *= alpha;
res3_3 *= alpha;
C0[0] += res0_0;
C0[1] += res0_1;
C0[2] += res0_2;
C0[3] += res0_3;
C1[0] += res1_0;
C1[1] += res1_1;
C1[2] += res1_2;
C1[3] += res1_3;
C2[0] += res2_0;
C2[1] += res2_1;
C2[2] += res2_2;
C2[3] += res2_3;
C3[0] += res3_0;
C3[1] += res3_1;
C3[2] += res3_2;
C3[3] += res3_3;
C0 = C0+4;
C1 = C1+4;
C2 = C2+4;
C3 = C3+4;
}
if ( bm & 2 ) // do any 2x4 loop
{
ptrbb = bb;
res0_0 = 0;
res0_1 = 0;
res1_0 = 0;
res1_1 = 0;
res2_0 = 0;
res2_1 = 0;
res3_0 = 0;
res3_1 = 0;
for (k=0; k<bk; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
b2 = ptrbb[2];
b3 = ptrbb[3];
a0 = ptrba[0];
res0_0 += a0*b0;
res1_0 += a0*b1;
res2_0 += a0*b2;
res3_0 += a0*b3;
a1 = ptrba[1];
res0_1 += a1*b0;
res1_1 += a1*b1;
res2_1 += a1*b2;
res3_1 += a1*b3;
ptrba = ptrba+2;
ptrbb = ptrbb+4;
}
res0_0 *= alpha;
res0_1 *= alpha;
res1_0 *= alpha;
res1_1 *= alpha;
res2_0 *= alpha;
res2_1 *= alpha;
res3_0 *= alpha;
res3_1 *= alpha;
C0[0] += res0_0;
C0[1] += res0_1;
C1[0] += res1_0;
C1[1] += res1_1;
C2[0] += res2_0;
C2[1] += res2_1;
C3[0] += res3_0;
C3[1] += res3_1;
C0 = C0+2;
C1 = C1+2;
C2 = C2+2;
C3 = C3+2;
}
if ( bm & 1 ) // do any 1x4 loop
{
ptrbb = bb;
res0_0 = 0;
res1_0 = 0;
res2_0 = 0;
res3_0 = 0;
for (k=0; k<bk; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
b2 = ptrbb[2];
b3 = ptrbb[3];
a0 = ptrba[0];
res0_0 += a0*b0;
res1_0 += a0*b1;
res2_0 += a0*b2;
res3_0 += a0*b3;
ptrba = ptrba+1;
ptrbb = ptrbb+4;
}
res0_0 *= alpha;
res1_0 *= alpha;
res2_0 *= alpha;
res3_0 *= alpha;
C0[0] += res0_0;
C1[0] += res1_0;
C2[0] += res2_0;
C3[0] += res3_0;
C0 = C0+1;
C1 = C1+1;
C2 = C2+1;
C3 = C3+1;
}
k = (bk<<2);
bb = bb+k;
i = (ldc<<2);
C = C+i;
}
for (j=0; j<(bn&2); j+=2) // do the Mx2 loops
{
C0 = C;
C1 = C0+ldc;
ptrba = ba;
for (i=0; i<bm/4; i+=1) // do blocks of 4x2
{
ptrbb = bb;
res0_0 = 0;
res0_1 = 0;
res0_2 = 0;
res0_3 = 0;
res1_0 = 0;
res1_1 = 0;
res1_2 = 0;
res1_3 = 0;
for (k=0; k<bk; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
a0 = ptrba[0];
res0_0 += a0*b0;
res1_0 += a0*b1;
a1 = ptrba[1];
res0_1 += a1*b0;
res1_1 += a1*b1;
a0 = ptrba[2];
res0_2 += a0*b0;
res1_2 += a0*b1;
a1 = ptrba[3];
res0_3 += a1*b0;
res1_3 += a1*b1;
ptrba = ptrba+4;
ptrbb = ptrbb+2;
}
res0_0 *= alpha;
res0_1 *= alpha;
res0_2 *= alpha;
res0_3 *= alpha;
res1_0 *= alpha;
res1_1 *= alpha;
res1_2 *= alpha;
res1_3 *= alpha;
C0[0] += res0_0;
C0[1] += res0_1;
C0[2] += res0_2;
C0[3] += res0_3;
C1[0] += res1_0;
C1[1] += res1_1;
C1[2] += res1_2;
C1[3] += res1_3;
C0 = C0+4;
C1 = C1+4;
}
if ( bm & 2 ) // do any 2x2 loop
{
ptrbb = bb;
res0_0 = 0;
res0_1 = 0;
res1_0 = 0;
res1_1 = 0;
for (k=0; k<bk; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
a0 = ptrba[0];
res0_0 += a0*b0;
res1_0 += a0*b1;
a1 = ptrba[1];
res0_1 += a1*b0;
res1_1 += a1*b1;
ptrba = ptrba+2;
ptrbb = ptrbb+2;
}
res0_0 *= alpha;
res0_1 *= alpha;
res1_0 *= alpha;
res1_1 *= alpha;
C0[0] += res0_0;
C0[1] += res0_1;
C1[0] += res1_0;
C1[1] += res1_1;
C0 = C0+2;
C1 = C1+2;
}
if ( bm & 1 ) // do any 1x2 loop
{
ptrbb = bb;
res0_0 = 0;
res1_0 = 0;
for (k=0; k<bk; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
a0 = ptrba[0];
res0_0 += a0*b0;
res1_0 += a0*b1;
ptrba = ptrba+1;
ptrbb = ptrbb+2;
}
res0_0 *= alpha;
res1_0 *= alpha;
C0[0] += res0_0;
C1[0] += res1_0;
C0 = C0+1;
C1 = C1+1;
}
k = (bk<<1);
bb = bb+k;
i = (ldc<<1);
C = C+i;
}
for (j=0; j<(bn&1); j+=1) // do the Mx1 loops
{
C0 = C;
ptrba = ba;
for (i=0; i<bm/4; i+=1) // do blocks of 4x1 loops
{
ptrbb = bb;
res0_0 = 0;
res0_1 = 0;
res0_2 = 0;
res0_3 = 0;
for (k=0; k<bk; k++)
{
b0 = ptrbb[0];
a0 = ptrba[0];
res0_0 += a0*b0;
a1 = ptrba[1];
res0_1 += a1*b0;
a0 = ptrba[2];
res0_2 += a0*b0;
a1 = ptrba[3];
res0_3 += a1*b0;
ptrba = ptrba+4;
ptrbb = ptrbb+1;
}
res0_0 *= alpha;
res0_1 *= alpha;
res0_2 *= alpha;
res0_3 *= alpha;
C0[0] += res0_0;
C0[1] += res0_1;
C0[2] += res0_2;
C0[3] += res0_3;
C0 = C0+4;
}
if ( bm & 2 ) // do any 2x1 loop
{
ptrbb = bb;
res0_0 = 0;
res0_1 = 0;
for (k=0; k<bk; k++)
{
b0 = ptrbb[0];
a0 = ptrba[0];
res0_0 += a0*b0;
a1 = ptrba[1];
res0_1 += a1*b0;
ptrba = ptrba+2;
ptrbb = ptrbb+1;
}
res0_0 *= alpha;
res0_1 *= alpha;
C0[0] += res0_0;
C0[1] += res0_1;
C0 = C0+2;
}
if ( bm & 1 ) // do any 1x1 loop
{
ptrbb = bb;
res0_0 = 0;
for (k=0; k<bk; k++)
{
b0 = ptrbb[0];
a0 = ptrba[0];
res0_0 += a0*b0;
ptrba = ptrba+1;
ptrbb = ptrbb+1;
}
res0_0 *= alpha;
C0[0] += res0_0;
C0 = C0+1;
}
k = (bk<<0);
bb = bb+k;
C = C+ldc;
}
return 0;
}

View File

@ -1,528 +0,0 @@
#include "common.h"
#include <stdbool.h>
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
{
BLASLONG i,j,k;
FLOAT *C0,*C1,*ptrba,*ptrbb;
FLOAT res0_0;
FLOAT res0_1;
FLOAT res0_2;
FLOAT res0_3;
FLOAT res1_0;
FLOAT res1_1;
FLOAT res1_2;
FLOAT res1_3;
FLOAT a0;
FLOAT a1;
FLOAT b0;
FLOAT b1;
BLASLONG off, temp;
bool left;
bool transposed;
bool backwards;
#ifdef LEFT
left = true;
#else
left = false;
#endif
#ifdef TRANSA
transposed = true;
#else
transposed = false;
#endif
backwards = left != transposed;
if (!left) {
off = -offset;
}
for (j=0; j<(bn/2); j+=2) // do the Mx2 loops
{
C0 = C;
C1 = C0+ldc;
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
ptrba = ba;
for (i=0; i<bm/4; i+=1) // do blocks of 4x2
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*4;
ptrbb = bb + off*2;
#endif
res0_0 = 0;
res0_1 = 0;
res0_2 = 0;
res0_3 = 0;
res1_0 = 0;
res1_1 = 0;
res1_2 = 0;
res1_3 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+4; // number of values in A
#else
temp = off+2; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
a0 = ptrba[0];
res0_0 += a0*b0;
res1_0 += a0*b1;
a1 = ptrba[1];
res0_1 += a1*b0;
res1_1 += a1*b1;
a0 = ptrba[2];
res0_2 += a0*b0;
res1_2 += a0*b1;
a1 = ptrba[3];
res0_3 += a1*b0;
res1_3 += a1*b1;
ptrba = ptrba+4;
ptrbb = ptrbb+2;
}
res0_0 *= alpha;
res0_1 *= alpha;
res0_2 *= alpha;
res0_3 *= alpha;
res1_0 *= alpha;
res1_1 *= alpha;
res1_2 *= alpha;
res1_3 *= alpha;
C0[0] = res0_0;
C0[1] = res0_1;
C0[2] = res0_2;
C0[3] = res0_3;
C1[0] = res1_0;
C1[1] = res1_1;
C1[2] = res1_2;
C1[3] = res1_3;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 4; // number of values in A
#else
temp -= 2; // number of values in B
#endif
ptrba += temp*4;
ptrbb += temp*2;
#endif
#ifdef LEFT
off += 4; // number of values in A
#endif
C0 = C0+4;
C1 = C1+4;
}
if ( bm & 2 ) // do any 2x2 loop
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*2;
ptrbb = bb + off*2;
#endif
res0_0 = 0;
res0_1 = 0;
res1_0 = 0;
res1_1 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+2; // number of values in A
#else
temp = off+2; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
a0 = ptrba[0];
res0_0 += a0*b0;
res1_0 += a0*b1;
a1 = ptrba[1];
res0_1 += a1*b0;
res1_1 += a1*b1;
ptrba = ptrba+2;
ptrbb = ptrbb+2;
}
res0_0 *= alpha;
res0_1 *= alpha;
res1_0 *= alpha;
res1_1 *= alpha;
C0[0] = res0_0;
C0[1] = res0_1;
C1[0] = res1_0;
C1[1] = res1_1;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 2; // number of values in A
#else
temp -= 2; // number of values in B
#endif
ptrba += temp*2;
ptrbb += temp*2;
#endif
#ifdef LEFT
off += 2; // number of values in A
#endif
C0 = C0+2;
C1 = C1+2;
}
if ( bm & 1 ) // do any 1x2 loop
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*1;
ptrbb = bb + off*2;
#endif
res0_0 = 0;
res1_0 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+1; // number of values in A
#else
temp = off+2; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
a0 = ptrba[0];
res0_0 += a0*b0;
res1_0 += a0*b1;
ptrba = ptrba+1;
ptrbb = ptrbb+2;
}
res0_0 *= alpha;
res1_0 *= alpha;
C0[0] = res0_0;
C1[0] = res1_0;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 1; // number of values in A
#else
temp -= 2; // number of values in B
#endif
ptrba += temp*1;
ptrbb += temp*2;
#endif
#ifdef LEFT
off += 1; // number of values in A
#endif
C0 = C0+1;
C1 = C1+1;
}
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 2;
#endif
k = (bk<<1);
bb = bb+k;
i = (ldc<<1);
C = C+i;
}
for (j=0; j<(bn&1); j+=1) // do the Mx1 loops
{
C0 = C;
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
ptrba = ba;
for (i=0; i<bm/4; i+=1) // do blocks of 4x1 loops
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*4;
ptrbb = bb + off*1;
#endif
res0_0 = 0;
res0_1 = 0;
res0_2 = 0;
res0_3 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+4; // number of values in A
#else
temp = off+1; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
a0 = ptrba[0];
res0_0 += a0*b0;
a1 = ptrba[1];
res0_1 += a1*b0;
a0 = ptrba[2];
res0_2 += a0*b0;
a1 = ptrba[3];
res0_3 += a1*b0;
ptrba = ptrba+4;
ptrbb = ptrbb+1;
}
res0_0 *= alpha;
res0_1 *= alpha;
res0_2 *= alpha;
res0_3 *= alpha;
C0[0] = res0_0;
C0[1] = res0_1;
C0[2] = res0_2;
C0[3] = res0_3;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 4; // number of values in A
#else
temp -= 1; // number of values in B
#endif
ptrba += temp*4;
ptrbb += temp*1;
#endif
#ifdef LEFT
off += 4; // number of values in A
#endif
C0 = C0+4;
}
if ( bm & 2 ) // do any 2x1 loop
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*2;
ptrbb = bb + off*1;
#endif
res0_0 = 0;
res0_1 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+2; // number of values in A
#else
temp = off+1; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
a0 = ptrba[0];
res0_0 += a0*b0;
a1 = ptrba[1];
res0_1 += a1*b0;
ptrba = ptrba+2;
ptrbb = ptrbb+1;
}
res0_0 *= alpha;
res0_1 *= alpha;
C0[0] = res0_0;
C0[1] = res0_1;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 2; // number of values in A
#else
temp -= 1; // number of values in B
#endif
ptrba += temp*2;
ptrbb += temp*1;
#endif
#ifdef LEFT
off += 2; // number of values in A
#endif
C0 = C0+2;
}
if ( bm & 1 ) // do any 1x1 loop
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*1;
ptrbb = bb + off*1;
#endif
res0_0 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+1; // number of values in A
#else
temp = off+1; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
a0 = ptrba[0];
res0_0 += a0*b0;
ptrba = ptrba+1;
ptrbb = ptrbb+1;
}
res0_0 *= alpha;
C0[0] = res0_0;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 1; // number of values in A
#else
temp -= 1; // number of values in B
#endif
ptrba += temp*1;
ptrbb += temp*1;
#endif
#ifdef LEFT
off += 1; // number of values in A
#endif
C0 = C0+1;
}
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 1;
#endif
k = (bk<<0);
bb = bb+k;
C = C+ldc;
}
return 0;
}