Fixed #456. Merged the optimizations for APM's X-Gene 1 (aarch64).
Merge branch 'benedikt-huber-dave-patch' into develop
This commit is contained in:
Zhang Xianyi 2014-11-11 22:21:04 +08:00
commit 4806715c97
8 changed files with 2442 additions and 16 deletions

View File

@ -117,5 +117,9 @@ In chronological order:
* Isaac Dunham <https://github.com/idunham> * Isaac Dunham <https://github.com/idunham>
* [2014-08-03] Fixed link error on Linux/musl * [2014-08-03] Fixed link error on Linux/musl
* Dave Nuechterlein
* [2014-10-10] trmm and sgemm kernels (optimized for APM's X-Gene 1).
ARMv8 support.
* [Your name or handle] <[email or website]> * [Your name or handle] <[email or website]>
* [Date] [Brief summary of your changes] * [Date] [Brief summary of your changes]

View File

@ -119,9 +119,9 @@ static inline int blas_quickdivide(blasint x, blasint y){
} }
#if defined(DOUBLE) #if defined(DOUBLE)
#define GET_IMAGE(res) __asm__ __volatile__("vstr.f64 d1, %0" : "=m"(res) : : "memory") #define GET_IMAGE(res) __asm__ __volatile__("str d1, %0" : "=m"(res) : : "memory")
#else #else
#define GET_IMAGE(res) __asm__ __volatile__("vstr.f32 s1, %0" : "=m"(res) : : "memory") #define GET_IMAGE(res) __asm__ __volatile__("str s1, %0" : "=m"(res) : : "memory")
#endif #endif
#define GET_IMAGE_CANCEL #define GET_IMAGE_CANCEL
@ -138,7 +138,6 @@ static inline int blas_quickdivide(blasint x, blasint y){
#if defined(ASSEMBLER) && !defined(NEEDPARAM) #if defined(ASSEMBLER) && !defined(NEEDPARAM)
#define PROLOGUE \ #define PROLOGUE \
.arm ;\
.global REALNAME ;\ .global REALNAME ;\
.func REALNAME ;\ .func REALNAME ;\
REALNAME: REALNAME:

217
cpuid_arm64.c Normal file
View File

@ -0,0 +1,217 @@
/**************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <string.h>
/* Core identifiers returned by detect(); each value is also the index of the
 * matching entry in cpuname[], so the two lists must stay in the same order. */
#define CPU_UNKNOWN 0
#define CPU_ARMV8 1

/* Human-readable core names, indexed by the CPU_* identifiers above.
 * The fallback entry is spelled "UNKNOWN", the same string that
 * get_subarchitecture() prints for an unrecognised core. */
static char *cpuname[] = {
	"UNKNOWN",
	"ARMV8"
};
/*
 * Report whether the CPU advertises a given feature flag.
 *
 * On Linux the first line of /proc/cpuinfo that starts with "Features" is
 * split on whitespace and every token is compared with `search`.
 *
 * search: feature name to look for (exact, case-sensitive match).
 *
 * Returns 1 when the feature is listed, 0 otherwise. 0 is also returned when
 * `search` is NULL, when /proc/cpuinfo cannot be opened, when no "Features"
 * line exists, and on non-Linux builds.
 */
int get_feature(char *search)
{
#ifdef linux
	FILE *infile;
	char buffer[2048], *p, *t;

	if (search == NULL) return(0);

	p = (char *) NULL;
	infile = fopen("/proc/cpuinfo", "r");
	/* /proc may be missing (chroot, container): treat as "no features". */
	if (infile == NULL) return(0);

	while (fgets(buffer, sizeof(buffer), infile))
	{
		if (!strncmp("Features", buffer, 8))
		{
			/* Step past the ':' only when one is present; strtok()
			 * below skips the blanks that follow it. */
			p = strchr(buffer, ':');
			if (p != NULL) p++;
			break;
		}
	}
	fclose(infile);

	if (p == NULL) return(0);

	/* Visit every token, including the first one, and strip the trailing
	 * newline left by fgets() so the last feature on the line can match. */
	for (t = strtok(p, " \t\r\n"); t != NULL; t = strtok(NULL, " \t\r\n"))
	{
		if (!strcmp(t, search)) { return(1); }
	}
#endif
	return(0);
}
/*
 * Identify the CPU core from /proc/cpuinfo.
 *
 * On Linux the first line starting with "model name" or "Processor" is
 * located; if the text after its ':' contains "AArch64" the core is reported
 * as CPU_ARMV8.
 *
 * Returns CPU_ARMV8 or CPU_UNKNOWN. CPU_UNKNOWN is also returned when
 * /proc/cpuinfo cannot be opened, when no matching line is found, and on
 * non-Linux builds.
 */
int detect(void)
{
#ifdef linux
	FILE *infile;
	char buffer[512], *p;

	p = (char *) NULL;
	infile = fopen("/proc/cpuinfo", "r");
	/* Without /proc/cpuinfo there is nothing to inspect. */
	if (infile == NULL) return CPU_UNKNOWN;

	while (fgets(buffer, sizeof(buffer), infile))
	{
		if ((!strncmp("model name", buffer, 10)) || (!strncmp("Processor", buffer, 9)))
		{
			/* Search from the ':' itself: it cannot be part of
			 * "AArch64", and this avoids stepping past a short or
			 * colon-less line. */
			p = strchr(buffer, ':');
			break;
		}
	}
	fclose(infile);

	if (p != NULL && strstr(p, "AArch64"))
	{
		return CPU_ARMV8;
	}
#endif
	return CPU_UNKNOWN;
}
/* Return the printable name of the detected core: the cpuname[] entry
 * selected by the identifier that detect() reports. */
char *get_corename(void)
{
	int core_id = detect();

	return cpuname[core_id];
}
/* Write the architecture name ("ARM", no trailing newline) to stdout. */
void get_architecture(void)
{
	fputs("ARM", stdout);
}
/* Write the sub-architecture name to stdout (no trailing newline):
 * "ARMV8" when detect() reports CPU_ARMV8, "UNKNOWN" for anything else. */
void get_subarchitecture(void)
{
	if (detect() == CPU_ARMV8)
	{
		printf("ARMV8");
		return;
	}

	printf("UNKNOWN");
}
/* Write the kernel sub-directory name ("arm64", no trailing newline) to stdout. */
void get_subdirname(void)
{
	fputs("arm64", stdout);
}
/* Write the cache/TLB configuration macros for the detected core to stdout,
 * one "#define" per line. Nothing is printed unless detect() reports
 * CPU_ARMV8. */
void get_cpuconfig(void)
{
	static const char *const armv8_config[] = {
		"#define ARMV8\n",
		"#define L1_DATA_SIZE 32768\n",
		"#define L1_DATA_LINESIZE 64\n",
		"#define L2_SIZE 262144\n",
		"#define L2_LINESIZE 64\n",
		"#define DTB_DEFAULT_ENTRIES 64\n",
		"#define DTB_SIZE 4096\n",
		"#define L2_ASSOCIATIVE 4\n"
	};
	unsigned int line;

	if (detect() != CPU_ARMV8)
	{
		return;
	}

	for (line = 0; line < sizeof(armv8_config) / sizeof(armv8_config[0]); line++)
	{
		fputs(armv8_config[line], stdout);
	}
}
/* Write the library name suffix for the detected core to stdout
 * ("armv8" plus newline). Nothing is printed unless detect() reports
 * CPU_ARMV8. */
void get_libname(void)
{
	if (detect() == CPU_ARMV8)
	{
		printf("armv8\n");
	}
}
/*
 * Walk the feature flags listed in /proc/cpuinfo.
 *
 * On Linux the first line starting with "Features" is tokenised on
 * whitespace. No token is acted upon: the loop body is an empty placeholder,
 * so the function has no observable output. It returns quietly when
 * /proc/cpuinfo cannot be opened or has no "Features" line, and does nothing
 * on non-Linux builds.
 */
void get_features(void)
{
#ifdef linux
	FILE *infile;
	char buffer[2048], *p, *t;

	p = (char *) NULL;
	infile = fopen("/proc/cpuinfo", "r");
	/* /proc may be missing (chroot, container): nothing to enumerate. */
	if (infile == NULL) return;

	while (fgets(buffer, sizeof(buffer), infile))
	{
		if (!strncmp("Features", buffer, 8))
		{
			/* Step past the ':' only when one is present; strtok()
			 * below skips the blanks that follow it. */
			p = strchr(buffer, ':');
			if (p != NULL) p++;
			break;
		}
	}
	fclose(infile);

	if (p == NULL) return;

	for (t = strtok(p, " \t\r\n"); t != NULL; t = strtok(NULL, " \t\r\n"))
	{
		/* Placeholder: no per-feature reporting is implemented. */
	}
#endif
	return;
}

View File

@ -746,12 +746,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SUBARCHITECTURE "ARMV8" #define SUBARCHITECTURE "ARMV8"
#define SUBDIRNAME "arm64" #define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DARMV8 " \ #define ARCHCONFIG "-DARMV8 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 "
"-DHAVE_VFP -DHAVE_VFPV3 -DHAVE_VFPV4"
#define LIBNAME "armv8" #define LIBNAME "armv8"
#define CORENAME "ARMV8" #define CORENAME "XGENE1"
#else #else
#endif #endif
@ -801,6 +800,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define OPENBLAS_SUPPORTED #define OPENBLAS_SUPPORTED
#endif #endif
#ifdef __aarch64__
#include "cpuid_arm64.c"
#define OPENBLAS_SUPPORTED
#endif
#ifndef OPENBLAS_SUPPORTED #ifndef OPENBLAS_SUPPORTED
#error "This arch/CPU is not supported by OpenBLAS." #error "This arch/CPU is not supported by OpenBLAS."
@ -856,7 +860,7 @@ int main(int argc, char *argv[]){
#ifdef FORCE #ifdef FORCE
printf("CORE=%s\n", CORENAME); printf("CORE=%s\n", CORENAME);
#else #else
#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__) #if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__)
printf("CORE=%s\n", get_corename()); printf("CORE=%s\n", get_corename());
#endif #endif
#endif #endif
@ -956,7 +960,7 @@ int main(int argc, char *argv[]){
#ifdef FORCE #ifdef FORCE
printf("#define CHAR_CORENAME \"%s\"\n", CORENAME); printf("#define CHAR_CORENAME \"%s\"\n", CORENAME);
#else #else
#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__) #if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__)
printf("#define CHAR_CORENAME \"%s\"\n", get_corename()); printf("#define CHAR_CORENAME \"%s\"\n", get_corename());
#endif #endif
#endif #endif

View File

@ -80,14 +80,14 @@ DGEMVTKERNEL = ../arm/gemv_t.c
CGEMVTKERNEL = ../arm/zgemv_t.c CGEMVTKERNEL = ../arm/zgemv_t.c
ZGEMVTKERNEL = ../arm/zgemv_t.c ZGEMVTKERNEL = ../arm/zgemv_t.c
STRMMKERNEL = ../generic/trmmkernel_2x2.c STRMMKERNEL = ../generic/trmmkernel_4x4.c
DTRMMKERNEL = ../generic/trmmkernel_2x2.c DTRMMKERNEL = ../generic/trmmkernel_2x2.c
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
SGEMMKERNEL = ../generic/gemmkernel_2x2.c SGEMMKERNEL = sgemm_kernel_4x4.S
SGEMMONCOPY = ../generic/gemm_ncopy_2.c SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,875 @@
#include "common.h"
#include <stdbool.h>
/*
 * Plain-C TRMM micro-kernel with 4x4 register blocking.
 *
 * C is processed in column groups of 4, then 2, then 1 (outer `j` loops) and,
 * inside each group, in row tiles of 4, then 2, then 1. For every tile the
 * kernel accumulates products a*b over `temp` steps of the inner dimension,
 * multiplies the sums by alpha and STORES them to C (previous contents of C
 * are overwritten, not added to).
 *
 * Parameters:
 *   bm, bn  number of rows / columns of C to produce.
 *   bk      full inner dimension; after each column group bb advances by
 *           bk * (group width) values.
 *   alpha   scale factor applied to every accumulated sum.
 *   ba, bb  input panels. Each k step reads (tile height) consecutive values
 *           from ba and (group width) consecutive values from bb --
 *           presumably the packed layout produced by the matching copy
 *           routines (TODO confirm against callers).
 *   C, ldc  output; consecutive columns of a tile are `ldc` elements apart.
 *   offset  initial value of the running offset `off` (negated when LEFT is
 *           not defined). `off` decides how many leading k steps are skipped
 *           or how many k steps are executed for the current tile.
 *
 * Compile-time switches: LEFT, TRANSA and TRMMKERNEL. The 4x4 tile expresses
 * LEFT/TRANSA through the runtime flags `left`, `transposed`, `backwards`;
 * all other tiles use the preprocessor conditions directly.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
{
BLASLONG i,j,k;
FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb;
// resX_Y accumulates the result for column X, row Y of the current tile.
FLOAT res0_0;
FLOAT res0_1;
FLOAT res0_2;
FLOAT res0_3;
FLOAT res1_0;
FLOAT res1_1;
FLOAT res1_2;
FLOAT res1_3;
FLOAT res2_0;
FLOAT res2_1;
FLOAT res2_2;
FLOAT res2_3;
FLOAT res3_0;
FLOAT res3_1;
FLOAT res3_2;
FLOAT res3_3;
FLOAT a0;
FLOAT a1;
FLOAT b0;
FLOAT b1;
FLOAT b2;
FLOAT b3;
BLASLONG off, temp;
bool left;
bool transposed;
bool backwards;
#ifdef LEFT
left = true;
#else
left = false;
#endif
#ifdef TRANSA
transposed = true;
#else
transposed = false;
#endif
// Same condition as (LEFT && !TRANSA) || (!LEFT && TRANSA) used by the
// preprocessor branches further down.
backwards = left != transposed;
if (!left) {
off = -offset;
}
for (j=0; j<bn/4; j+=1) // do blocks of the Mx4 loops
{
C0 = C;
C1 = C0+ldc;
C2 = C1+ldc;
C3 = C2+ldc;
// With LEFT the offset restarts for every column group.
if (left) {
off = offset;
}
ptrba = ba;
for (i=0; i<bm/4; i+=1) // do blocks of 4x4
{
ptrbb = bb;
if (backwards)
{
// Skip the first `off` k steps of both panels.
ptrba += off*4; // number of values in A
ptrbb += off*4; // number of values in B
}
res0_0 = 0;
res0_1 = 0;
res0_2 = 0;
res0_3 = 0;
res1_0 = 0;
res1_1 = 0;
res1_2 = 0;
res1_3 = 0;
res2_0 = 0;
res2_1 = 0;
res2_2 = 0;
res2_3 = 0;
res3_0 = 0;
res3_1 = 0;
res3_2 = 0;
res3_3 = 0;
// Number of k steps for this tile. Both non-backwards arms are off + 4
// because the tile is square (4 values in A and in B).
temp = backwards ? bk-off :
left ? off + 4 : // number of values in A
off + 4; // number of values in B
for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
b2 = ptrbb[2];
b3 = ptrbb[3];
a0 = ptrba[0];
res0_0 += a0*b0;
res1_0 += a0*b1;
res2_0 += a0*b2;
res3_0 += a0*b3;
a1 = ptrba[1];
res0_1 += a1*b0;
res1_1 += a1*b1;
res2_1 += a1*b2;
res3_1 += a1*b3;
a0 = ptrba[2];
res0_2 += a0*b0;
res1_2 += a0*b1;
res2_2 += a0*b2;
res3_2 += a0*b3;
a1 = ptrba[3];
res0_3 += a1*b0;
res1_3 += a1*b1;
res2_3 += a1*b2;
res3_3 += a1*b3;
ptrba = ptrba+4;
ptrbb = ptrbb+4;
}
res0_0 *= alpha;
res0_1 *= alpha;
res0_2 *= alpha;
res0_3 *= alpha;
res1_0 *= alpha;
res1_1 *= alpha;
res1_2 *= alpha;
res1_3 *= alpha;
res2_0 *= alpha;
res2_1 *= alpha;
res2_2 *= alpha;
res2_3 *= alpha;
res3_0 *= alpha;
res3_1 *= alpha;
res3_2 *= alpha;
res3_3 *= alpha;
C0[0] = res0_0;
C0[1] = res0_1;
C0[2] = res0_2;
C0[3] = res0_3;
C1[0] = res1_0;
C1[1] = res1_1;
C1[2] = res1_2;
C1[3] = res1_3;
C2[0] = res2_0;
C2[1] = res2_1;
C2[2] = res2_2;
C2[3] = res2_3;
C3[0] = res3_0;
C3[1] = res3_1;
C3[2] = res3_2;
C3[3] = res3_3;
if (!backwards) {
// Step over the k steps that were not executed so ptrba ends up at the
// start of the next row tile.
temp = bk-off;
temp = left ? temp - 4 : // number of values in A
temp - 4; // number of values in B
ptrba += temp*4; // number of values in A
ptrbb += temp*4; // number of values in B
}
#ifdef LEFT
off += 4; // number of values in A
#endif
C0 = C0+4;
C1 = C1+4;
C2 = C2+4;
C3 = C3+4;
}
if ( bm & 2 ) // do any 2x4 loop
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*2;
ptrbb = bb + off*4;
#endif
res0_0 = 0;
res0_1 = 0;
res1_0 = 0;
res1_1 = 0;
res2_0 = 0;
res2_1 = 0;
res3_0 = 0;
res3_1 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+2; // number of values in A
#else
temp = off+4; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
b2 = ptrbb[2];
b3 = ptrbb[3];
a0 = ptrba[0];
res0_0 += a0*b0;
res1_0 += a0*b1;
res2_0 += a0*b2;
res3_0 += a0*b3;
a1 = ptrba[1];
res0_1 += a1*b0;
res1_1 += a1*b1;
res2_1 += a1*b2;
res3_1 += a1*b3;
ptrba = ptrba+2;
ptrbb = ptrbb+4;
}
res0_0 *= alpha;
res0_1 *= alpha;
res1_0 *= alpha;
res1_1 *= alpha;
res2_0 *= alpha;
res2_1 *= alpha;
res3_0 *= alpha;
res3_1 *= alpha;
C0[0] = res0_0;
C0[1] = res0_1;
C1[0] = res1_0;
C1[1] = res1_1;
C2[0] = res2_0;
C2[1] = res2_1;
C3[0] = res3_0;
C3[1] = res3_1;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 2; // number of values in A
#else
temp -= 4; // number of values in B
#endif
ptrba += temp*2;
ptrbb += temp*4;
#endif
#ifdef LEFT
off += 2; // number of values in A
#endif
C0 = C0+2;
C1 = C1+2;
C2 = C2+2;
C3 = C3+2;
}
if ( bm & 1 ) // do any 1x4 loop
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*1;
ptrbb = bb + off*4;
#endif
res0_0 = 0;
res1_0 = 0;
res2_0 = 0;
res3_0 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+1; // number of values in A
#else
temp = off+4; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
b2 = ptrbb[2];
b3 = ptrbb[3];
a0 = ptrba[0];
res0_0 += a0*b0;
res1_0 += a0*b1;
res2_0 += a0*b2;
res3_0 += a0*b3;
ptrba = ptrba+1;
ptrbb = ptrbb+4;
}
res0_0 *= alpha;
res1_0 *= alpha;
res2_0 *= alpha;
res3_0 *= alpha;
C0[0] = res0_0;
C1[0] = res1_0;
C2[0] = res2_0;
C3[0] = res3_0;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 1; // number of values in A
#else
temp -= 4; // number of values in B
#endif
ptrba += temp*1;
ptrbb += temp*4;
#endif
#ifdef LEFT
off += 1; // number of values in A
#endif
C0 = C0+1;
C1 = C1+1;
C2 = C2+1;
C3 = C3+1;
}
// NOTE(review): from here on the `off` bookkeeping between column groups is
// additionally guarded by TRMMKERNEL, unlike the runtime `left` checks at
// the top of the Mx4 loop -- presumably this file is always built with
// TRMMKERNEL defined; confirm against the build rules.
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 4;
#endif
// Advance to the next group of 4 columns in bb and C.
k = (bk<<2);
bb = bb+k;
i = (ldc<<2);
C = C+i;
}
for (j=0; j<(bn&2); j+=2) // do the Mx2 loops
{
C0 = C;
C1 = C0+ldc;
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
ptrba = ba;
for (i=0; i<bm/4; i+=1) // do blocks of 4x2
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*4;
ptrbb = bb + off*2;
#endif
res0_0 = 0;
res0_1 = 0;
res0_2 = 0;
res0_3 = 0;
res1_0 = 0;
res1_1 = 0;
res1_2 = 0;
res1_3 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+4; // number of values in A
#else
temp = off+2; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
a0 = ptrba[0];
res0_0 += a0*b0;
res1_0 += a0*b1;
a1 = ptrba[1];
res0_1 += a1*b0;
res1_1 += a1*b1;
a0 = ptrba[2];
res0_2 += a0*b0;
res1_2 += a0*b1;
a1 = ptrba[3];
res0_3 += a1*b0;
res1_3 += a1*b1;
ptrba = ptrba+4;
ptrbb = ptrbb+2;
}
res0_0 *= alpha;
res0_1 *= alpha;
res0_2 *= alpha;
res0_3 *= alpha;
res1_0 *= alpha;
res1_1 *= alpha;
res1_2 *= alpha;
res1_3 *= alpha;
C0[0] = res0_0;
C0[1] = res0_1;
C0[2] = res0_2;
C0[3] = res0_3;
C1[0] = res1_0;
C1[1] = res1_1;
C1[2] = res1_2;
C1[3] = res1_3;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 4; // number of values in A
#else
temp -= 2; // number of values in B
#endif
ptrba += temp*4;
ptrbb += temp*2;
#endif
#ifdef LEFT
off += 4; // number of values in A
#endif
C0 = C0+4;
C1 = C1+4;
}
if ( bm & 2 ) // do any 2x2 loop
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*2;
ptrbb = bb + off*2;
#endif
res0_0 = 0;
res0_1 = 0;
res1_0 = 0;
res1_1 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+2; // number of values in A
#else
temp = off+2; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
a0 = ptrba[0];
res0_0 += a0*b0;
res1_0 += a0*b1;
a1 = ptrba[1];
res0_1 += a1*b0;
res1_1 += a1*b1;
ptrba = ptrba+2;
ptrbb = ptrbb+2;
}
res0_0 *= alpha;
res0_1 *= alpha;
res1_0 *= alpha;
res1_1 *= alpha;
C0[0] = res0_0;
C0[1] = res0_1;
C1[0] = res1_0;
C1[1] = res1_1;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 2; // number of values in A
#else
temp -= 2; // number of values in B
#endif
ptrba += temp*2;
ptrbb += temp*2;
#endif
#ifdef LEFT
off += 2; // number of values in A
#endif
C0 = C0+2;
C1 = C1+2;
}
if ( bm & 1 ) // do any 1x2 loop
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*1;
ptrbb = bb + off*2;
#endif
res0_0 = 0;
res1_0 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+1; // number of values in A
#else
temp = off+2; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
a0 = ptrba[0];
res0_0 += a0*b0;
res1_0 += a0*b1;
ptrba = ptrba+1;
ptrbb = ptrbb+2;
}
res0_0 *= alpha;
res1_0 *= alpha;
C0[0] = res0_0;
C1[0] = res1_0;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 1; // number of values in A
#else
temp -= 2; // number of values in B
#endif
ptrba += temp*1;
ptrbb += temp*2;
#endif
#ifdef LEFT
off += 1; // number of values in A
#endif
C0 = C0+1;
C1 = C1+1;
}
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 2;
#endif
// Advance to the next group of 2 columns in bb and C.
k = (bk<<1);
bb = bb+k;
i = (ldc<<1);
C = C+i;
}
for (j=0; j<(bn&1); j+=1) // do the Mx1 loops
{
C0 = C;
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
ptrba = ba;
for (i=0; i<bm/4; i+=1) // do blocks of 4x1 loops
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*4;
ptrbb = bb + off*1;
#endif
res0_0 = 0;
res0_1 = 0;
res0_2 = 0;
res0_3 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+4; // number of values in A
#else
temp = off+1; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
a0 = ptrba[0];
res0_0 += a0*b0;
a1 = ptrba[1];
res0_1 += a1*b0;
a0 = ptrba[2];
res0_2 += a0*b0;
a1 = ptrba[3];
res0_3 += a1*b0;
ptrba = ptrba+4;
ptrbb = ptrbb+1;
}
res0_0 *= alpha;
res0_1 *= alpha;
res0_2 *= alpha;
res0_3 *= alpha;
C0[0] = res0_0;
C0[1] = res0_1;
C0[2] = res0_2;
C0[3] = res0_3;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 4; // number of values in A
#else
temp -= 1; // number of values in B
#endif
ptrba += temp*4;
ptrbb += temp*1;
#endif
#ifdef LEFT
off += 4; // number of values in A
#endif
C0 = C0+4;
}
if ( bm & 2 ) // do any 2x1 loop
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*2;
ptrbb = bb + off*1;
#endif
res0_0 = 0;
res0_1 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+2; // number of values in A
#else
temp = off+1; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
a0 = ptrba[0];
res0_0 += a0*b0;
a1 = ptrba[1];
res0_1 += a1*b0;
ptrba = ptrba+2;
ptrbb = ptrbb+1;
}
res0_0 *= alpha;
res0_1 *= alpha;
C0[0] = res0_0;
C0[1] = res0_1;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 2; // number of values in A
#else
temp -= 1; // number of values in B
#endif
ptrba += temp*2;
ptrbb += temp*1;
#endif
#ifdef LEFT
off += 2; // number of values in A
#endif
C0 = C0+2;
}
if ( bm & 1 ) // do any 1x1 loop
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*1;
ptrbb = bb + off*1;
#endif
res0_0 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+1; // number of values in A
#else
temp = off+1; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
a0 = ptrba[0];
res0_0 += a0*b0;
ptrba = ptrba+1;
ptrbb = ptrbb+1;
}
res0_0 *= alpha;
C0[0] = res0_0;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 1; // number of values in A
#else
temp -= 1; // number of values in B
#endif
ptrba += temp*1;
ptrbb += temp*1;
#endif
#ifdef LEFT
off += 1; // number of values in A
#endif
C0 = C0+1;
}
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 1;
#endif
// Advance past the single column just processed in bb and C.
k = (bk<<0);
bb = bb+k;
C = C+ldc;
}
return 0;
}

View File

@ -2039,8 +2039,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL #define GEMM_DEFAULT_ALIGN 0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_M 4
#define SGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_M 2 #define DGEMM_DEFAULT_UNROLL_M 2
#define DGEMM_DEFAULT_UNROLL_N 2 #define DGEMM_DEFAULT_UNROLL_N 2