Merge pull request #924 from ashwinyes/develop_aarch64_improvements_20160714

Improvements to Aarch64 kernels
This commit is contained in:
Zhang Xianyi 2016-07-14 15:47:55 -04:00 committed by GitHub
commit 8a592ee386
28 changed files with 3451 additions and 2043 deletions

View File

@ -173,7 +173,9 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \
ssymm.goto dsymm.goto csymm.goto zsymm.goto \
smallscaling
smallscaling \
isamax.goto idamax.goto icamax.goto izamax.goto \
snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto
acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
@ -226,7 +228,9 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \
sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \
sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \
spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \
ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas
ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \
isamax.atlas idamax.atlas icamax.atlas izamax.atlas \
snrm2.atlas dnrm2.atlas scnrm2.atlas dznrm2.atlas
mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \
@ -1937,6 +1941,63 @@ zgemm3m.mkl : zgemm3m.$(SUFFIX)
# Link the zgemm3m benchmark against the vecLib BLAS ($(LIBVECLIB));
# the '-' recipe prefix keeps the build going if this link fails.
zgemm3m.veclib : zgemm3m.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
############################################## ISAMAX ##############################################
# i?amax benchmark link rules: the .goto targets link against the locally
# built library (../$(LIBNAME)); the .atlas targets link against ATLAS,
# with a '-' recipe prefix so a failed link does not abort the build.
isamax.goto : isamax.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
isamax.atlas : isamax.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
############################################## IDAMAX ##############################################
idamax.goto : idamax.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
idamax.atlas : idamax.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
############################################## ICAMAX ##############################################
icamax.goto : icamax.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
icamax.atlas : icamax.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
############################################## IZAMAX ##############################################
izamax.goto : izamax.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
izamax.atlas : izamax.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
############################################## SNRM2 ##############################################
# ?nrm2 benchmark link rules: .goto links the locally built library,
# .atlas links ATLAS ('-' prefix: ignore a failed link).
snrm2.goto : snrm2.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
snrm2.atlas : snrm2.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
############################################## DNRM2 ##############################################
dnrm2.goto : dnrm2.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
dnrm2.atlas : dnrm2.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
############################################## SCNRM2 ##############################################
scnrm2.goto : scnrm2.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
scnrm2.atlas : scnrm2.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
############################################## DZNRM2 ##############################################
dznrm2.goto : dznrm2.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
dznrm2.atlas : dznrm2.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
###################################################################################################
slinpack.$(SUFFIX) : linpack.c
@ -2243,6 +2304,33 @@ cgemm3m.$(SUFFIX) : gemm3m.c
zgemm3m.$(SUFFIX) : gemm3m.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
# Compile the four precision variants of each benchmark from one shared
# source file; -D/-UCOMPLEX and -D/-UDOUBLE select the s/d/c/z flavour.
isamax.$(SUFFIX) : iamax.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
idamax.$(SUFFIX) : iamax.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
icamax.$(SUFFIX) : iamax.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
izamax.$(SUFFIX) : iamax.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
snrm2.$(SUFFIX) : nrm2.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
dnrm2.$(SUFFIX) : nrm2.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
scnrm2.$(SUFFIX) : nrm2.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
dznrm2.$(SUFFIX) : nrm2.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
# smallscaling compiles and links in one step against the freshly built
# library, with OpenMP and pthread support enabled.
smallscaling: smallscaling.c ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm -lpthread

View File

@ -183,9 +183,9 @@ int main(int argc, char *argv[]){
timeg /= loops;
#ifdef COMPLEX
fprintf(stderr, " %10.2f MFlops\n", 4. * (double)m / timeg * 1.e-6);
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 4. * (double)m / timeg * 1.e-6, timeg);
#else
fprintf(stderr, " %10.2f MFlops\n", 2. * (double)m / timeg * 1.e-6);
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 2. * (double)m / timeg * 1.e-6, timeg);
#endif
}

View File

@ -190,8 +190,8 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg);
}

View File

@ -190,8 +190,8 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr,
" %10.2f MBytes\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6);
" %10.2f MBytes %10.6f sec\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
}

View File

@ -184,8 +184,8 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg);
}

View File

@ -221,7 +221,7 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6);
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6, timeg);
}
}
@ -258,7 +258,7 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6);
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6, timeg);
}
}

190
benchmark/iamax.c Normal file
View File

@ -0,0 +1,190 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef IAMAX
#ifdef COMPLEX
#ifdef DOUBLE
#define IAMAX BLASFUNC(izamax)
#else
#define IAMAX BLASFUNC(icamax)
#endif
#else
#ifdef DOUBLE
#define IAMAX BLASFUNC(idamax)
#else
#define IAMAX BLASFUNC(isamax)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
/*
 * Allocate 'size' bytes backed by huge pages via a SysV shared-memory
 * segment.  (Compiled out here: the surrounding #if condition ends in
 * '&& 0'.)  Exits the process on failure.  The segment is marked for
 * removal immediately, so it is reclaimed once the process detaches.
 */
static void *huge_malloc(BLASLONG size){
  int seg_id;
  void *mem;

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

  /* Round the request up to a whole number of huge pages. */
  BLASLONG rounded = (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1);

  seg_id = shmget(IPC_PRIVATE, rounded, SHM_HUGETLB | IPC_CREAT | 0600);
  if (seg_id < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mem = shmat(seg_id, NULL, SHM_RND);
  if ((BLASLONG)mem == -1) {
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  /* Mark for deletion now; memory stays mapped until detach/exit. */
  shmctl(seg_id, IPC_RMID, 0);

  return mem;
}
#define malloc huge_malloc
#endif
/*
 * Benchmark driver for the BLAS i?amax routines (index of max |x[i]|).
 *
 * Usage: isamax/idamax/icamax/izamax [from [to [step]]]
 *   Times vector lengths from..to (inclusive) in increments of step.
 * Environment:
 *   OPENBLAS_LOOPS  repetitions per size; timings are averaged
 *   OPENBLAS_INCX   stride of the input vector
 * Average wall-clock seconds per call are printed to stderr.
 */
int main(int argc, char *argv[]){

  FLOAT *x;
  blasint m, i;
  blasint inc_x=1;
  int loops = 1;
  int l;
  char *p;

  int from =   1;   /* smallest vector length tested */
  int to   = 200;   /* largest vector length tested  */
  int step =   1;

  struct timeval start, stop;
  double time1, timeg;

  argc--;argv++;

  if (argc > 0) { from = atol(*argv);            argc--; argv++; }
  if (argc > 0) { to   = MAX(atol(*argv), from); argc--; argv++; }
  if (argc > 0) { step = atol(*argv);            argc--; argv++; }

  if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
  if ((p = getenv("OPENBLAS_INCX")))  inc_x = atoi(p);

  fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);

  /* Buffer sized for the largest test, honouring stride and the complex
     element width (COMPSIZE). */
  if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

#ifdef linux
  srandom(getpid());   /* NOTE(review): seeds random(), yet rand() is used below */
#endif

  fprintf(stderr, " SIZE Time\n");

  for(m = from; m <= to; m += step)
  {
    timeg=0;

    fprintf(stderr, " %6d : ", (int)m);

    for (l=0; l<loops; l++)
    {
      /* Refill with fresh values in [-0.5, 0.5) before every timed call. */
      for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
        x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
      }

      /* Time exactly one IAMAX call. */
      gettimeofday( &start, (struct timezone *)0);
      IAMAX (&m, x, &inc_x);
      gettimeofday( &stop, (struct timezone *)0);

      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg += time1;
    }

    timeg /= loops;
    fprintf(stderr, " %10.6f secs\n", timeg);
  }

  free(x);   /* fix: the benchmark buffer was previously leaked */

  return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

190
benchmark/nrm2.c Normal file
View File

@ -0,0 +1,190 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef NRM2
#ifdef COMPLEX
#ifdef DOUBLE
#define NRM2 BLASFUNC(dznrm2)
#else
#define NRM2 BLASFUNC(scnrm2)
#endif
#else
#ifdef DOUBLE
#define NRM2 BLASFUNC(dnrm2)
#else
#define NRM2 BLASFUNC(snrm2)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
/*
 * Allocate 'size' bytes backed by huge pages via a SysV shared-memory
 * segment.  (Compiled out here: the surrounding #if condition ends in
 * '&& 0'.)  Exits the process on failure.  The segment is marked for
 * removal immediately, so it is reclaimed once the process detaches.
 */
static void *huge_malloc(BLASLONG size){
  int seg_id;
  void *mem;

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

  /* Round the request up to a whole number of huge pages. */
  BLASLONG rounded = (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1);

  seg_id = shmget(IPC_PRIVATE, rounded, SHM_HUGETLB | IPC_CREAT | 0600);
  if (seg_id < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mem = shmat(seg_id, NULL, SHM_RND);
  if ((BLASLONG)mem == -1) {
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  /* Mark for deletion now; memory stays mapped until detach/exit. */
  shmctl(seg_id, IPC_RMID, 0);

  return mem;
}
#define malloc huge_malloc
#endif
/*
 * Benchmark driver for the BLAS ?nrm2 routines (Euclidean norm).
 *
 * Usage: snrm2/dnrm2/scnrm2/dznrm2 [from [to [step]]]
 *   Times vector lengths from..to (inclusive) in increments of step.
 * Environment:
 *   OPENBLAS_LOOPS  repetitions per size; timings are averaged
 *   OPENBLAS_INCX   stride of the input vector
 * Average wall-clock seconds per call are printed to stderr.
 */
int main(int argc, char *argv[]){

  FLOAT *x;
  blasint m, i;
  blasint inc_x=1;
  int loops = 1;
  int l;
  char *p;

  int from =   1;   /* smallest vector length tested */
  int to   = 200;   /* largest vector length tested  */
  int step =   1;

  struct timeval start, stop;
  double time1, timeg;

  argc--;argv++;

  if (argc > 0) { from = atol(*argv);            argc--; argv++; }
  if (argc > 0) { to   = MAX(atol(*argv), from); argc--; argv++; }
  if (argc > 0) { step = atol(*argv);            argc--; argv++; }

  if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
  if ((p = getenv("OPENBLAS_INCX")))  inc_x = atoi(p);

  fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);

  /* Buffer sized for the largest test, honouring stride and the complex
     element width (COMPSIZE). */
  if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

#ifdef linux
  srandom(getpid());   /* NOTE(review): seeds random(), yet rand() is used below */
#endif

  fprintf(stderr, " SIZE Time\n");

  for(m = from; m <= to; m += step)
  {
    timeg=0;

    fprintf(stderr, " %6d : ", (int)m);

    for (l=0; l<loops; l++)
    {
      /* Refill with fresh values in [-0.5, 0.5) before every timed call. */
      for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
        x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
      }

      /* Time exactly one NRM2 call. */
      gettimeofday( &start, (struct timezone *)0);
      NRM2 (&m, x, &inc_x);
      gettimeofday( &stop, (struct timezone *)0);

      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg += time1;
    }

    timeg /= loops;
    fprintf(stderr, " %10.6f secs\n", timeg);
  }

  free(x);   /* fix: the benchmark buffer was previously leaked */

  return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -186,8 +186,8 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6);
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6, timeg);
}

View File

@ -189,9 +189,9 @@ int main(int argc, char *argv[]){
timeg /= loops;
#ifdef COMPLEX
fprintf(stderr, " %10.2f MFlops\n", 6. * (double)m / timeg * 1.e-6);
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 6. * (double)m / timeg * 1.e-6, timeg);
#else
fprintf(stderr, " %10.2f MFlops\n", 1. * (double)m / timeg * 1.e-6);
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 1. * (double)m / timeg * 1.e-6, timeg);
#endif
}

View File

@ -190,8 +190,8 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr,
" %10.2f MBytes\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6);
" %10.2f MBytes %10.6f sec\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
}

View File

@ -191,8 +191,8 @@ int main(int argc, char *argv[]){
gettimeofday( &start, (struct timezone *)0);
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6);
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6, time1);
}

View File

@ -184,8 +184,8 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg);
}

File diff suppressed because it is too large Load Diff

View File

@ -58,43 +58,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
str TMPF, [Y], #SZ
#else
#if !defined(DOUBLE)
ld1 {v0.2s}, [X], #8
st1 {v0.2s}, [Y], #8
ldr d0, [X], #8
str d0, [Y], #8
#else
ld1 {v0.2d}, [X], #16
st1 {v0.2d}, [Y], #16
ldr q0, [X], #16
str q0, [Y], #16
#endif
#endif
.endm
.macro KERNEL_F4
#if !defined(COMPLEX)
#if !defined(DOUBLE)
ld1 {v0.4s}, [X], #16
st1 {v0.4s}, [Y], #16
ldr q0, [X], #16
str q0, [Y], #16
#else // DOUBLE
ld1 {v0.4s}, [X], #16
ld1 {v1.4s}, [X], #16
st1 {v0.4s}, [Y], #16
st1 {v1.4s}, [Y], #16
ldr q0, [X], #16
str q0, [Y], #16
ldr q1, [X], #16
str q1, [Y], #16
#endif
#else // COMPLEX
#if !defined(DOUBLE)
ld1 {v0.4s}, [X], #16
ld1 {v1.4s}, [X], #16
st1 {v0.4s}, [Y], #16
st1 {v1.4s}, [Y], #16
ldr q0, [X], #16
str q0, [Y], #16
ldr q1, [X], #16
str q1, [Y], #16
#else // DOUBLE
ld1 {v0.4s}, [X], #16
ld1 {v1.4s}, [X], #16
ld1 {v2.4s}, [X], #16
ld1 {v3.4s}, [X], #16
st1 {v0.4s}, [Y], #16
st1 {v1.4s}, [Y], #16
st1 {v2.4s}, [Y], #16
st1 {v3.4s}, [Y], #16
ldr q0, [X], #16
str q0, [Y], #16
ldr q1, [X], #16
str q1, [Y], #16
ldr q2, [X], #16
str q2, [Y], #16
ldr q3, [X], #16
str q3, [Y], #16
#endif
#endif

File diff suppressed because it is too large Load Diff

View File

@ -46,19 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow0 x12
#define pCRow1 x13
#define pCRow2 x14
#define pA x15
#define temp x16
#define tempOffset x17
#define tempK x18
#define pCRow3 x15
#define pA x16
#define alpha x17
#define temp x18
#define tempOffset x19
#define tempK x20
#define alpha0 d10
#define alphaV0 v10.d[0]
#define alpha1 d11
#define alphaV1 v11.d[0]
#define alpha2 d14
#define alphaV2 v14.d[0]
#define alpha3 d15
#define alphaV3 v15.d[0]
#define A_PRE_SIZE 2560
#define B_PRE_SIZE 448
#define C_PRE_SIZE 128
// 00 origM
// 01 origN
@ -101,14 +101,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//v05 pA1_2, pA1_3
//v06 pA1_4, pA1_5
//v07 pA1_6, pA1_7
//v08 must save pB0_0, pB0_1
//v09 must save pB0_2, pB0_3
//v10 must save ALPHA0
//v11 must save ALPHA1
//v12 must save pB1_0, pB1_1
//v13 must save pB1_2, pB1_3
//v14 must save ALPHA2
//v15 must save ALPHA3
//v08 must save pB0_0
//v09 must save pB0_1
//v10 must save pB0_2 --> ALPHA0
//v11 must save pB0_3
//v12 must save pB1_0
//v13 must save pB1_1
//v14 must save pB1_2
//v15 must save pB1_3
//v16 must save C00, C01
//v17 must save C02, C03
//v18 C04, C05
@ -150,186 +150,249 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_I
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
ld1 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
ldp q0, q1, [pA], #32
ldp d8, d9, [pB], #16
fmul v16.2d, v0.2d, v8.d[0]
fmul v20.2d, v0.2d, v9.d[0]
ldp d10, d11, [pB], #16
fmul v17.2d, v1.2d, v8.d[0]
fmul v21.2d, v1.2d, v9.d[0]
ldp q2, q3, [pA], #32
fmul v24.2d, v0.2d, v10.d[0]
fmul v28.2d, v0.2d, v11.d[0]
ldp q4, q5, [pA], #32
fmul v25.2d, v1.2d, v10.d[0]
fmul v29.2d, v1.2d, v11.d[0]
ldp d12, d13, [pB], #16
fmul v18.2d, v2.2d, v8.d[0]
fmul v22.2d, v2.2d, v9.d[0]
ldp d14, d15, [pB], #16
fmul v26.2d, v2.2d, v10.d[0]
fmul v30.2d, v2.2d, v11.d[0]
ldp q6, q7, [pA], #32
fmul v19.2d, v3.2d, v8.d[0]
fmul v27.2d, v3.2d, v10.d[0]
fmul v20.2d, v0.2d, v8.d[1]
fmul v21.2d, v1.2d, v8.d[1]
fmul v22.2d, v2.2d, v8.d[1]
fmul v23.2d, v3.2d, v8.d[1]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmul v24.2d, v0.2d, v9.d[0]
fmul v25.2d, v1.2d, v9.d[0]
fmul v26.2d, v2.2d, v9.d[0]
fmul v27.2d, v3.2d, v9.d[0]
fmul v31.2d, v3.2d, v11.d[0]
fmul v23.2d, v3.2d, v9.d[0]
fmul v28.2d, v0.2d, v9.d[1]
fmul v29.2d, v1.2d, v9.d[1]
fmul v30.2d, v2.2d, v9.d[1]
fmul v31.2d, v3.2d, v9.d[1]
ld1 {v4.2d, v5.2d}, [pA]
add pA, pA, #32
ld1 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
ld1 {v6.2d, v7.2d}, [pA]
add pA, pA, #32
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm
.macro KERNEL8x4_M1
fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v9.d[0]
ldp q4, q5, [pA], #32
fmla v24.2d, v0.2d, v10.d[0]
fmla v28.2d, v0.2d, v11.d[0]
ldp d12, d13, [pB], #16
fmla v17.2d, v1.2d, v8.d[0]
fmla v25.2d, v1.2d, v10.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
fmla v21.2d, v1.2d, v9.d[0]
fmla v29.2d, v1.2d, v11.d[0]
ldp d14, d15, [pB], #16
fmla v18.2d, v2.2d, v8.d[0]
fmla v22.2d, v2.2d, v9.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla v26.2d, v2.2d, v10.d[0]
fmla v30.2d, v2.2d, v11.d[0]
fmla v19.2d, v3.2d, v8.d[0]
fmla v23.2d, v3.2d, v9.d[0]
fmla v20.2d, v0.2d, v8.d[1]
fmla v21.2d, v1.2d, v8.d[1]
fmla v22.2d, v2.2d, v8.d[1]
fmla v23.2d, v3.2d, v8.d[1]
ldp q6, q7, [pA], #32
fmla v24.2d, v0.2d, v9.d[0]
fmla v25.2d, v1.2d, v9.d[0]
fmla v26.2d, v2.2d, v9.d[0]
fmla v27.2d, v3.2d, v9.d[0]
fmla v28.2d, v0.2d, v9.d[1]
fmla v29.2d, v1.2d, v9.d[1]
fmla v30.2d, v2.2d, v9.d[1]
fmla v31.2d, v3.2d, v9.d[1]
ld1 {v4.2d, v5.2d}, [pA]
add pA, pA, #32
ld1 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
ld1 {v6.2d, v7.2d}, [pA]
add pA, pA, #32
prfm PLDL1KEEP, [pA, #512]
fmla v27.2d, v3.2d, v10.d[0]
fmla v31.2d, v3.2d, v11.d[0]
.endm
.macro KERNEL8x4_M2
fmla v16.2d, v4.2d, v12.d[0]
fmla v20.2d, v4.2d, v13.d[0]
fmla v24.2d, v4.2d, v14.d[0]
fmla v28.2d, v4.2d, v15.d[0]
ldp q0, q1, [pA], #32
fmla v17.2d, v5.2d, v12.d[0]
fmla v25.2d, v5.2d, v14.d[0]
ldp d8, d9, [pB], #16
fmla v21.2d, v5.2d, v13.d[0]
fmla v29.2d, v5.2d, v15.d[0]
ldp d10, d11, [pB], #16
fmla v18.2d, v6.2d, v12.d[0]
fmla v22.2d, v6.2d, v13.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla v26.2d, v6.2d, v14.d[0]
fmla v30.2d, v6.2d, v15.d[0]
fmla v19.2d, v7.2d, v12.d[0]
fmla v23.2d, v7.2d, v13.d[0]
fmla v20.2d, v4.2d, v12.d[1]
fmla v21.2d, v5.2d, v12.d[1]
fmla v22.2d, v6.2d, v12.d[1]
fmla v23.2d, v7.2d, v12.d[1]
ldp q2, q3, [pA], #32
fmla v24.2d, v4.2d, v13.d[0]
fmla v25.2d, v5.2d, v13.d[0]
fmla v26.2d, v6.2d, v13.d[0]
fmla v27.2d, v7.2d, v13.d[0]
fmla v28.2d, v4.2d, v13.d[1]
fmla v29.2d, v5.2d, v13.d[1]
fmla v30.2d, v6.2d, v13.d[1]
fmla v31.2d, v7.2d, v13.d[1]
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
ld1 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
prfm PLDL1KEEP, [pB, #512]
fmla v27.2d, v7.2d, v14.d[0]
fmla v31.2d, v7.2d, v15.d[0]
.endm
.macro KERNEL8x4_E
fmla v16.2d, v4.2d, v12.d[0]
fmla v20.2d, v4.2d, v13.d[0]
fmla v24.2d, v4.2d, v14.d[0]
fmla v28.2d, v4.2d, v15.d[0]
fmla v17.2d, v5.2d, v12.d[0]
fmla v25.2d, v5.2d, v14.d[0]
fmla v21.2d, v5.2d, v13.d[0]
fmla v29.2d, v5.2d, v15.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla v18.2d, v6.2d, v12.d[0]
fmla v22.2d, v6.2d, v13.d[0]
fmla v26.2d, v6.2d, v14.d[0]
fmla v30.2d, v6.2d, v15.d[0]
fmla v19.2d, v7.2d, v12.d[0]
fmla v20.2d, v4.2d, v12.d[1]
fmla v21.2d, v5.2d, v12.d[1]
fmla v22.2d, v6.2d, v12.d[1]
fmla v23.2d, v7.2d, v12.d[1]
fmla v24.2d, v4.2d, v13.d[0]
fmla v25.2d, v5.2d, v13.d[0]
fmla v26.2d, v6.2d, v13.d[0]
fmla v27.2d, v7.2d, v13.d[0]
fmla v28.2d, v4.2d, v13.d[1]
fmla v29.2d, v5.2d, v13.d[1]
fmla v30.2d, v6.2d, v13.d[1]
fmla v31.2d, v7.2d, v13.d[1]
fmla v23.2d, v7.2d, v13.d[0]
fmla v27.2d, v7.2d, v14.d[0]
fmla v31.2d, v7.2d, v15.d[0]
.endm
.macro KERNEL8x4_SUB
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
ld1 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
ldp q0, q1, [pA], #32
ldp d8, d9, [pB], #16
fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v9.d[0]
ldp d10, d11, [pB], #16
fmla v17.2d, v1.2d, v8.d[0]
fmla v21.2d, v1.2d, v9.d[0]
ldp q2, q3, [pA], #32
fmla v24.2d, v0.2d, v10.d[0]
fmla v28.2d, v0.2d, v11.d[0]
fmla v25.2d, v1.2d, v10.d[0]
fmla v29.2d, v1.2d, v11.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla v18.2d, v2.2d, v8.d[0]
fmla v22.2d, v2.2d, v9.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
fmla v26.2d, v2.2d, v10.d[0]
fmla v30.2d, v2.2d, v11.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla v19.2d, v3.2d, v8.d[0]
fmla v27.2d, v3.2d, v10.d[0]
fmla v20.2d, v0.2d, v8.d[1]
fmla v21.2d, v1.2d, v8.d[1]
fmla v22.2d, v2.2d, v8.d[1]
fmla v23.2d, v3.2d, v8.d[1]
fmla v24.2d, v0.2d, v9.d[0]
fmla v25.2d, v1.2d, v9.d[0]
fmla v26.2d, v2.2d, v9.d[0]
fmla v27.2d, v3.2d, v9.d[0]
fmla v28.2d, v0.2d, v9.d[1]
fmla v29.2d, v1.2d, v9.d[1]
fmla v30.2d, v2.2d, v9.d[1]
fmla v31.2d, v3.2d, v9.d[1]
fmla v31.2d, v3.2d, v11.d[0]
fmla v23.2d, v3.2d, v9.d[0]
.endm
.macro SAVE8x4
add pCRow1, pCRow0, LDC
fmov alpha0, alpha
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
fmul v0.2d, v16.2d, alphaV0
fmul v1.2d, v17.2d, alphaV1
fmul v2.2d, v18.2d, alphaV2
fmul v3.2d, v19.2d, alphaV3
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
fmul v1.2d, v17.2d, alphaV0
stp q0, q1, [pCRow0]
add pCRow2, pCRow1, LDC
add pCRow0, pCRow0, #32
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
fmul v2.2d, v18.2d, alphaV0
fmul v3.2d, v19.2d, alphaV0
stp q2, q3, [pCRow0]
add pCRow0, pCRow0, #32
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
fmul v4.2d, v20.2d, alphaV0
fmul v5.2d, v21.2d, alphaV1
fmul v6.2d, v22.2d, alphaV2
fmul v7.2d, v23.2d, alphaV3
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
fmul v5.2d, v21.2d, alphaV0
stp q4, q5, [pCRow1]
add pCRow1, pCRow2, LDC
add pCRow1, pCRow1, #32
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
fmul v6.2d, v22.2d, alphaV0
fmul v7.2d, v23.2d, alphaV0
stp q6, q7, [pCRow1]
add pCRow1, pCRow1, #32
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
fmul v0.2d, v24.2d, alphaV0
fmul v1.2d, v25.2d, alphaV1
fmul v2.2d, v26.2d, alphaV2
fmul v3.2d, v27.2d, alphaV3
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow2]
fmul v1.2d, v25.2d, alphaV0
stp q0, q1, [pCRow2]
add pCRow2, pCRow2, #32
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
fmul v2.2d, v26.2d, alphaV0
fmul v3.2d, v27.2d, alphaV0
stp q2, q3, [pCRow2]
add pCRow2, pCRow2, #32
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
fmul v4.2d, v28.2d, alphaV0
fmul v5.2d, v29.2d, alphaV1
fmul v6.2d, v30.2d, alphaV2
fmul v7.2d, v31.2d, alphaV3
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
fmul v5.2d, v29.2d, alphaV0
stp q4, q5, [pCRow3]
add pCRow0, pCRow0, #64
add pCRow3, pCRow3, #32
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
fmul v6.2d, v30.2d, alphaV0
fmul v7.2d, v31.2d, alphaV0
stp q6, q7, [pCRow3]
add pCRow3, pCRow3, #32
.endm
/******************************************************************************/
@ -365,26 +428,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x4
fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
fmul v9.2d, v17.2d, alphaV1
fmul v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow1, pCRow0, LDC
fmul v12.2d, v20.2d, alphaV2
fmul v13.2d, v21.2d, alphaV3
fmul v12.2d, v20.2d, alphaV0
fmul v13.2d, v21.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow2, pCRow1, LDC
fmul v8.2d, v24.2d, alphaV0
fmul v9.2d, v25.2d, alphaV1
fmul v9.2d, v25.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow2]
add pCRow1, pCRow2, LDC
fmul v12.2d, v28.2d, alphaV2
fmul v13.2d, v29.2d, alphaV3
fmul v12.2d, v28.2d, alphaV0
fmul v13.2d, v29.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@ -413,22 +477,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x4
fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
add pCRow1, pCRow0, LDC
fmul v12.2d, v20.2d, alphaV1
fmul v12.2d, v20.2d, alphaV0
st1 {v12.2d}, [pCRow1]
add pCRow2, pCRow1, LDC
fmul v8.2d, v24.2d, alphaV2
fmul v8.2d, v24.2d, alphaV0
st1 {v8.2d}, [pCRow2]
add pCRow1, pCRow2, LDC
fmul v12.2d, v28.2d, alphaV3
fmul v12.2d, v28.2d, alphaV0
st1 {v12.2d}, [pCRow1]
add pCRow0, pCRow0, #16
@ -453,6 +518,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x4
fmov alpha0, alpha
add pCRow1, pCRow0, LDC
fmul v8.2d, v16.2d, alphaV0
@ -462,7 +529,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add pCRow2, pCRow1, LDC
add pCRow1, pCRow2, LDC
fmul v12.2d, v20.2d, alphaV1
fmul v12.2d, v20.2d, alphaV0
st1 {v12.d}[0], [pCRow2]
st1 {v12.d}[1], [pCRow1]
@ -502,18 +569,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE8x2
fmov alpha0, alpha
add pCRow1, pCRow0, LDC
fmul v0.2d, v16.2d, alphaV0
fmul v1.2d, v17.2d, alphaV1
fmul v2.2d, v18.2d, alphaV2
fmul v3.2d, v19.2d, alphaV3
fmul v1.2d, v17.2d, alphaV0
fmul v2.2d, v18.2d, alphaV0
fmul v3.2d, v19.2d, alphaV0
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
fmul v4.2d, v20.2d, alphaV0
fmul v5.2d, v21.2d, alphaV1
fmul v6.2d, v22.2d, alphaV2
fmul v7.2d, v23.2d, alphaV3
fmul v5.2d, v21.2d, alphaV0
fmul v6.2d, v22.2d, alphaV0
fmul v7.2d, v23.2d, alphaV0
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
add pCRow0, pCRow0, #64
@ -541,14 +609,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x2
fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
fmul v9.2d, v17.2d, alphaV1
fmul v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow1, pCRow0, LDC
fmul v12.2d, v20.2d, alphaV2
fmul v13.2d, v21.2d, alphaV3
fmul v12.2d, v20.2d, alphaV0
fmul v13.2d, v21.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@ -573,12 +642,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x2
fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
add pCRow1 , pCRow0, LDC
fmul v12.2d, v20.2d, alphaV1
fmul v12.2d, v20.2d, alphaV0
st1 {v12.2d}, [pCRow1]
add pCRow0, pCRow0, #16
@ -601,6 +671,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x2
fmov alpha0, alpha
add pCRow1 , pCRow0, LDC
fmul v8.2d, v16.2d, alphaV0
@ -636,10 +707,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE8x1
fmov alpha0, alpha
fmul v0.2d, v16.2d, alphaV0
fmul v1.2d, v17.2d, alphaV1
fmul v2.2d, v18.2d, alphaV2
fmul v3.2d, v19.2d, alphaV3
fmul v1.2d, v17.2d, alphaV0
fmul v2.2d, v18.2d, alphaV0
fmul v3.2d, v19.2d, alphaV0
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
add pCRow0, pCRow0, #64
@ -665,8 +737,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x1
fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
fmul v9.2d, v17.2d, alphaV1
fmul v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow0, pCRow0, #32
@ -690,6 +763,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x1
fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
@ -713,6 +787,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x1
fmov alpha0, alpha
fmul d8, d16, alpha0
str d8, [pCRow0]
@ -739,10 +814,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
fmov alpha0, d0
fmov alpha1, d0
fmov alpha2, d0
fmov alpha3, d0
prfm PLDL1KEEP, [origPB]
prfm PLDL1KEEP, [origPA]
fmov alpha, d0
lsl LDC, LDC, #3 // ldc = ldc * 8
@ -759,8 +834,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/******************************************************************************/
dtrmm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
add pCRow3, pCRow2, LDC
add pC, pCRow3, LDC
#if defined(LEFT)
mov tempOffset, offset
@ -774,6 +854,7 @@ dtrmm_kernel_L4_M8_BEGIN:
cmp counterI, #0
ble dtrmm_kernel_L4_M4_BEGIN
.align 5
dtrmm_kernel_L4_M8_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@ -794,40 +875,64 @@ dtrmm_kernel_L4_M8_20:
add tempK, tempOffset, #4
#endif
asr counterL , tempK, #1 // L = K / 2
asr counterL , tempK, #3 // L = K / 8
cmp counterL , #2 // is there at least 4 to do?
blt dtrmm_kernel_L4_M8_32
KERNEL8x4_I // do one in the K
KERNEL8x4_M2 // do another in the K
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
subs counterL, counterL, #2 // subtract 2
ble dtrmm_kernel_L4_M8_22a
.align 5
.align 5
dtrmm_kernel_L4_M8_22:
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M8_22
.align 5
dtrmm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E
b dtrmm_kernel_L4_M8_44
.align 5
dtrmm_kernel_L4_M8_32:
tst counterL, #1
ble dtrmm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E
b dtrmm_kernel_L4_M8_44
@ -838,13 +943,17 @@ dtrmm_kernel_L4_M8_40:
dtrmm_kernel_L4_M8_44:
ands counterL , tempK, #1
ands counterL , tempK, #7
ble dtrmm_kernel_L4_M8_100
.align 5
dtrmm_kernel_L4_M8_46:
KERNEL8x4_SUB
subs counterL, counterL, #1
bne dtrmm_kernel_L4_M8_46
dtrmm_kernel_L4_M8_100:
SAVE8x4
@ -864,6 +973,9 @@ dtrmm_kernel_L4_M8_100:
#if defined(LEFT)
add tempOffset, tempOffset, #8
#endif
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
dtrmm_kernel_L4_M8_END:
subs counterI, counterI, #1

View File

@ -68,6 +68,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SHZ 3
#endif
#define A_PRE_SIZE 768
#define Y_PRE_SIZE 768
/******************************************************************************/
.macro SAVE_REGS
@ -105,36 +108,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v2.4s, v3.4s}, [A_PTR], #32
ld1 {v4.4s, v5.4s}, [Y_IPTR], #32
fmla v4.4s, v1.4s, v2.4s
prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
fmla v5.4s, v1.4s, v3.4s
st1 {v4.4s, v5.4s}, [Y_OPTR], #32
ld1 {v6.4s, v7.4s}, [A_PTR], #32
ld1 {v8.4s, v9.4s}, [Y_IPTR], #32
fmla v8.4s, v1.4s, v6.4s
prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
fmla v9.4s, v1.4s, v7.4s
st1 {v8.4s, v9.4s}, [Y_OPTR], #32
#else //DOUBLE
ld1 {v2.2d, v3.2d}, [A_PTR], #32
ld1 {v4.2d, v5.2d}, [Y_IPTR], #32
fmla v4.2d, v1.2d, v2.2d
prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
fmla v5.2d, v1.2d, v3.2d
st1 {v4.2d, v5.2d}, [Y_OPTR], #32
ld1 {v6.2d, v7.2d}, [A_PTR], #32
ld1 {v8.2d, v9.2d}, [Y_IPTR], #32
fmla v8.2d, v1.2d, v6.2d
prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
fmla v9.2d, v1.2d, v7.2d
st1 {v8.2d, v9.2d}, [Y_OPTR], #32
ld1 {v10.2d, v11.2d}, [A_PTR], #32
ld1 {v12.2d, v13.2d}, [Y_IPTR], #32
fmla v12.2d, v1.2d, v10.2d
prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
fmla v13.2d, v1.2d, v11.2d
st1 {v12.2d, v13.2d}, [Y_OPTR], #32
ld1 {v14.2d, v15.2d}, [A_PTR], #32
ld1 {v16.2d, v17.2d}, [Y_IPTR], #32
fmla v16.2d, v1.2d, v14.2d
prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
fmla v17.2d, v1.2d, v15.2d
st1 {v16.2d, v17.2d}, [Y_OPTR], #32
#endif

View File

@ -41,6 +41,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define J x11 /* loop variable */
#define I x12 /* loop variable */
#define X_PREFETCH_SIZE 768
#define A_PREFETCH_SIZE 768
/*******************************************************************************
* Macro definitions
*******************************************************************************/
@ -112,42 +115,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v5.4s, v6.4s, v7.4s, v8.4s}, [A_PTR], #64
ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [X_PTR], #64
fmla v1.4s, v5.4s, v9.4s
prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
fmla v2.4s, v6.4s, v10.4s
prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
fmla v3.4s, v7.4s, v11.4s
ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64
fmla v4.4s, v8.4s, v12.4s
ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64
ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [X_PTR], #64
fmla v1.4s, v13.4s, v17.4s
prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
fmla v2.4s, v14.4s, v18.4s
prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
fmla v3.4s, v15.4s, v19.4s
fmla v4.4s, v16.4s, v20.4s
#else
ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64
ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64
fmla v1.2d, v5.2d, v9.2d
prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
fmla v2.2d, v6.2d, v10.2d
prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
fmla v3.2d, v7.2d, v11.2d
fmla v4.2d, v8.2d, v12.2d
ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64
ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64
fmla v1.2d, v13.2d, v17.2d
prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
fmla v2.2d, v14.2d, v18.2d
prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
fmla v3.2d, v15.2d, v19.2d
fmla v4.2d, v16.2d, v20.2d
ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64
ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64
fmla v1.2d, v5.2d, v9.2d
prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
fmla v2.2d, v6.2d, v10.2d
prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
fmla v3.2d, v7.2d, v11.2d
fmla v4.2d, v8.2d, v12.2d
ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64
ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64
fmla v1.2d, v13.2d, v17.2d
prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
fmla v2.2d, v14.2d, v18.2d
prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
fmla v3.2d, v15.2d, v19.2d
fmla v4.2d, v16.2d, v20.2d
#endif

View File

@ -72,6 +72,148 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fabs MAXF, MAXF
.endm
.macro KERNEL_F8
#if !defined(DOUBLE)
ldp q2, q3, [X], #32
fabs v2.4s, v2.4s
fabs v3.4s, v3.4s
fmax v2.4s, v2.4s, v3.4s
fmaxv TMPF, v2.4s
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND
csel INDEX, INDEX, Z, COND
add Z, Z, #8
#else
ldp q2, q3, [X], #32
ldp q4, q5, [X], #32
fabs v2.2d, v2.2d
fabs v3.2d, v3.2d
fabs v4.2d, v4.2d
fabs v5.2d, v5.2d
fmax v2.2d, v2.2d, v3.2d
fmax v4.2d, v4.2d, v5.2d
fmax v2.2d, v2.2d, v4.2d
fmaxp TMPF, v2.2d
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND
csel INDEX, INDEX, Z, COND
add Z, Z, #8
#endif
PRFM PLDL1KEEP, [X, #1024]
.endm
.macro KERNEL_F8_FINALIZE
sub x6, INDEX, #1
#if !defined(DOUBLE)
lsl x6, x6, #2
add x7, x7, x6
ldp q2, q3, [x7]
fabs v2.4s, v2.4s
fabs v3.4s, v3.4s
ins v4.s[0], v3.s[0]
ins v5.s[0], v3.s[1]
ins v6.s[0], v3.s[2]
ins v7.s[0], v3.s[3]
add x6, INDEX, #7
fcmp MAXF, s7
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, s6
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, s5
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, s4
csel INDEX, x6, INDEX, eq
ins v4.s[0], v2.s[0]
ins v5.s[0], v2.s[1]
ins v6.s[0], v2.s[2]
ins v7.s[0], v2.s[3]
sub x6, x6, #1
fcmp MAXF, s7
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, s6
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, s5
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, s4
csel INDEX, x6, INDEX, eq
#else
add x6, x6, #4
lsl x6, x6, #3
add x7, x7, x6
ldp q2, q3, [x7]
fabs v2.2d, v2.2d
fabs v3.2d, v3.2d
ins v4.d[0], v2.d[0]
ins v5.d[0], v2.d[1]
ins v6.d[0], v3.d[0]
ins v7.d[0], v3.d[1]
add x6, INDEX, #7
fcmp MAXF, d7
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, d6
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, d5
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, d4
csel INDEX, x6, INDEX, eq
sub x7, x7, #32
ldp q2, q3, [x7]
fabs v2.2d, v2.2d
fabs v3.2d, v3.2d
ins v4.d[0], v2.d[0]
ins v5.d[0], v2.d[1]
ins v6.d[0], v3.d[0]
ins v7.d[0], v3.d[1]
sub x6, x6, #1
fcmp MAXF, d7
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, d6
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, d5
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, d4
csel INDEX, x6, INDEX, eq
#endif
.endm
.macro KERNEL_S1
ld1 TMPVF, [X], INC_X
add Z, Z, #1
@ -92,6 +234,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp INC_X, xzr
ble iamax_kernel_zero
cmp INC_X, #1
bne iamax_kernel_S_BEGIN
mov x7, X
iamax_kernel_F_BEGIN:
INIT_S
subs N, N, #1
ble iamax_kernel_L999
asr I, N, #3
cmp I, xzr
beq iamax_kernel_F1
add Z, Z, #1
iamax_kernel_F8:
KERNEL_F8
subs I, I, #1
bne iamax_kernel_F8
KERNEL_F8_FINALIZE
sub Z, Z, #1
iamax_kernel_F1:
ands I, N, #7
ble iamax_kernel_L999
iamax_kernel_F10:
KERNEL_S1
subs I, I, #1
bne iamax_kernel_F10
b iamax_kernel_L999
iamax_kernel_S_BEGIN:
INIT_S
subs N, N, #1

View File

@ -78,6 +78,179 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
.endm
.macro KERNEL_F8
#if !defined(DOUBLE)
ldp q2, q3, [X], #32
ldp q4, q5, [X], #32
fabs v2.4s, v2.4s
fabs v3.4s, v3.4s
fabs v4.4s, v4.4s
fabs v5.4s, v5.4s
faddp v2.4s, v2.4s, v3.4s
faddp v3.4s, v4.4s, v5.4s
fmax v2.4s, v2.4s, v3.4s
fmaxv TMPF, v2.4s
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND
csel INDEX, INDEX, Z, COND
add Z, Z, #8
#else
ldp q2, q3, [X], #32
ldp q4, q5, [X], #32
ldp q16, q17, [X], #32
ldp q18, q19, [X], #32
fabs v2.2d, v2.2d
fabs v3.2d, v3.2d
fabs v4.2d, v4.2d
fabs v5.2d, v5.2d
fabs v16.2d, v16.2d
fabs v17.2d, v17.2d
fabs v18.2d, v18.2d
fabs v19.2d, v19.2d
faddp v2.2d, v2.2d, v3.2d
faddp v3.2d, v4.2d, v5.2d
faddp v4.2d, v16.2d, v17.2d
faddp v5.2d, v18.2d, v19.2d
fmax v2.2d, v2.2d, v3.2d
fmax v4.2d, v4.2d, v5.2d
fmax v2.2d, v2.2d, v4.2d
fmaxp TMPF, v2.2d
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND
csel INDEX, INDEX, Z, COND
add Z, Z, #8
#endif
PRFM PLDL1KEEP, [X, #1024]
.endm
.macro KERNEL_F8_FINALIZE
sub x6, INDEX, #1
#if !defined(DOUBLE)
lsl x6, x6, #3
add x7, x7, x6
ldp q2, q3, [x7]
ldp q4, q5, [x7, #32]
fabs v2.4s, v2.4s
fabs v3.4s, v3.4s
fabs v4.4s, v4.4s
fabs v5.4s, v5.4s
faddp v2.4s, v2.4s, v3.4s
faddp v3.4s, v4.4s, v5.4s
ins v4.s[0], v3.s[3]
add x6, INDEX, #7
fcmp MAXF, s4
csel INDEX, x6, INDEX, eq
ins v4.s[0], v3.s[2]
sub x6, x6, #1
fcmp MAXF, s4
csel INDEX, x6, INDEX, eq
ins v4.s[0], v3.s[1]
sub x6, x6, #1
fcmp MAXF, s4
csel INDEX, x6, INDEX, eq
ins v4.s[0], v3.s[0]
sub x6, x6, #1
fcmp MAXF, s4
csel INDEX, x6, INDEX, eq
ins v4.s[0], v2.s[3]
sub x6, x6, #1
fcmp MAXF, s4
csel INDEX, x6, INDEX, eq
ins v4.s[0], v2.s[2]
sub x6, x6, #1
fcmp MAXF, s4
csel INDEX, x6, INDEX, eq
ins v4.s[0], v2.s[1]
sub x6, x6, #1
fcmp MAXF, s4
csel INDEX, x6, INDEX, eq
ins v4.s[0], v2.s[0]
sub x6, x6, #1
fcmp MAXF, s4
csel INDEX, x6, INDEX, eq
#else
lsl x6, x6, #4
add x7, x7, x6
ldp q2, q3, [x7]
ldp q4, q5, [x7, #32]
ldp q16, q17, [x7, #64]
ldp q18, q19, [x7, #96]
fabs v2.2d, v2.2d
fabs v3.2d, v3.2d
fabs v4.2d, v4.2d
fabs v5.2d, v5.2d
fabs v16.2d, v16.2d
fabs v17.2d, v17.2d
fabs v18.2d, v18.2d
fabs v19.2d, v19.2d
faddp v2.2d, v2.2d, v3.2d
faddp v3.2d, v4.2d, v5.2d
faddp v4.2d, v16.2d, v17.2d
faddp v5.2d, v18.2d, v19.2d
ins v7.d[0], v5.d[1]
add x6, INDEX, #7
fcmp MAXF, d7
csel INDEX, x6, INDEX, eq
ins v7.d[0], v5.d[0]
sub x6, x6, #1
fcmp MAXF, d7
csel INDEX, x6, INDEX, eq
ins v7.d[0], v4.d[1]
sub x6, x6, #1
fcmp MAXF, d7
csel INDEX, x6, INDEX, eq
ins v7.d[0], v4.d[0]
sub x6, x6, #1
fcmp MAXF, d7
csel INDEX, x6, INDEX, eq
ins v7.d[0], v3.d[1]
sub x6, x6, #1
fcmp MAXF, d7
csel INDEX, x6, INDEX, eq
ins v7.d[0], v3.d[0]
sub x6, x6, #1
fcmp MAXF, d7
csel INDEX, x6, INDEX, eq
ins v7.d[0], v2.d[1]
sub x6, x6, #1
fcmp MAXF, d7
csel INDEX, x6, INDEX, eq
ins v7.d[0], v2.d[0]
sub x6, x6, #1
fcmp MAXF, d7
csel INDEX, x6, INDEX, eq
#endif
.endm
.macro KERNEL_S1
#if !defined(DOUBLE)
ld1 {v1.2s}, [X], INC_X
@ -107,6 +280,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp INC_X, xzr
ble iamax_kernel_zero
cmp INC_X, #1
bne iamax_kernel_S_BEGIN
mov x7, X
iamax_kernel_F_BEGIN:
INIT_S
subs N, N, #1
ble iamax_kernel_L999
asr I, N, #3
cmp I, xzr
ble iamax_kernel_F1
add Z, Z, #1
iamax_kernel_F8:
KERNEL_F8
subs I, I, #1
bne iamax_kernel_F8
KERNEL_F8_FINALIZE
sub Z, Z, #1
iamax_kernel_F1:
ands I, N, #7
ble iamax_kernel_L999
iamax_kernel_F10:
KERNEL_S1
subs I, I, #1
bne iamax_kernel_F10
b iamax_kernel_L999
iamax_kernel_S_BEGIN:
INIT_S
subs N, N, #1

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -46,20 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow0 x12
#define pCRow1 x13
#define pCRow2 x14
#define pA x15
#define alpha_save_R x16
#define alpha_save_I x17
#define pCRow3 x15
#define pA x16
#define alphaR x17
#define alphaI x18
#define alpha0_R d10
#define alphaV0_R v10.d[0]
#define alpha0_I d11
#define alphaV0_I v11.d[0]
#define alpha1_R d14
#define alphaV1_R v14.d[0]
#define alpha1_I d15
#define alphaV1_I v15.d[0]
#define A_PRE_SIZE 2560
#define B_PRE_SIZE 448
#define C_PRE_SIZE 128
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define OP_rr fmla
@ -98,10 +97,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 12 pCRow0
// 13 pCRow1
// 14 pCRow2
// 15 pA
// 16 alpha_save_R
// 17 alpha_save_I
// 18 must save
// 15 pCRow3
// 16 pA
// 17 alpha_save_R
// 18 must save alpha_save_I
// 19 must save
// 20 must save
// 21 must save
@ -175,12 +174,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x4_I
ld2 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
fmul v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.d[0]
@ -193,16 +188,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v17.2d, v1.2d, v8.d[0]
fmul v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b
fmls v19.2d, v2.2d, v9.d[0]
#else
fmul v19.2d, v2.2d, v9.d[0]
#endif
OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
fmul v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.d[1]
@ -215,6 +202,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v21.2d, v1.2d, v8.d[1]
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
fmul v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@ -226,6 +216,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v23.2d, v3.2d, v8.d[1]
ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
fmul v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b
fmls v19.2d, v2.2d, v9.d[0]
#else
fmul v19.2d, v2.2d, v9.d[0]
#endif
OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v4.2d, v5.2d} , [pA]
add pA, pA, #32
fmul v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@ -237,6 +244,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v25.2d, v1.2d, v10.d[0]
ld2 {v6.2d, v7.2d} , [pA]
add pA, pA, #32
fmul v26.2d, v2.2d, v10.d[0]
OP_ii v26.2d, v3.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@ -248,6 +258,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v27.2d, v3.2d, v10.d[0]
ld2 {v14.2d, v15.2d}, [pB]
add pB, pB, #32
fmul v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@ -259,6 +272,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v29.2d, v1.2d, v10.d[1]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmul v30.2d, v2.2d, v10.d[1]
OP_ii v30.2d, v3.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@ -270,14 +285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v31.2d, v3.2d, v10.d[1]
ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
ld2 {v14.2d, v15.2d}, [pB]
add pB, pB, #32
ld2 {v4.2d, v5.2d} , [pA]
add pA, pA, #32
ld2 {v6.2d, v7.2d} , [pA]
add pA, pA, #32
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm
.macro KERNEL4x4_M1
@ -286,7 +294,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.d[0]
ld2 {v12.2d, v13.2d}, [pB] // For next round
ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
OP_rr v18.2d, v2.2d, v8.d[0]
@ -294,15 +302,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v14.2d, v15.2d}, [pB] // For next round
add pB, pB, #32
ld2 {v4.2d, v5.2d} , [pA]
add pA, pA, #32
OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.d[1]
ld2 {v4.2d, v5.2d} , [pA] // For next round
ld2 {v6.2d, v7.2d} , [pA]
add pA, pA, #32
OP_rr v22.2d, v2.2d, v8.d[1]
@ -310,22 +318,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v23.2d, v2.2d, v9.d[1]
OP_ir v23.2d, v3.2d, v8.d[1]
ld2 {v6.2d, v7.2d} , [pA] // For next round
add pA, pA, #32
ld2 {v14.2d, v15.2d}, [pB]
add pB, pB, #32
OP_rr v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.d[0]
OP_ri v25.2d, v0.2d, v11.d[0]
OP_ir v25.2d, v1.2d, v10.d[0]
prfm PLDL1KEEP, [pA, #512]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
OP_rr v26.2d, v2.2d, v10.d[0]
OP_ii v26.2d, v3.2d, v11.d[0]
OP_ri v27.2d, v2.2d, v11.d[0]
OP_ir v27.2d, v3.2d, v10.d[0]
prfm PLDL1KEEP, [pB, #512]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
OP_rr v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.d[1]
@ -344,7 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v17.2d, v4.2d, v13.d[0]
OP_ir v17.2d, v5.2d, v12.d[0]
ld2 {v8.2d, v9.2d}, [pB] // For next round
ld2 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
OP_rr v18.2d, v6.2d, v12.d[0]
@ -352,15 +360,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v19.2d, v6.2d, v13.d[0]
OP_ir v19.2d, v7.2d, v12.d[0]
ld2 {v10.2d, v11.2d}, [pB] // For next round
add pB, pB, #32
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
OP_rr v20.2d, v4.2d, v12.d[1]
OP_ii v20.2d, v5.2d, v13.d[1]
OP_ri v21.2d, v4.2d, v13.d[1]
OP_ir v21.2d, v5.2d, v12.d[1]
ld2 {v0.2d, v1.2d}, [pA] // For next round
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
OP_rr v22.2d, v6.2d, v12.d[1]
@ -368,22 +376,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v23.2d, v6.2d, v13.d[1]
OP_ir v23.2d, v7.2d, v12.d[1]
ld2 {v2.2d, v3.2d}, [pA] // For next round
add pA, pA, #32
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
OP_rr v24.2d, v4.2d, v14.d[0]
OP_ii v24.2d, v5.2d, v15.d[0]
OP_ri v25.2d, v4.2d, v15.d[0]
OP_ir v25.2d, v5.2d, v14.d[0]
prfm PLDL1KEEP, [pA, #512]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
OP_rr v26.2d, v6.2d, v14.d[0]
OP_ii v26.2d, v7.2d, v15.d[0]
OP_ri v27.2d, v6.2d, v15.d[0]
OP_ir v27.2d, v7.2d, v14.d[0]
prfm PLDL1KEEP, [pB, #512]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
OP_rr v28.2d, v4.2d, v14.d[1]
OP_ii v28.2d, v5.2d, v15.d[1]
@ -412,6 +420,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v21.2d, v4.2d, v13.d[1]
OP_ir v21.2d, v5.2d, v12.d[1]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
OP_rr v22.2d, v6.2d, v12.d[1]
OP_ii v22.2d, v7.2d, v13.d[1]
OP_ri v23.2d, v6.2d, v13.d[1]
@ -422,6 +432,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v25.2d, v4.2d, v15.d[0]
OP_ir v25.2d, v5.2d, v14.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
OP_rr v26.2d, v6.2d, v14.d[0]
OP_ii v26.2d, v7.2d, v15.d[0]
OP_ri v27.2d, v6.2d, v15.d[0]
@ -441,33 +453,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x4_SUB
ld2 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
OP_rr v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.d[0]
OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.d[0]
OP_rr v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.d[0]
OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.d[1]
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
OP_rr v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.d[0]
OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
OP_rr v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.d[1]
OP_ri v23.2d, v2.2d, v9.d[1]
OP_ir v23.2d, v3.2d, v8.d[1]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
OP_rr v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.d[0]
OP_ri v25.2d, v0.2d, v11.d[0]
@ -490,74 +509,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x4
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
ld2 {v0.2d, v1.2d}, [pCRow1]
ld2 {v0.2d, v1.2d}, [pCRow0]
fmla v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmla v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
ld2 {v2.2d, v3.2d}, [pCRow2]
fmla v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow0]
add pCRow0, pCRow0, #32
ld2 {v2.2d, v3.2d}, [pCRow0]
fmla v2.2d, v18.2d, alphaV0_R
fmls v2.2d, v19.2d, alphaV0_I
fmla v3.2d, v18.2d, alphaV1_I
fmla v3.2d, v19.2d, alphaV1_R
st2 {v2.2d, v3.2d}, [pCRow2]
fmla v3.2d, v18.2d, alphaV0_I
fmla v3.2d, v19.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow0]
add pCRow0, pCRow0, #32
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow1, pCRow1, LDC
ld2 {v4.2d, v5.2d}, [pCRow1]
fmla v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmla v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmla v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow2, pCRow1, #32
ld2 {v6.2d, v7.2d}, [pCRow2]
add pCRow1, pCRow1, #32
ld2 {v6.2d, v7.2d}, [pCRow1]
fmla v6.2d, v22.2d, alphaV0_R
fmls v6.2d, v23.2d, alphaV0_I
fmla v7.2d, v22.2d, alphaV1_I
fmla v7.2d, v23.2d, alphaV1_R
st2 {v6.2d, v7.2d}, [pCRow2]
fmla v7.2d, v22.2d, alphaV0_I
fmla v7.2d, v23.2d, alphaV0_R
st2 {v6.2d, v7.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
ld2 {v0.2d, v1.2d}, [pCRow1]
add pCRow1, pCRow1, #32
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
ld2 {v0.2d, v1.2d}, [pCRow2]
fmla v0.2d, v24.2d, alphaV0_R
fmls v0.2d, v25.2d, alphaV0_I
fmla v1.2d, v24.2d, alphaV1_I
fmla v1.2d, v25.2d, alphaV1_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmla v1.2d, v24.2d, alphaV0_I
fmla v1.2d, v25.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow2]
add pCRow2, pCRow2, #32
ld2 {v2.2d, v3.2d}, [pCRow2]
fmla v2.2d, v26.2d, alphaV0_R
fmls v2.2d, v27.2d, alphaV0_I
fmla v3.2d, v26.2d, alphaV1_I
fmla v3.2d, v27.2d, alphaV1_R
fmla v3.2d, v26.2d, alphaV0_I
fmla v3.2d, v27.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow2]
add pCRow1, pCRow1, LDC
add pCRow2, pCRow2, #32
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
ld2 {v4.2d, v5.2d}, [pCRow1]
ld2 {v4.2d, v5.2d}, [pCRow3]
fmla v4.2d, v28.2d, alphaV0_R
fmls v4.2d, v29.2d, alphaV0_I
fmla v5.2d, v28.2d, alphaV1_I
fmla v5.2d, v29.2d, alphaV1_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow2, pCRow1, #32
ld2 {v6.2d, v7.2d}, [pCRow2]
fmla v5.2d, v28.2d, alphaV0_I
fmla v5.2d, v29.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow3]
add pCRow3, pCRow3, #32
ld2 {v6.2d, v7.2d}, [pCRow3]
fmla v6.2d, v30.2d, alphaV0_R
fmls v6.2d, v31.2d, alphaV0_I
fmla v7.2d, v30.2d, alphaV1_I
fmla v7.2d, v31.2d, alphaV1_R
st2 {v6.2d, v7.2d}, [pCRow2]
fmla v7.2d, v30.2d, alphaV0_I
fmla v7.2d, v31.2d, alphaV0_R
st2 {v6.2d, v7.2d}, [pCRow3]
add pCRow0, pCRow0, #64
add pCRow3, pCRow3, #32
.endm
/******************************************************************************/
@ -604,18 +634,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x4
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.2d, v1.2d}, [pCRow1]
fmla v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmla v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmla v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
@ -623,8 +651,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.2d, v5.2d}, [pCRow1]
fmla v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmla v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmla v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
@ -632,8 +660,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2d, v1.2d}, [pCRow1]
fmla v0.2d, v24.2d, alphaV0_R
fmls v0.2d, v25.2d, alphaV0_I
fmla v1.2d, v24.2d, alphaV1_I
fmla v1.2d, v25.2d, alphaV1_R
fmla v1.2d, v24.2d, alphaV0_I
fmla v1.2d, v25.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
@ -641,8 +669,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.2d, v5.2d}, [pCRow1]
fmla v4.2d, v28.2d, alphaV0_R
fmls v4.2d, v29.2d, alphaV0_I
fmla v5.2d, v28.2d, alphaV1_I
fmla v5.2d, v29.2d, alphaV1_R
fmla v5.2d, v28.2d, alphaV0_I
fmla v5.2d, v29.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@ -691,18 +719,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x4
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.d, v1.d}[0], [pCRow1]
fmla d0, d16, alphaV0_R
fmls d0, d17, alphaV0_I
fmla d1, d16, alphaV1_I
fmla d1, d17, alphaV1_R
fmla d1, d16, alphaV0_I
fmla d1, d17, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
@ -710,8 +736,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.d, v5.d}[0], [pCRow1]
fmla d4, d20, alphaV0_R
fmls d4, d21, alphaV0_I
fmla d5, d20, alphaV1_I
fmla d5, d21, alphaV1_R
fmla d5, d20, alphaV0_I
fmla d5, d21, alphaV0_R
st2 {v4.d, v5.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
@ -719,8 +745,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.d, v1.d}[0], [pCRow1]
fmla d0, d24, alphaV0_R
fmls d0, d25, alphaV0_I
fmla d1, d24, alphaV1_I
fmla d1, d25, alphaV1_R
fmla d1, d24, alphaV0_I
fmla d1, d25, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
@ -728,8 +754,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.d, v5.d}[0], [pCRow1]
fmla d4, d28, alphaV0_R
fmls d4, d29, alphaV0_I
fmla d5, d28, alphaV1_I
fmla d5, d29, alphaV1_R
fmla d5, d28, alphaV0_I
fmla d5, d29, alphaV0_R
st2 {v4.d, v5.d}[0], [pCRow1]
add pCRow0, pCRow0, #16
@ -778,25 +804,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x2
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.2d, v1.2d}, [pCRow1]
fmla v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmla v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmla v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
ld2 {v2.2d, v3.2d}, [pCRow2]
fmla v2.2d, v18.2d, alphaV0_R
fmls v2.2d, v19.2d, alphaV0_I
fmla v3.2d, v18.2d, alphaV1_I
fmla v3.2d, v19.2d, alphaV1_R
fmla v3.2d, v18.2d, alphaV0_I
fmla v3.2d, v19.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow2]
add pCRow1, pCRow1, LDC
@ -804,15 +828,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.2d, v5.2d}, [pCRow1]
fmla v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmla v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmla v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow2, pCRow1, #32
ld2 {v6.2d, v7.2d}, [pCRow2]
fmla v6.2d, v22.2d, alphaV0_R
fmls v6.2d, v23.2d, alphaV0_I
fmla v7.2d, v22.2d, alphaV1_I
fmla v7.2d, v23.2d, alphaV1_R
fmla v7.2d, v22.2d, alphaV0_I
fmla v7.2d, v23.2d, alphaV0_R
st2 {v6.2d, v7.2d}, [pCRow2]
add pCRow0, pCRow0, #64
@ -845,18 +869,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x2
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.2d, v1.2d}, [pCRow1]
fmla v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmla v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmla v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
@ -864,8 +886,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.2d, v5.2d}, [pCRow1]
fmla v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmla v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmla v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@ -898,18 +920,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x2
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.d, v1.d}[0], [pCRow1]
fmla d0, d16, alphaV0_R
fmls d0, d17, alphaV0_I
fmla d1, d16, alphaV1_I
fmla d1, d17, alphaV1_R
fmla d1, d16, alphaV0_I
fmla d1, d17, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
@ -917,8 +937,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.d, v5.d}[0], [pCRow1]
fmla d4, d20, alphaV0_R
fmls d4, d21, alphaV0_I
fmla d5, d20, alphaV1_I
fmla d5, d21, alphaV1_R
fmla d5, d20, alphaV0_I
fmla d5, d21, alphaV0_R
st2 {v4.d, v5.d}[0], [pCRow1]
add pCRow0, pCRow0, #16
@ -953,25 +973,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x1
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.2d, v1.2d}, [pCRow1]
fmla v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmla v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmla v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
ld2 {v2.2d, v3.2d}, [pCRow2]
fmla v2.2d, v18.2d, alphaV0_R
fmls v2.2d, v19.2d, alphaV0_I
fmla v3.2d, v18.2d, alphaV1_I
fmla v3.2d, v19.2d, alphaV1_R
fmla v3.2d, v18.2d, alphaV0_I
fmla v3.2d, v19.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow2]
add pCRow0, pCRow0, #64
@ -997,18 +1015,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x1
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.2d, v1.2d}, [pCRow1]
fmla v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmla v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmla v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@ -1035,18 +1051,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x1
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.d, v1.d}[0], [pCRow1]
fmla d0, d16, alphaV0_R
fmls d0, d17, alphaV0_I
fmla d1, d16, alphaV1_I
fmla d1, d17, alphaV1_R
fmla d1, d16, alphaV0_I
fmla d1, d17, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow0, pCRow0, #16
@ -1072,8 +1086,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
fmov alpha_save_R, d0
fmov alpha_save_I, d1
prfm PLDL1KEEP, [origPB]
prfm PLDL1KEEP, [origPA]
fmov alphaR, d0
fmov alphaI, d1
lsl LDC, LDC, #4 // ldc = ldc * 2 * 8
@ -1085,8 +1102,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ble zgemm_kernel_L2_BEGIN
zgemm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
add pCRow3, pCRow2, LDC
add pC, pCRow3, LDC
mov pA, origPA // pA = start of A array
zgemm_kernel_L4_M4_BEGIN:
@ -1096,42 +1118,68 @@ zgemm_kernel_L4_M4_BEGIN:
cmp counterI, #0
ble zgemm_kernel_L4_M2_BEGIN
.align 5
zgemm_kernel_L4_M4_20:
mov pB, origPB
asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
asr counterL , origK, #3
cmp counterL , #2
blt zgemm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
KERNEL4x4_I
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #2 // subtract 2
ble zgemm_kernel_L4_M4_22a
.align 5
.align 5
zgemm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
bgt zgemm_kernel_L4_M4_22
.align 5
zgemm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E
b zgemm_kernel_L4_M4_44
.align 5
zgemm_kernel_L4_M4_32:
tst counterL, #1
ble zgemm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E
b zgemm_kernel_L4_M4_44
@ -1143,13 +1191,20 @@ zgemm_kernel_L4_M4_40:
zgemm_kernel_L4_M4_44:
ands counterL , origK, #1
ands counterL , origK, #7
ble zgemm_kernel_L4_M4_100
.align 5
zgemm_kernel_L4_M4_46:
KERNEL4x4_SUB
subs counterL, counterL, #1
bne zgemm_kernel_L4_M4_46
zgemm_kernel_L4_M4_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVE4x4

View File

@ -43,6 +43,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define Y_OPTR x13 /* loop Y vector address */
#define X_PTR x14 /* loop X vector address */
#define A_PRE_SIZE 768
#define Y_PRE_SIZE 768
/*******************************************************************************
* Macro definitions
*******************************************************************************/
@ -50,14 +53,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(DOUBLE)
#define ALPHA_R s0
#define ALPHA_I s1
#define ALPHA_R_COPY s7
#define ALPHA_I_COPY s8
#define SHZ 3
#else
#define ALPHA_R d0
#define ALPHA_I d1
#define ALPHA_R_COPY d7
#define ALPHA_I_COPY d8
#define SHZ 4
#endif
@ -95,20 +94,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT
/********** INIT FOR F4 LOOP **********/
fmov ALPHA_R_COPY, ALPHA_R
fmov ALPHA_I_COPY, ALPHA_I
#if !defined(DOUBLE)
ins v7.s[1], v7.s[0] // R(ALPHA), R(ALPHA)
ins v8.s[1], v8.s[0] // I(ALPHA), I(ALPHA)
ins v7.d[1], v7.d[0]
ins v8.d[1], v8.d[0]
#else
ins v7.d[1], v7.d[0] // R(ALPHA), R(ALPHA)
ins v8.d[1], v8.d[0] // I(ALPHA), I(ALPHA)
#endif
/******* INIT FOR F1 AND S1 LOOP ******/
#if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA)
eor v2.16b, v2.16b, v2.16b
@ -129,47 +114,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro INIT_LOOP
/********** INIT_LOOP FOR F4 LOOP **********/
#if !defined(DOUBLE)
ld1 {v9.2s}, [X_PTR] // [I(X), R(X)]
ins v10.s[0], v9.s[1]
ins v9.s[1], v9.s[0] // [R(X), R(X)]
ins v10.s[1], v10.s[0] // [I(X), I(X)]
ins v9.d[1], v9.d[0]
ins v10.d[1], v10.d[0]
#if !defined(CONJ)
#if !defined(XCONJ)
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)]
fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)]
fmla v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)]
#else
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
fmla v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)]
fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)]
fmls v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)]
#endif
#else // CONJ
#if !defined(XCONJ)
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
fmls v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)]
fmul v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)]
fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)]
#else
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)]
eor v12.16b, v12.16b, v12.16b
fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)]
fmla v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)]
#endif
#endif // CONJ
/****** INIT_LOOP FOR F1 AND S1 LOOP ******/
ld1 {v2.2s}, [X_PTR] // [I(X), R(X)]
ext v3.8b, v2.8b, v2.8b, #4 // [R(X), I(X)]
fmul v2.2s, v0.2s, v2.2s
fmla v2.2s, v1.2s, v3.2s // [I(TEMP), R(TEMP)]
ins v3.s[0], v2.s[1]
/********** INIT_LOOP FOR F4 LOOP **********/
#if !defined(CONJ)
#if !defined(XCONJ)
dup v21.4s, v2.s[0] // R[TEMP]
dup v22.4s, v2.s[0] // R[TEMP]
eor v25.16b, v25.16b, v25.16b
fsub s25, s25, s3
dup v23.4s, v25.s[0] // -I[TEMP]
dup v24.4s, v3.s[0] // I[TEMP]
#else
dup v21.4s, v2.s[0] // R[TEMP]
dup v22.4s, v2.s[0] // R[TEMP]
dup v23.4s, v3.s[0] // I[TEMP]
eor v25.16b, v25.16b, v25.16b
fsub s25, s25, s3
dup v24.4s, v25.s[0] // -I[TEMP]
#endif
#else // CONJ
#if !defined(XCONJ)
dup v21.4s, v2.s[0] // R[TEMP]
eor v25.16b, v25.16b, v25.16b
fsub s25, s25, s2
dup v22.4s, v25.s[0] // R[TEMP]
dup v23.4s, v3.s[0] // I[TEMP]
dup v24.4s, v3.s[0] // I[TEMP]
#else
dup v21.4s, v2.s[0] // R[TEMP]
eor v25.16b, v25.16b, v25.16b
fsub s25, s25, s2
dup v22.4s, v25.s[0] // R[TEMP]
eor v25.16b, v25.16b, v25.16b
fsub s25, s25, s3
dup v23.4s, v25.s[0] // I[TEMP]
dup v24.4s, v25.s[0] // I[TEMP]
#endif
#endif // CONJ
/****** INIT_LOOP FOR F1 AND S1 LOOP ******/
#if !defined(CONJ)
#if !defined(XCONJ)
eor v4.16b, v4.16b, v4.16b
@ -200,45 +191,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif // CONJ
#else // DOUBLE
/********** INIT_LOOP FOR F4 LOOP **********/
ld1 {v9.2d}, [X_PTR] // [I(X), R(X)]
ins v10.d[0], v9.d[1]
ins v9.d[1], v9.d[0] // [R(X), R(X)]
ins v10.d[1], v10.d[0] // [I(X), I(X)]
#if !defined(CONJ)
#if !defined(XCONJ)
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)]
fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)]
fmla v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)]
#else
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
fmla v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)]
fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)]
fmls v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)]
#endif
#else // CONJ
#if !defined(XCONJ)
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
fmls v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)]
fmul v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)]
fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)]
#else
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)]
eor v12.16b, v12.16b, v12.16b
fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)]
fmla v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)]
#endif
#endif // CONJ
/****** INIT_LOOP FOR F1 AND S1 LOOP ******/
ld1 {v2.2d}, [X_PTR] // [I(X), R(X)]
ext v3.16b, v2.16b, v2.16b, #8 // [R(X), I(X)]
fmul v2.2d, v0.2d, v2.2d
fmla v2.2d, v1.2d, v3.2d // [I(TEMP), R(TEMP)]
ins v3.d[0], v2.d[1] // I(TEMP)
/****** INIT_LOOP FOR F4 LOOP ******/
#if !defined(CONJ)
#if !defined(XCONJ)
dup v21.2d, v2.d[0] // R[TEMP]
dup v22.2d, v2.d[0] // R[TEMP]
eor v25.16b, v25.16b, v25.16b
fsub d25, d25, d3
dup v23.2d, v25.d[0] // -I[TEMP]
dup v24.2d, v3.d[0] // I[TEMP]
#else
dup v21.2d, v2.d[0] // R[TEMP]
dup v22.2d, v2.d[0] // R[TEMP]
dup v23.2d, v3.d[0] // I[TEMP]
eor v25.16b, v25.16b, v25.16b
fsub d25, d25, d3
dup v24.2d, v25.d[0] // -I[TEMP]
#endif
#else // CONJ
#if !defined(XCONJ)
dup v21.2d, v2.d[0] // R[TEMP]
eor v25.16b, v25.16b, v25.16b
fsub d25, d25, d2
dup v22.2d, v25.d[0] // R[TEMP]
dup v23.2d, v3.d[0] // I[TEMP]
dup v24.2d, v3.d[0] // I[TEMP]
#else
dup v21.2d, v2.d[0] // R[TEMP]
eor v25.16b, v25.16b, v25.16b
fsub d25, d25, d2
dup v22.2d, v25.d[0] // R[TEMP]
eor v25.16b, v25.16b, v25.16b
fsub d25, d25, d3
dup v23.2d, v25.d[0] // I[TEMP]
dup v24.2d, v25.d[0] // I[TEMP]
#endif
#endif // CONJ
/****** INIT_LOOP FOR F1 AND S1 LOOP ******/
#if !defined(CONJ)
#if !defined(XCONJ)
eor v4.16b, v4.16b, v4.16b
@ -276,91 +274,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v13.4s, v14.4s}, [A_PTR], #32
ld2 {v15.4s, v16.4s}, [Y_IPTR], #32
#if !defined(CONJ)
#if !defined(XCONJ)
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I]
fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I]
fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R]
#else
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I]
fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I]
fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R]
#endif
#else // CONJ
#if !defined(XCONJ)
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I]
fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I]
fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R]
#else
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I]
fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I]
fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R]
#endif
#endif // CONJ
prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
fmla v15.4s, v21.4s, v13.4s
fmla v15.4s, v23.4s, v14.4s
fmla v16.4s, v22.4s, v14.4s
fmla v16.4s, v24.4s, v13.4s
st2 {v15.4s, v16.4s}, [Y_OPTR], #32
#else // DOUBLE
ld2 {v13.2d, v14.2d}, [A_PTR], #32
ld2 {v15.2d, v16.2d}, [Y_IPTR], #32
#if !defined(CONJ)
#if !defined(XCONJ)
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I]
fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I]
fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R]
#else
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I]
fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I]
fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R]
#endif
#else // CONJ
#if !defined(XCONJ)
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I]
fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I]
fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R]
#else
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I]
fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I]
fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R]
#endif
#endif // CONJ
prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
fmla v15.2d, v21.2d, v13.2d
fmla v15.2d, v23.2d, v14.2d
fmla v16.2d, v22.2d, v14.2d
fmla v16.2d, v24.2d, v13.2d
st2 {v15.2d, v16.2d}, [Y_OPTR], #32
ld2 {v17.2d, v18.2d}, [A_PTR], #32
ld2 {v19.2d, v20.2d}, [Y_IPTR], #32
#if !defined(CONJ)
#if !defined(XCONJ)
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I]
fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I]
fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R]
#else
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
fmla v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I]
fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I]
fmls v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R]
#endif
#else // CONJ
#if !defined(XCONJ)
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
fmla v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I]
fmls v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I]
fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R]
#else
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I]
fmls v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I]
fmls v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R]
#endif
#endif // CONJ
prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
fmla v19.2d, v21.2d, v17.2d
fmla v19.2d, v23.2d, v18.2d
fmla v20.2d, v22.2d, v18.2d
fmla v20.2d, v24.2d, v17.2d
st2 {v19.2d, v20.2d}, [Y_OPTR], #32
#endif
@ -445,10 +391,7 @@ zgemv_n_kernel_F_LOOP:
zgemv_n_kernel_F4:
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F4
subs I, I, #1
bne zgemv_n_kernel_F4

View File

@ -41,6 +41,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define J x11 /* loop variable */
#define I x12 /* loop variable */
#define A_PRE_SIZE 768
#define X_PRE_SIZE 768
/*******************************************************************************
* Macro definitions
*******************************************************************************/
@ -139,6 +142,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v11.4s, v12.4s}, [X_PTR], #32
ld2 {v13.4s, v14.4s}, [A_PTR], #32
prfm PLDL1STRM, [X_PTR, #X_PRE_SIZE]
prfm PLDL1STRM, [A_PTR, #A_PRE_SIZE]
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
@ -155,7 +160,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else // DOUBLE
ld2 {v11.2d, v12.2d}, [X_PTR], #32
ld2 {v13.2d, v14.2d}, [A_PTR], #32
prfm PLDL1STRM, [X_PTR, #512]
prfm PLDL1STRM, [X_PTR, #X_PRE_SIZE]
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
@ -171,7 +176,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v17.2d, v18.2d}, [X_PTR], #32
ld2 {v19.2d, v20.2d}, [A_PTR], #32
prfm PLDL1STRM, [A_PTR, #512]
prfm PLDL1STRM, [A_PTR, #A_PRE_SIZE]
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]

View File

@ -46,23 +46,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow0 x12
#define pCRow1 x13
#define pCRow2 x14
#define pA x15
#define alpha_save_R x16
#define alpha_save_I x17
#define temp x18
#define tempOffset x19
#define tempK x20
#define pCRow3 x15
#define pA x16
#define alphaR x17
#define alphaI x18
#define temp x19
#define tempOffset x20
#define tempK x21
#define alpha0_R d10
#define alphaV0_R v10.d[0]
#define alpha0_I d11
#define alphaV0_I v11.d[0]
#define alpha1_R d14
#define alphaV1_R v14.d[0]
#define alpha1_I d15
#define alphaV1_I v15.d[0]
#define A_PRE_SIZE 2560
#define B_PRE_SIZE 448
#define C_PRE_SIZE 128
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define OP_rr fmla
@ -93,7 +92,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 04 origPB
// 05 pC
// 06 origLDC -> LDC
// 07 offset
// 07 offset -> temp
// 08 counterL
// 09 counterI
// 10 counterJ
@ -101,13 +100,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 12 pCRow0
// 13 pCRow1
// 14 pCRow2
// 15 pA
// 16 alpha_save_R
// 17 alpha_save_I
// 18 must save temp
// 19 must save tempOffset
// 20 must save tempK
// 21 must save
// 15 pCRow3
// 16 pA
// 17 alpha_save_R
// 18 must save alpha_save_I
// 19 must save temp
// 20 must save tempOffset
// 21 must save tempK
// 22 must save
// 23 must save
// 24 must save
@ -178,12 +177,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x4_I
ld2 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
fmul v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.d[0]
@ -196,16 +191,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v17.2d, v1.2d, v8.d[0]
fmul v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b
fmls v19.2d, v2.2d, v9.d[0]
#else
fmul v19.2d, v2.2d, v9.d[0]
#endif
OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
fmul v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.d[1]
@ -218,6 +205,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v21.2d, v1.2d, v8.d[1]
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
fmul v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@ -229,6 +219,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v23.2d, v3.2d, v8.d[1]
ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
fmul v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b
fmls v19.2d, v2.2d, v9.d[0]
#else
fmul v19.2d, v2.2d, v9.d[0]
#endif
OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v4.2d, v5.2d} , [pA]
add pA, pA, #32
fmul v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@ -240,6 +247,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v25.2d, v1.2d, v10.d[0]
ld2 {v6.2d, v7.2d} , [pA]
add pA, pA, #32
fmul v26.2d, v2.2d, v10.d[0]
OP_ii v26.2d, v3.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@ -251,6 +261,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v27.2d, v3.2d, v10.d[0]
ld2 {v14.2d, v15.2d}, [pB]
add pB, pB, #32
fmul v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@ -262,6 +275,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v29.2d, v1.2d, v10.d[1]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmul v30.2d, v2.2d, v10.d[1]
OP_ii v30.2d, v3.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@ -273,14 +288,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v31.2d, v3.2d, v10.d[1]
ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
ld2 {v14.2d, v15.2d}, [pB]
add pB, pB, #32
ld2 {v4.2d, v5.2d} , [pA]
add pA, pA, #32
ld2 {v6.2d, v7.2d} , [pA]
add pA, pA, #32
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm
.macro KERNEL4x4_M1
@ -289,7 +297,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.d[0]
ld2 {v12.2d, v13.2d}, [pB] // For next round
ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
OP_rr v18.2d, v2.2d, v8.d[0]
@ -297,15 +305,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v14.2d, v15.2d}, [pB] // For next round
add pB, pB, #32
ld2 {v4.2d, v5.2d} , [pA]
add pA, pA, #32
OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.d[1]
ld2 {v4.2d, v5.2d} , [pA] // For next round
ld2 {v6.2d, v7.2d} , [pA]
add pA, pA, #32
OP_rr v22.2d, v2.2d, v8.d[1]
@ -313,22 +321,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v23.2d, v2.2d, v9.d[1]
OP_ir v23.2d, v3.2d, v8.d[1]
ld2 {v6.2d, v7.2d} , [pA] // For next round
add pA, pA, #32
ld2 {v14.2d, v15.2d}, [pB]
add pB, pB, #32
OP_rr v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.d[0]
OP_ri v25.2d, v0.2d, v11.d[0]
OP_ir v25.2d, v1.2d, v10.d[0]
prfm PLDL1KEEP, [pA, #512]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
OP_rr v26.2d, v2.2d, v10.d[0]
OP_ii v26.2d, v3.2d, v11.d[0]
OP_ri v27.2d, v2.2d, v11.d[0]
OP_ir v27.2d, v3.2d, v10.d[0]
prfm PLDL1KEEP, [pB, #512]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
OP_rr v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.d[1]
@ -347,7 +355,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v17.2d, v4.2d, v13.d[0]
OP_ir v17.2d, v5.2d, v12.d[0]
ld2 {v8.2d, v9.2d}, [pB] // For next round
ld2 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
OP_rr v18.2d, v6.2d, v12.d[0]
@ -355,15 +363,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v19.2d, v6.2d, v13.d[0]
OP_ir v19.2d, v7.2d, v12.d[0]
ld2 {v10.2d, v11.2d}, [pB] // For next round
add pB, pB, #32
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
OP_rr v20.2d, v4.2d, v12.d[1]
OP_ii v20.2d, v5.2d, v13.d[1]
OP_ri v21.2d, v4.2d, v13.d[1]
OP_ir v21.2d, v5.2d, v12.d[1]
ld2 {v0.2d, v1.2d}, [pA] // For next round
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
OP_rr v22.2d, v6.2d, v12.d[1]
@ -371,22 +379,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v23.2d, v6.2d, v13.d[1]
OP_ir v23.2d, v7.2d, v12.d[1]
ld2 {v2.2d, v3.2d}, [pA] // For next round
add pA, pA, #32
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
OP_rr v24.2d, v4.2d, v14.d[0]
OP_ii v24.2d, v5.2d, v15.d[0]
OP_ri v25.2d, v4.2d, v15.d[0]
OP_ir v25.2d, v5.2d, v14.d[0]
prfm PLDL1KEEP, [pA, #512]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
OP_rr v26.2d, v6.2d, v14.d[0]
OP_ii v26.2d, v7.2d, v15.d[0]
OP_ri v27.2d, v6.2d, v15.d[0]
OP_ir v27.2d, v7.2d, v14.d[0]
prfm PLDL1KEEP, [pB, #512]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
OP_rr v28.2d, v4.2d, v14.d[1]
OP_ii v28.2d, v5.2d, v15.d[1]
@ -415,6 +423,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v21.2d, v4.2d, v13.d[1]
OP_ir v21.2d, v5.2d, v12.d[1]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
OP_rr v22.2d, v6.2d, v12.d[1]
OP_ii v22.2d, v7.2d, v13.d[1]
OP_ri v23.2d, v6.2d, v13.d[1]
@ -425,6 +435,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v25.2d, v4.2d, v15.d[0]
OP_ir v25.2d, v5.2d, v14.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
OP_rr v26.2d, v6.2d, v14.d[0]
OP_ii v26.2d, v7.2d, v15.d[0]
OP_ri v27.2d, v6.2d, v15.d[0]
@ -444,33 +456,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x4_SUB
ld2 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
OP_rr v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.d[0]
OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.d[0]
OP_rr v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.d[0]
OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.d[1]
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
OP_rr v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.d[0]
OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
OP_rr v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.d[1]
OP_ri v23.2d, v2.2d, v9.d[1]
OP_ir v23.2d, v3.2d, v8.d[1]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
OP_rr v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.d[0]
OP_ri v25.2d, v0.2d, v11.d[0]
@ -493,66 +512,77 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x4
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmul v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmul v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow0]
add pCRow0, pCRow0, #32
fmul v2.2d, v18.2d, alphaV0_R
fmls v2.2d, v19.2d, alphaV0_I
fmul v3.2d, v18.2d, alphaV1_I
fmla v3.2d, v19.2d, alphaV1_R
st2 {v2.2d, v3.2d}, [pCRow2]
fmul v3.2d, v18.2d, alphaV0_I
fmla v3.2d, v19.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow0]
add pCRow0, pCRow0, #32
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow1, pCRow1, LDC
fmul v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmul v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmul v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow2, pCRow1, #32
add pCRow1, pCRow1, #32
fmul v6.2d, v22.2d, alphaV0_R
fmls v6.2d, v23.2d, alphaV0_I
fmul v7.2d, v22.2d, alphaV1_I
fmla v7.2d, v23.2d, alphaV1_R
st2 {v6.2d, v7.2d}, [pCRow2]
fmul v7.2d, v22.2d, alphaV0_I
fmla v7.2d, v23.2d, alphaV0_R
st2 {v6.2d, v7.2d}, [pCRow1]
add pCRow1, pCRow1, #32
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
add pCRow1, pCRow1, LDC
fmul v0.2d, v24.2d, alphaV0_R
fmls v0.2d, v25.2d, alphaV0_I
fmul v1.2d, v24.2d, alphaV1_I
fmla v1.2d, v25.2d, alphaV1_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmul v1.2d, v24.2d, alphaV0_I
fmla v1.2d, v25.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow2]
add pCRow2, pCRow2, #32
fmul v2.2d, v26.2d, alphaV0_R
fmls v2.2d, v27.2d, alphaV0_I
fmul v3.2d, v26.2d, alphaV1_I
fmla v3.2d, v27.2d, alphaV1_R
fmul v3.2d, v26.2d, alphaV0_I
fmla v3.2d, v27.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow2]
add pCRow1, pCRow1, LDC
add pCRow2, pCRow2, #32
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
fmul v4.2d, v28.2d, alphaV0_R
fmls v4.2d, v29.2d, alphaV0_I
fmul v5.2d, v28.2d, alphaV1_I
fmla v5.2d, v29.2d, alphaV1_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmul v5.2d, v28.2d, alphaV0_I
fmla v5.2d, v29.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow3]
add pCRow3, pCRow3, #32
fmul v6.2d, v30.2d, alphaV0_R
fmls v6.2d, v31.2d, alphaV0_I
fmul v7.2d, v30.2d, alphaV1_I
fmla v7.2d, v31.2d, alphaV1_R
st2 {v6.2d, v7.2d}, [pCRow2]
fmul v7.2d, v30.2d, alphaV0_I
fmla v7.2d, v31.2d, alphaV0_R
st2 {v6.2d, v7.2d}, [pCRow3]
add pCRow0, pCRow0, #64
add pCRow3, pCRow3, #32
.endm
/******************************************************************************/
@ -599,41 +629,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x4
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmul v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmul v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
fmul v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmul v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmul v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
fmul v0.2d, v24.2d, alphaV0_R
fmls v0.2d, v25.2d, alphaV0_I
fmul v1.2d, v24.2d, alphaV1_I
fmla v1.2d, v25.2d, alphaV1_R
fmul v1.2d, v24.2d, alphaV0_I
fmla v1.2d, v25.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
fmul v4.2d, v28.2d, alphaV0_R
fmls v4.2d, v29.2d, alphaV0_I
fmul v5.2d, v28.2d, alphaV1_I
fmla v5.2d, v29.2d, alphaV1_R
fmul v5.2d, v28.2d, alphaV0_I
fmla v5.2d, v29.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@ -682,41 +710,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x4
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul d0, d16, alphaV0_R
fmls d0, d17, alphaV0_I
fmul d1, d16, alphaV1_I
fmla d1, d17, alphaV1_R
fmul d1, d16, alphaV0_I
fmla d1, d17, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
fmul d4, d20, alphaV0_R
fmls d4, d21, alphaV0_I
fmul d5, d20, alphaV1_I
fmla d5, d21, alphaV1_R
fmul d5, d20, alphaV0_I
fmla d5, d21, alphaV0_R
st2 {v4.d, v5.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
fmul d0, d24, alphaV0_R
fmls d0, d25, alphaV0_I
fmul d1, d24, alphaV1_I
fmla d1, d25, alphaV1_R
fmul d1, d24, alphaV0_I
fmla d1, d25, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
fmul d4, d28, alphaV0_R
fmls d4, d29, alphaV0_I
fmul d5, d28, alphaV1_I
fmla d5, d29, alphaV1_R
fmul d5, d28, alphaV0_I
fmla d5, d29, alphaV0_R
st2 {v4.d, v5.d}[0], [pCRow1]
add pCRow0, pCRow0, #16
@ -765,37 +791,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x2
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmul v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmul v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmul v2.2d, v18.2d, alphaV0_R
fmls v2.2d, v19.2d, alphaV0_I
fmul v3.2d, v18.2d, alphaV1_I
fmla v3.2d, v19.2d, alphaV1_R
fmul v3.2d, v18.2d, alphaV0_I
fmla v3.2d, v19.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow2]
add pCRow1, pCRow1, LDC
fmul v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmul v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmul v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmul v6.2d, v22.2d, alphaV0_R
fmls v6.2d, v23.2d, alphaV0_I
fmul v7.2d, v22.2d, alphaV1_I
fmla v7.2d, v23.2d, alphaV1_R
fmul v7.2d, v22.2d, alphaV0_I
fmla v7.2d, v23.2d, alphaV0_R
st2 {v6.2d, v7.2d}, [pCRow2]
add pCRow0, pCRow0, #64
@ -828,25 +852,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x2
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmul v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmul v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
fmul v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmul v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmul v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@ -879,25 +901,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x2
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul d0, d16, alphaV0_R
fmls d0, d17, alphaV0_I
fmul d1, d16, alphaV1_I
fmla d1, d17, alphaV1_R
fmul d1, d16, alphaV0_I
fmla d1, d17, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
fmul d4, d20, alphaV0_R
fmls d4, d21, alphaV0_I
fmul d5, d20, alphaV1_I
fmla d5, d21, alphaV1_R
fmul d5, d20, alphaV0_I
fmla d5, d21, alphaV0_R
st2 {v4.d, v5.d}[0], [pCRow1]
add pCRow0, pCRow0, #16
@ -932,23 +952,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
// NOTE(review): diff rendering — duplicated alphaV1_*/alphaV0_* sequences are
// the old/new sides of one hunk; the "@ -..." line is a hunk header.
//
// SAVE4x1: scale a 4x1 tile (four complex accumulators held as two
// real-vector/imag-vector pairs v16/v17 and v18/v19) by alpha and store
// the results interleaved into one column of C.
.macro SAVE4x1
// Move alpha (real, imag) into the working registers (old/new diff pairs).
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
// v0 = Re(acc)*aR - Im(acc)*aI (vector real part, 2 doubles per register)
fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
// v1 = Re(acc)*aI + Im(acc)*aR (vector imaginary part; old/new diff pair)
fmul v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmul v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
// Interleaved store of two complex elements: {re0,im0,re1,im1} at *pCRow1.
st2 {v0.2d, v1.2d}, [pCRow1]
// Second half of the column lives 32 bytes (2 double-complex) further on.
add pCRow2, pCRow1, #32
// Same alpha-scale for the second accumulator pair v18/v19.
fmul v2.2d, v18.2d, alphaV0_R
fmls v2.2d, v19.2d, alphaV0_I
fmul v3.2d, v18.2d, alphaV1_I
fmla v3.2d, v19.2d, alphaV1_R
fmul v3.2d, v18.2d, alphaV0_I
fmla v3.2d, v19.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow2]
// Step the row pointer by four double-complex elements (64 bytes).
add pCRow0, pCRow0, #64
@ -974,17 +992,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
// NOTE(review): diff rendering — duplicated alphaV1_*/alphaV0_* sequences are
// the old/new sides of one hunk; the "@ -..." line is a hunk header.
//
// SAVE2x1: scale a 2x1 tile (two complex accumulators in v16=real parts,
// v17=imag parts) by alpha and store interleaved into one column of C.
.macro SAVE2x1
// Move alpha (real, imag) into the working registers (old/new diff pairs).
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
// v0 = Re(acc)*aR - Im(acc)*aI ; v1 = Re(acc)*aI + Im(acc)*aR
fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmul v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmul v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
// Interleaved store: {re0,im0,re1,im1} at *pCRow1.
st2 {v0.2d, v1.2d}, [pCRow1]
// Step the row pointer by two double-complex elements (32 bytes).
add pCRow0, pCRow0, #32
@ -1011,17 +1027,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
// NOTE(review): diff rendering; the closing .endm of this macro is elided by
// the following hunk header — confirm against the real source file.
//
// SAVE1x1: scale a single complex accumulator (d16=real, d17=imag) by alpha
// and store one double-complex element to C.
.macro SAVE1x1
// Move alpha (real, imag) into the working registers (old/new diff pairs).
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
// d0 = d16*aR - d17*aI : real part of alpha * acc
fmul d0, d16, alphaV0_R
fmls d0, d17, alphaV0_I
// d1 = d16*aI + d17*aR : imaginary part (old/new diff pair of the same op)
fmul d1, d16, alphaV1_I
fmla d1, d17, alphaV1_R
fmul d1, d16, alphaV0_I
fmla d1, d17, alphaV0_R
// Interleaved lane-0 store: {re, im} at *pCRow1.
st2 {v0.d, v1.d}[0], [pCRow1]
// Step the row pointer by one double-complex element (16 bytes).
add pCRow0, pCRow0, #16
@ -1047,8 +1061,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// NOTE(review): diff rendering of the ztrmm kernel prologue and the N=4/M=4
// inner loop. "@ -..." lines are hunk headers (omitted context between them),
// and some adjacent lines are old/new sides of the same change. The
// KERNEL4x4_* macro bodies are not visible here, so their descriptions below
// are inferred from the I/M1/M2/E/SUB naming convention — confirm against the
// macro definitions earlier in the real file.
// Prologue: save remaining callee-saved registers.
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
// Capture alpha from d0/d1 (old names alpha_save_*, new names alphaR/alphaI).
fmov alpha_save_R, d0
fmov alpha_save_I, d1
// New code warms the caches for the packed A and B buffers up front.
prfm PLDL1KEEP, [origPB]
prfm PLDL1KEEP, [origPA]
fmov alphaR, d0
fmov alphaI, d1
// Scale LDC to bytes: 16 bytes per double-complex element.
lsl LDC, LDC, #4 // ldc = ldc * 2 * 8
@ -1064,8 +1081,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ble ztrmm_kernel_L2_BEGIN
ztrmm_kernel_L4_BEGIN:
// Old code: single row pointer, pC advanced by 4*LDC.
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2
// New code: one pointer per output column of the 4-wide tile.
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
add pCRow3, pCRow2, LDC
add pC, pCRow3, LDC
#if defined(LEFT)
mov tempOffset, offset
@ -1079,6 +1101,7 @@ ztrmm_kernel_L4_M4_BEGIN:
cmp counterI, #0
ble ztrmm_kernel_L4_M2_BEGIN
// Align the hot loop entry to a 32-byte boundary.
.align 5
ztrmm_kernel_L4_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@ -1098,39 +1121,64 @@ ztrmm_kernel_L4_M4_20:
add tempK, tempOffset, #4
#endif
// Old: K unrolled by 2 (asr #1). New: K unrolled by 8 (asr #3); the
// matching remainder mask changes from #1 to #7 further down.
asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
asr counterL , tempK, #3
cmp counterL , #2
blt ztrmm_kernel_L4_M4_32
// Software-pipelined preamble: _I initializes the accumulators, then
// alternating _M2/_M1 stages keep the pipeline full (presumably ping-pong
// register banks — TODO confirm against the KERNEL4x4_* definitions).
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
KERNEL4x4_I
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #2
ble ztrmm_kernel_L4_M4_22a
.align 5
.align 5
// Steady-state loop: 8 K-iterations per pass.
ztrmm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M4_22
.align 5
// Pipeline drain: final pass ends with _E instead of _M2.
ztrmm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E
b ztrmm_kernel_L4_M4_44
.align 5
// Short-K path: fewer than 2 full unrolled passes available.
ztrmm_kernel_L4_M4_32:
tst counterL, #1
ble ztrmm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E
b ztrmm_kernel_L4_M4_44
@ -1142,12 +1190,16 @@ ztrmm_kernel_L4_M4_40:
ztrmm_kernel_L4_M4_44:
// Remainder iterations: old mask #1 (unroll 2), new mask #7 (unroll 8).
ands counterL , tempK, #1
ands counterL , tempK, #7
ble ztrmm_kernel_L4_M4_100
.align 5
ztrmm_kernel_L4_M4_46:
// _SUB handles one K-iteration at a time for the remainder.
KERNEL4x4_SUB
subs counterL, counterL, #1
bne ztrmm_kernel_L4_M4_46
ztrmm_kernel_L4_M4_100:
// Write the finished 4x4 tile of C (macro defined earlier in the file).
SAVE4x4
@ -1167,6 +1219,10 @@ ztrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4
#endif
// Prefetch the next panel of A (and B) before looping over M.
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
ztrmm_kernel_L4_M4_END:
subs counterI, counterI, #1
bne ztrmm_kernel_L4_M4_20

View File

@ -2341,13 +2341,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* NOTE(review): diff rendering of a param.h fragment — each duplicated
 * #define pair below is the old line followed by the new line of the same
 * hunk; only the second of each pair exists in the merged header. As shown,
 * a compiler would warn about the redefinitions. The ZGEMM pair is cut off
 * at the end of this view. */
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
/* SGEMM unroll in M: raised from 4 to 16. */
#define SGEMM_DEFAULT_UNROLL_M 4
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 4
/* DGEMM unroll in M: raised from 4 to 8. */
#define DGEMM_DEFAULT_UNROLL_M 4
#define DGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_N 4
/* CGEMM unroll in M: raised from 4 to 8. */
#define CGEMM_DEFAULT_UNROLL_M 4
#define CGEMM_DEFAULT_UNROLL_M 8
#define CGEMM_DEFAULT_UNROLL_N 4
/* ZGEMM unroll in M: old value; the replacement line is outside this view. */
#define ZGEMM_DEFAULT_UNROLL_M 4