Merge pull request #924 from ashwinyes/develop_aarch64_improvements_20160714
Improvements to Aarch64 kernels
This commit is contained in:
commit
8a592ee386
|
@ -173,7 +173,9 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
|||
sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
|
||||
spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \
|
||||
ssymm.goto dsymm.goto csymm.goto zsymm.goto \
|
||||
smallscaling
|
||||
smallscaling \
|
||||
isamax.goto idamax.goto icamax.goto izamax.goto \
|
||||
snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto
|
||||
|
||||
acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
|
||||
scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
|
||||
|
@ -226,7 +228,9 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \
|
|||
sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \
|
||||
sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \
|
||||
spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \
|
||||
ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas
|
||||
ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \
|
||||
isamax.atlas idamax.atlas icamax.atlas izamax.atlas \
|
||||
snrm2.atlas dnrm2.atlas scnrm2.atlas dznrm2.atlas
|
||||
|
||||
mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
|
||||
scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \
|
||||
|
@ -1937,6 +1941,63 @@ zgemm3m.mkl : zgemm3m.$(SUFFIX)
|
|||
zgemm3m.veclib : zgemm3m.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
############################################## ISAMAX ##############################################
|
||||
isamax.goto : isamax.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
||||
isamax.atlas : isamax.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
############################################## IDAMAX ##############################################
|
||||
idamax.goto : idamax.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
||||
idamax.atlas : idamax.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
############################################## ICAMAX ##############################################
|
||||
icamax.goto : icamax.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
||||
icamax.atlas : icamax.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
############################################## IZAMAX ##############################################
|
||||
izamax.goto : izamax.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
||||
izamax.atlas : izamax.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
############################################## SNRM2 ##############################################
|
||||
snrm2.goto : snrm2.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
||||
snrm2.atlas : snrm2.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
############################################## DNRM2 ##############################################
|
||||
dnrm2.goto : dnrm2.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
||||
dnrm2.atlas : dnrm2.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
############################################## SCNRM2 ##############################################
|
||||
scnrm2.goto : scnrm2.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
||||
scnrm2.atlas : scnrm2.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
############################################## DZNRM2 ##############################################
|
||||
dznrm2.goto : dznrm2.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
||||
dznrm2.atlas : dznrm2.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
|
||||
###################################################################################################
|
||||
|
||||
slinpack.$(SUFFIX) : linpack.c
|
||||
|
@ -2243,6 +2304,33 @@ cgemm3m.$(SUFFIX) : gemm3m.c
|
|||
zgemm3m.$(SUFFIX) : gemm3m.c
|
||||
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
|
||||
|
||||
|
||||
isamax.$(SUFFIX) : iamax.c
|
||||
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
|
||||
|
||||
idamax.$(SUFFIX) : iamax.c
|
||||
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
|
||||
|
||||
icamax.$(SUFFIX) : iamax.c
|
||||
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
|
||||
|
||||
izamax.$(SUFFIX) : iamax.c
|
||||
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
|
||||
|
||||
|
||||
snrm2.$(SUFFIX) : nrm2.c
|
||||
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
|
||||
|
||||
dnrm2.$(SUFFIX) : nrm2.c
|
||||
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
|
||||
|
||||
scnrm2.$(SUFFIX) : nrm2.c
|
||||
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
|
||||
|
||||
dznrm2.$(SUFFIX) : nrm2.c
|
||||
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
|
||||
|
||||
|
||||
smallscaling: smallscaling.c ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm -lpthread
|
||||
|
||||
|
|
|
@ -183,9 +183,9 @@ int main(int argc, char *argv[]){
|
|||
timeg /= loops;
|
||||
|
||||
#ifdef COMPLEX
|
||||
fprintf(stderr, " %10.2f MFlops\n", 4. * (double)m / timeg * 1.e-6);
|
||||
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 4. * (double)m / timeg * 1.e-6, timeg);
|
||||
#else
|
||||
fprintf(stderr, " %10.2f MFlops\n", 2. * (double)m / timeg * 1.e-6);
|
||||
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 2. * (double)m / timeg * 1.e-6, timeg);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
|
|
@ -190,8 +190,8 @@ int main(int argc, char *argv[]){
|
|||
timeg /= loops;
|
||||
|
||||
fprintf(stderr,
|
||||
" %10.2f MFlops\n",
|
||||
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);
|
||||
" %10.2f MFlops %10.6f sec\n",
|
||||
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg);
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -190,8 +190,8 @@ int main(int argc, char *argv[]){
|
|||
timeg /= loops;
|
||||
|
||||
fprintf(stderr,
|
||||
" %10.2f MBytes\n",
|
||||
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6);
|
||||
" %10.2f MBytes %10.6f sec\n",
|
||||
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -184,8 +184,8 @@ int main(int argc, char *argv[]){
|
|||
timeg /= loops;
|
||||
|
||||
fprintf(stderr,
|
||||
" %10.2f MFlops\n",
|
||||
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);
|
||||
" %10.2f MFlops %10.6f sec\n",
|
||||
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg);
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -221,7 +221,7 @@ int main(int argc, char *argv[]){
|
|||
|
||||
timeg /= loops;
|
||||
|
||||
fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6);
|
||||
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6, timeg);
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -258,7 +258,7 @@ int main(int argc, char *argv[]){
|
|||
|
||||
timeg /= loops;
|
||||
|
||||
fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6);
|
||||
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6, timeg);
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,190 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#ifdef __CYGWIN32__
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#undef IAMAX
|
||||
|
||||
#ifdef COMPLEX
|
||||
#ifdef DOUBLE
|
||||
#define IAMAX BLASFUNC(izamax)
|
||||
#else
|
||||
#define IAMAX BLASFUNC(icamax)
|
||||
#endif
|
||||
#else
|
||||
#ifdef DOUBLE
|
||||
#define IAMAX BLASFUNC(idamax)
|
||||
#else
|
||||
#define IAMAX BLASFUNC(isamax)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__WIN32__) || defined(__WIN64__)
|
||||
|
||||
#ifndef DELTA_EPOCH_IN_MICROSECS
|
||||
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
|
||||
#endif
|
||||
|
||||
int gettimeofday(struct timeval *tv, void *tz){
|
||||
|
||||
FILETIME ft;
|
||||
unsigned __int64 tmpres = 0;
|
||||
static int tzflag;
|
||||
|
||||
if (NULL != tv)
|
||||
{
|
||||
GetSystemTimeAsFileTime(&ft);
|
||||
|
||||
tmpres |= ft.dwHighDateTime;
|
||||
tmpres <<= 32;
|
||||
tmpres |= ft.dwLowDateTime;
|
||||
|
||||
/*converting file time to unix epoch*/
|
||||
tmpres /= 10; /*convert into microseconds*/
|
||||
tmpres -= DELTA_EPOCH_IN_MICROSECS;
|
||||
tv->tv_sec = (long)(tmpres / 1000000UL);
|
||||
tv->tv_usec = (long)(tmpres % 1000000UL);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
|
||||
|
||||
static void *huge_malloc(BLASLONG size){
|
||||
int shmid;
|
||||
void *address;
|
||||
|
||||
#ifndef SHM_HUGETLB
|
||||
#define SHM_HUGETLB 04000
|
||||
#endif
|
||||
|
||||
if ((shmid =shmget(IPC_PRIVATE,
|
||||
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
|
||||
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
|
||||
printf( "Memory allocation failed(shmget).\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
address = shmat(shmid, NULL, SHM_RND);
|
||||
|
||||
if ((BLASLONG)address == -1){
|
||||
printf( "Memory allocation failed(shmat).\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
shmctl(shmid, IPC_RMID, 0);
|
||||
|
||||
return address;
|
||||
}
|
||||
|
||||
#define malloc huge_malloc
|
||||
|
||||
#endif
|
||||
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *x;
|
||||
blasint m, i;
|
||||
blasint inc_x=1;
|
||||
int loops = 1;
|
||||
int l;
|
||||
char *p;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
|
||||
struct timeval start, stop;
|
||||
double time1,timeg;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
|
||||
|
||||
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
#ifdef linux
|
||||
srandom(getpid());
|
||||
#endif
|
||||
|
||||
fprintf(stderr, " SIZE Time\n");
|
||||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
|
||||
timeg=0;
|
||||
|
||||
fprintf(stderr, " %6d : ", (int)m);
|
||||
|
||||
|
||||
for (l=0; l<loops; l++)
|
||||
{
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
|
||||
IAMAX (&m, x, &inc_x);
|
||||
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
|
||||
timeg += time1;
|
||||
|
||||
}
|
||||
|
||||
timeg /= loops;
|
||||
|
||||
fprintf(stderr, " %10.6f secs\n", timeg);
|
||||
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
|
@ -0,0 +1,190 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#ifdef __CYGWIN32__
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#undef NRM2
|
||||
|
||||
#ifdef COMPLEX
|
||||
#ifdef DOUBLE
|
||||
#define NRM2 BLASFUNC(dznrm2)
|
||||
#else
|
||||
#define NRM2 BLASFUNC(scnrm2)
|
||||
#endif
|
||||
#else
|
||||
#ifdef DOUBLE
|
||||
#define NRM2 BLASFUNC(dnrm2)
|
||||
#else
|
||||
#define NRM2 BLASFUNC(snrm2)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__WIN32__) || defined(__WIN64__)
|
||||
|
||||
#ifndef DELTA_EPOCH_IN_MICROSECS
|
||||
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
|
||||
#endif
|
||||
|
||||
int gettimeofday(struct timeval *tv, void *tz){
|
||||
|
||||
FILETIME ft;
|
||||
unsigned __int64 tmpres = 0;
|
||||
static int tzflag;
|
||||
|
||||
if (NULL != tv)
|
||||
{
|
||||
GetSystemTimeAsFileTime(&ft);
|
||||
|
||||
tmpres |= ft.dwHighDateTime;
|
||||
tmpres <<= 32;
|
||||
tmpres |= ft.dwLowDateTime;
|
||||
|
||||
/*converting file time to unix epoch*/
|
||||
tmpres /= 10; /*convert into microseconds*/
|
||||
tmpres -= DELTA_EPOCH_IN_MICROSECS;
|
||||
tv->tv_sec = (long)(tmpres / 1000000UL);
|
||||
tv->tv_usec = (long)(tmpres % 1000000UL);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
|
||||
|
||||
static void *huge_malloc(BLASLONG size){
|
||||
int shmid;
|
||||
void *address;
|
||||
|
||||
#ifndef SHM_HUGETLB
|
||||
#define SHM_HUGETLB 04000
|
||||
#endif
|
||||
|
||||
if ((shmid =shmget(IPC_PRIVATE,
|
||||
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
|
||||
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
|
||||
printf( "Memory allocation failed(shmget).\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
address = shmat(shmid, NULL, SHM_RND);
|
||||
|
||||
if ((BLASLONG)address == -1){
|
||||
printf( "Memory allocation failed(shmat).\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
shmctl(shmid, IPC_RMID, 0);
|
||||
|
||||
return address;
|
||||
}
|
||||
|
||||
#define malloc huge_malloc
|
||||
|
||||
#endif
|
||||
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *x;
|
||||
blasint m, i;
|
||||
blasint inc_x=1;
|
||||
int loops = 1;
|
||||
int l;
|
||||
char *p;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
|
||||
struct timeval start, stop;
|
||||
double time1,timeg;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
|
||||
|
||||
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
#ifdef linux
|
||||
srandom(getpid());
|
||||
#endif
|
||||
|
||||
fprintf(stderr, " SIZE Time\n");
|
||||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
|
||||
timeg=0;
|
||||
|
||||
fprintf(stderr, " %6d : ", (int)m);
|
||||
|
||||
|
||||
for (l=0; l<loops; l++)
|
||||
{
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
|
||||
NRM2 (&m, x, &inc_x);
|
||||
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
|
||||
timeg += time1;
|
||||
|
||||
}
|
||||
|
||||
timeg /= loops;
|
||||
|
||||
fprintf(stderr, " %10.6f secs\n", timeg);
|
||||
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
|
@ -186,8 +186,8 @@ int main(int argc, char *argv[]){
|
|||
timeg /= loops;
|
||||
|
||||
fprintf(stderr,
|
||||
" %10.2f MFlops\n",
|
||||
COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6);
|
||||
" %10.2f MFlops %10.6f sec\n",
|
||||
COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6, timeg);
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -189,9 +189,9 @@ int main(int argc, char *argv[]){
|
|||
timeg /= loops;
|
||||
|
||||
#ifdef COMPLEX
|
||||
fprintf(stderr, " %10.2f MFlops\n", 6. * (double)m / timeg * 1.e-6);
|
||||
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 6. * (double)m / timeg * 1.e-6, timeg);
|
||||
#else
|
||||
fprintf(stderr, " %10.2f MFlops\n", 1. * (double)m / timeg * 1.e-6);
|
||||
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 1. * (double)m / timeg * 1.e-6, timeg);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
|
|
@ -190,8 +190,8 @@ int main(int argc, char *argv[]){
|
|||
timeg /= loops;
|
||||
|
||||
fprintf(stderr,
|
||||
" %10.2f MBytes\n",
|
||||
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6);
|
||||
" %10.2f MBytes %10.6f sec\n",
|
||||
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -191,8 +191,8 @@ int main(int argc, char *argv[]){
|
|||
gettimeofday( &start, (struct timezone *)0);
|
||||
|
||||
fprintf(stderr,
|
||||
" %10.2f MFlops\n",
|
||||
COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6);
|
||||
" %10.2f MFlops %10.6f sec\n",
|
||||
COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6, time1);
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -184,8 +184,8 @@ int main(int argc, char *argv[]){
|
|||
timeg /= loops;
|
||||
|
||||
fprintf(stderr,
|
||||
" %10.2f MFlops\n",
|
||||
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);
|
||||
" %10.2f MFlops %10.6f sec\n",
|
||||
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg);
|
||||
|
||||
}
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -58,43 +58,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
str TMPF, [Y], #SZ
|
||||
#else
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v0.2s}, [X], #8
|
||||
st1 {v0.2s}, [Y], #8
|
||||
ldr d0, [X], #8
|
||||
str d0, [Y], #8
|
||||
#else
|
||||
ld1 {v0.2d}, [X], #16
|
||||
st1 {v0.2d}, [Y], #16
|
||||
ldr q0, [X], #16
|
||||
str q0, [Y], #16
|
||||
#endif
|
||||
#endif
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL_F4
|
||||
|
||||
#if !defined(COMPLEX)
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v0.4s}, [X], #16
|
||||
st1 {v0.4s}, [Y], #16
|
||||
ldr q0, [X], #16
|
||||
str q0, [Y], #16
|
||||
#else // DOUBLE
|
||||
ld1 {v0.4s}, [X], #16
|
||||
ld1 {v1.4s}, [X], #16
|
||||
st1 {v0.4s}, [Y], #16
|
||||
st1 {v1.4s}, [Y], #16
|
||||
ldr q0, [X], #16
|
||||
str q0, [Y], #16
|
||||
ldr q1, [X], #16
|
||||
str q1, [Y], #16
|
||||
|
||||
#endif
|
||||
#else // COMPLEX
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v0.4s}, [X], #16
|
||||
ld1 {v1.4s}, [X], #16
|
||||
st1 {v0.4s}, [Y], #16
|
||||
st1 {v1.4s}, [Y], #16
|
||||
ldr q0, [X], #16
|
||||
str q0, [Y], #16
|
||||
ldr q1, [X], #16
|
||||
str q1, [Y], #16
|
||||
#else // DOUBLE
|
||||
ld1 {v0.4s}, [X], #16
|
||||
ld1 {v1.4s}, [X], #16
|
||||
ld1 {v2.4s}, [X], #16
|
||||
ld1 {v3.4s}, [X], #16
|
||||
st1 {v0.4s}, [Y], #16
|
||||
st1 {v1.4s}, [Y], #16
|
||||
st1 {v2.4s}, [Y], #16
|
||||
st1 {v3.4s}, [Y], #16
|
||||
ldr q0, [X], #16
|
||||
str q0, [Y], #16
|
||||
ldr q1, [X], #16
|
||||
str q1, [Y], #16
|
||||
ldr q2, [X], #16
|
||||
str q2, [Y], #16
|
||||
ldr q3, [X], #16
|
||||
str q3, [Y], #16
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -46,19 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define pCRow0 x12
|
||||
#define pCRow1 x13
|
||||
#define pCRow2 x14
|
||||
#define pA x15
|
||||
#define temp x16
|
||||
#define tempOffset x17
|
||||
#define tempK x18
|
||||
#define pCRow3 x15
|
||||
#define pA x16
|
||||
#define alpha x17
|
||||
#define temp x18
|
||||
#define tempOffset x19
|
||||
#define tempK x20
|
||||
|
||||
#define alpha0 d10
|
||||
#define alphaV0 v10.d[0]
|
||||
#define alpha1 d11
|
||||
#define alphaV1 v11.d[0]
|
||||
#define alpha2 d14
|
||||
#define alphaV2 v14.d[0]
|
||||
#define alpha3 d15
|
||||
#define alphaV3 v15.d[0]
|
||||
|
||||
#define A_PRE_SIZE 2560
|
||||
#define B_PRE_SIZE 448
|
||||
#define C_PRE_SIZE 128
|
||||
|
||||
// 00 origM
|
||||
// 01 origN
|
||||
|
@ -101,14 +101,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
//v05 pA1_2, pA1_3
|
||||
//v06 pA1_4, pA1_5
|
||||
//v07 pA1_6, pA1_7
|
||||
//v08 must save pB0_0, pB0_1
|
||||
//v09 must save pB0_2, pB0_3
|
||||
//v10 must save ALPHA0
|
||||
//v11 must save ALPHA1
|
||||
//v12 must save pB1_0, pB1_1
|
||||
//v13 must save pB1_2, pB1_3
|
||||
//v14 must save ALPHA2
|
||||
//v15 must save ALPHA3
|
||||
//v08 must save pB0_0
|
||||
//v09 must save pB0_1
|
||||
//v10 must save pB0_2 --> ALPHA0
|
||||
//v11 must save pB0_3
|
||||
//v12 must save pB1_0
|
||||
//v13 must save pB1_1
|
||||
//v14 must save pB1_2
|
||||
//v15 must save pB1_3
|
||||
//v16 must save C00, C01
|
||||
//v17 must save C02, C03
|
||||
//v18 C04, C05
|
||||
|
@ -150,186 +150,249 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL8x4_I
|
||||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ld1 {v8.2d, v9.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld1 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ldp q0, q1, [pA], #32
|
||||
|
||||
ldp d8, d9, [pB], #16
|
||||
|
||||
fmul v16.2d, v0.2d, v8.d[0]
|
||||
fmul v20.2d, v0.2d, v9.d[0]
|
||||
|
||||
ldp d10, d11, [pB], #16
|
||||
|
||||
fmul v17.2d, v1.2d, v8.d[0]
|
||||
fmul v21.2d, v1.2d, v9.d[0]
|
||||
|
||||
ldp q2, q3, [pA], #32
|
||||
|
||||
fmul v24.2d, v0.2d, v10.d[0]
|
||||
fmul v28.2d, v0.2d, v11.d[0]
|
||||
|
||||
ldp q4, q5, [pA], #32
|
||||
|
||||
fmul v25.2d, v1.2d, v10.d[0]
|
||||
fmul v29.2d, v1.2d, v11.d[0]
|
||||
|
||||
ldp d12, d13, [pB], #16
|
||||
|
||||
fmul v18.2d, v2.2d, v8.d[0]
|
||||
fmul v22.2d, v2.2d, v9.d[0]
|
||||
|
||||
ldp d14, d15, [pB], #16
|
||||
|
||||
fmul v26.2d, v2.2d, v10.d[0]
|
||||
fmul v30.2d, v2.2d, v11.d[0]
|
||||
|
||||
ldp q6, q7, [pA], #32
|
||||
|
||||
fmul v19.2d, v3.2d, v8.d[0]
|
||||
fmul v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
fmul v20.2d, v0.2d, v8.d[1]
|
||||
fmul v21.2d, v1.2d, v8.d[1]
|
||||
fmul v22.2d, v2.2d, v8.d[1]
|
||||
fmul v23.2d, v3.2d, v8.d[1]
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
fmul v24.2d, v0.2d, v9.d[0]
|
||||
fmul v25.2d, v1.2d, v9.d[0]
|
||||
fmul v26.2d, v2.2d, v9.d[0]
|
||||
fmul v27.2d, v3.2d, v9.d[0]
|
||||
fmul v31.2d, v3.2d, v11.d[0]
|
||||
fmul v23.2d, v3.2d, v9.d[0]
|
||||
|
||||
fmul v28.2d, v0.2d, v9.d[1]
|
||||
fmul v29.2d, v1.2d, v9.d[1]
|
||||
fmul v30.2d, v2.2d, v9.d[1]
|
||||
fmul v31.2d, v3.2d, v9.d[1]
|
||||
|
||||
ld1 {v4.2d, v5.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ld1 {v12.2d, v13.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld1 {v6.2d, v7.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_M1
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v20.2d, v0.2d, v9.d[0]
|
||||
|
||||
ldp q4, q5, [pA], #32
|
||||
|
||||
fmla v24.2d, v0.2d, v10.d[0]
|
||||
fmla v28.2d, v0.2d, v11.d[0]
|
||||
|
||||
ldp d12, d13, [pB], #16
|
||||
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
fmla v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
|
||||
fmla v21.2d, v1.2d, v9.d[0]
|
||||
fmla v29.2d, v1.2d, v11.d[0]
|
||||
|
||||
ldp d14, d15, [pB], #16
|
||||
|
||||
fmla v18.2d, v2.2d, v8.d[0]
|
||||
fmla v22.2d, v2.2d, v9.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
fmla v26.2d, v2.2d, v10.d[0]
|
||||
fmla v30.2d, v2.2d, v11.d[0]
|
||||
fmla v19.2d, v3.2d, v8.d[0]
|
||||
fmla v23.2d, v3.2d, v9.d[0]
|
||||
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v21.2d, v1.2d, v8.d[1]
|
||||
fmla v22.2d, v2.2d, v8.d[1]
|
||||
fmla v23.2d, v3.2d, v8.d[1]
|
||||
ldp q6, q7, [pA], #32
|
||||
|
||||
fmla v24.2d, v0.2d, v9.d[0]
|
||||
fmla v25.2d, v1.2d, v9.d[0]
|
||||
fmla v26.2d, v2.2d, v9.d[0]
|
||||
fmla v27.2d, v3.2d, v9.d[0]
|
||||
|
||||
fmla v28.2d, v0.2d, v9.d[1]
|
||||
fmla v29.2d, v1.2d, v9.d[1]
|
||||
fmla v30.2d, v2.2d, v9.d[1]
|
||||
fmla v31.2d, v3.2d, v9.d[1]
|
||||
|
||||
ld1 {v4.2d, v5.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ld1 {v12.2d, v13.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld1 {v6.2d, v7.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
fmla v27.2d, v3.2d, v10.d[0]
|
||||
fmla v31.2d, v3.2d, v11.d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_M2
|
||||
fmla v16.2d, v4.2d, v12.d[0]
|
||||
fmla v20.2d, v4.2d, v13.d[0]
|
||||
fmla v24.2d, v4.2d, v14.d[0]
|
||||
fmla v28.2d, v4.2d, v15.d[0]
|
||||
|
||||
ldp q0, q1, [pA], #32
|
||||
|
||||
fmla v17.2d, v5.2d, v12.d[0]
|
||||
fmla v25.2d, v5.2d, v14.d[0]
|
||||
|
||||
ldp d8, d9, [pB], #16
|
||||
|
||||
fmla v21.2d, v5.2d, v13.d[0]
|
||||
fmla v29.2d, v5.2d, v15.d[0]
|
||||
|
||||
ldp d10, d11, [pB], #16
|
||||
|
||||
fmla v18.2d, v6.2d, v12.d[0]
|
||||
fmla v22.2d, v6.2d, v13.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
fmla v26.2d, v6.2d, v14.d[0]
|
||||
fmla v30.2d, v6.2d, v15.d[0]
|
||||
|
||||
fmla v19.2d, v7.2d, v12.d[0]
|
||||
fmla v23.2d, v7.2d, v13.d[0]
|
||||
|
||||
fmla v20.2d, v4.2d, v12.d[1]
|
||||
fmla v21.2d, v5.2d, v12.d[1]
|
||||
fmla v22.2d, v6.2d, v12.d[1]
|
||||
fmla v23.2d, v7.2d, v12.d[1]
|
||||
ldp q2, q3, [pA], #32
|
||||
|
||||
fmla v24.2d, v4.2d, v13.d[0]
|
||||
fmla v25.2d, v5.2d, v13.d[0]
|
||||
fmla v26.2d, v6.2d, v13.d[0]
|
||||
fmla v27.2d, v7.2d, v13.d[0]
|
||||
|
||||
fmla v28.2d, v4.2d, v13.d[1]
|
||||
fmla v29.2d, v5.2d, v13.d[1]
|
||||
fmla v30.2d, v6.2d, v13.d[1]
|
||||
fmla v31.2d, v7.2d, v13.d[1]
|
||||
|
||||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ld1 {v8.2d, v9.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld1 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
fmla v27.2d, v7.2d, v14.d[0]
|
||||
fmla v31.2d, v7.2d, v15.d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_E
|
||||
fmla v16.2d, v4.2d, v12.d[0]
|
||||
fmla v20.2d, v4.2d, v13.d[0]
|
||||
fmla v24.2d, v4.2d, v14.d[0]
|
||||
fmla v28.2d, v4.2d, v15.d[0]
|
||||
|
||||
fmla v17.2d, v5.2d, v12.d[0]
|
||||
fmla v25.2d, v5.2d, v14.d[0]
|
||||
fmla v21.2d, v5.2d, v13.d[0]
|
||||
fmla v29.2d, v5.2d, v15.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
fmla v18.2d, v6.2d, v12.d[0]
|
||||
fmla v22.2d, v6.2d, v13.d[0]
|
||||
fmla v26.2d, v6.2d, v14.d[0]
|
||||
fmla v30.2d, v6.2d, v15.d[0]
|
||||
|
||||
fmla v19.2d, v7.2d, v12.d[0]
|
||||
|
||||
fmla v20.2d, v4.2d, v12.d[1]
|
||||
fmla v21.2d, v5.2d, v12.d[1]
|
||||
fmla v22.2d, v6.2d, v12.d[1]
|
||||
fmla v23.2d, v7.2d, v12.d[1]
|
||||
|
||||
fmla v24.2d, v4.2d, v13.d[0]
|
||||
fmla v25.2d, v5.2d, v13.d[0]
|
||||
fmla v26.2d, v6.2d, v13.d[0]
|
||||
fmla v27.2d, v7.2d, v13.d[0]
|
||||
|
||||
fmla v28.2d, v4.2d, v13.d[1]
|
||||
fmla v29.2d, v5.2d, v13.d[1]
|
||||
fmla v30.2d, v6.2d, v13.d[1]
|
||||
fmla v31.2d, v7.2d, v13.d[1]
|
||||
fmla v23.2d, v7.2d, v13.d[0]
|
||||
fmla v27.2d, v7.2d, v14.d[0]
|
||||
fmla v31.2d, v7.2d, v15.d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_SUB
|
||||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ld1 {v8.2d, v9.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld1 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ldp q0, q1, [pA], #32
|
||||
|
||||
ldp d8, d9, [pB], #16
|
||||
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v20.2d, v0.2d, v9.d[0]
|
||||
|
||||
ldp d10, d11, [pB], #16
|
||||
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
fmla v21.2d, v1.2d, v9.d[0]
|
||||
|
||||
ldp q2, q3, [pA], #32
|
||||
|
||||
fmla v24.2d, v0.2d, v10.d[0]
|
||||
fmla v28.2d, v0.2d, v11.d[0]
|
||||
|
||||
fmla v25.2d, v1.2d, v10.d[0]
|
||||
fmla v29.2d, v1.2d, v11.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
fmla v18.2d, v2.2d, v8.d[0]
|
||||
fmla v22.2d, v2.2d, v9.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
|
||||
fmla v26.2d, v2.2d, v10.d[0]
|
||||
fmla v30.2d, v2.2d, v11.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
fmla v19.2d, v3.2d, v8.d[0]
|
||||
fmla v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v21.2d, v1.2d, v8.d[1]
|
||||
fmla v22.2d, v2.2d, v8.d[1]
|
||||
fmla v23.2d, v3.2d, v8.d[1]
|
||||
|
||||
fmla v24.2d, v0.2d, v9.d[0]
|
||||
fmla v25.2d, v1.2d, v9.d[0]
|
||||
fmla v26.2d, v2.2d, v9.d[0]
|
||||
fmla v27.2d, v3.2d, v9.d[0]
|
||||
|
||||
fmla v28.2d, v0.2d, v9.d[1]
|
||||
fmla v29.2d, v1.2d, v9.d[1]
|
||||
fmla v30.2d, v2.2d, v9.d[1]
|
||||
fmla v31.2d, v3.2d, v9.d[1]
|
||||
fmla v31.2d, v3.2d, v11.d[0]
|
||||
fmla v23.2d, v3.2d, v9.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x4
|
||||
add pCRow1, pCRow0, LDC
|
||||
fmov alpha0, alpha
|
||||
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
fmul v0.2d, v16.2d, alphaV0
|
||||
fmul v1.2d, v17.2d, alphaV1
|
||||
fmul v2.2d, v18.2d, alphaV2
|
||||
fmul v3.2d, v19.2d, alphaV3
|
||||
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
|
||||
fmul v1.2d, v17.2d, alphaV0
|
||||
stp q0, q1, [pCRow0]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
add pCRow0, pCRow0, #32
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
fmul v2.2d, v18.2d, alphaV0
|
||||
fmul v3.2d, v19.2d, alphaV0
|
||||
stp q2, q3, [pCRow0]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
fmul v4.2d, v20.2d, alphaV0
|
||||
fmul v5.2d, v21.2d, alphaV1
|
||||
fmul v6.2d, v22.2d, alphaV2
|
||||
fmul v7.2d, v23.2d, alphaV3
|
||||
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
|
||||
fmul v5.2d, v21.2d, alphaV0
|
||||
stp q4, q5, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
add pCRow1, pCRow1, #32
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
fmul v6.2d, v22.2d, alphaV0
|
||||
fmul v7.2d, v23.2d, alphaV0
|
||||
stp q6, q7, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, #32
|
||||
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
fmul v0.2d, v24.2d, alphaV0
|
||||
fmul v1.2d, v25.2d, alphaV1
|
||||
fmul v2.2d, v26.2d, alphaV2
|
||||
fmul v3.2d, v27.2d, alphaV3
|
||||
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow2]
|
||||
fmul v1.2d, v25.2d, alphaV0
|
||||
stp q0, q1, [pCRow2]
|
||||
|
||||
add pCRow2, pCRow2, #32
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
fmul v2.2d, v26.2d, alphaV0
|
||||
fmul v3.2d, v27.2d, alphaV0
|
||||
stp q2, q3, [pCRow2]
|
||||
|
||||
add pCRow2, pCRow2, #32
|
||||
|
||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||
|
||||
fmul v4.2d, v28.2d, alphaV0
|
||||
fmul v5.2d, v29.2d, alphaV1
|
||||
fmul v6.2d, v30.2d, alphaV2
|
||||
fmul v7.2d, v31.2d, alphaV3
|
||||
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
|
||||
fmul v5.2d, v29.2d, alphaV0
|
||||
stp q4, q5, [pCRow3]
|
||||
|
||||
add pCRow0, pCRow0, #64
|
||||
add pCRow3, pCRow3, #32
|
||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||
|
||||
fmul v6.2d, v30.2d, alphaV0
|
||||
fmul v7.2d, v31.2d, alphaV0
|
||||
stp q6, q7, [pCRow3]
|
||||
|
||||
add pCRow3, pCRow3, #32
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
@ -365,26 +428,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
fmov alpha0, alpha
|
||||
fmul v8.2d, v16.2d, alphaV0
|
||||
fmul v9.2d, v17.2d, alphaV1
|
||||
fmul v9.2d, v17.2d, alphaV0
|
||||
st1 {v8.2d, v9.2d}, [pCRow0]
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
|
||||
fmul v12.2d, v20.2d, alphaV2
|
||||
fmul v13.2d, v21.2d, alphaV3
|
||||
fmul v12.2d, v20.2d, alphaV0
|
||||
fmul v13.2d, v21.2d, alphaV0
|
||||
st1 {v12.2d, v13.2d}, [pCRow1]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
|
||||
fmul v8.2d, v24.2d, alphaV0
|
||||
fmul v9.2d, v25.2d, alphaV1
|
||||
fmul v9.2d, v25.2d, alphaV0
|
||||
st1 {v8.2d, v9.2d}, [pCRow2]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
|
||||
fmul v12.2d, v28.2d, alphaV2
|
||||
fmul v13.2d, v29.2d, alphaV3
|
||||
fmul v12.2d, v28.2d, alphaV0
|
||||
fmul v13.2d, v29.2d, alphaV0
|
||||
st1 {v12.2d, v13.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
@ -413,22 +477,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
fmov alpha0, alpha
|
||||
fmul v8.2d, v16.2d, alphaV0
|
||||
st1 {v8.2d}, [pCRow0]
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
|
||||
fmul v12.2d, v20.2d, alphaV1
|
||||
fmul v12.2d, v20.2d, alphaV0
|
||||
st1 {v12.2d}, [pCRow1]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
|
||||
fmul v8.2d, v24.2d, alphaV2
|
||||
fmul v8.2d, v24.2d, alphaV0
|
||||
st1 {v8.2d}, [pCRow2]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
|
||||
fmul v12.2d, v28.2d, alphaV3
|
||||
fmul v12.2d, v28.2d, alphaV0
|
||||
st1 {v12.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #16
|
||||
|
@ -453,6 +518,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE1x4
|
||||
fmov alpha0, alpha
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
|
||||
fmul v8.2d, v16.2d, alphaV0
|
||||
|
@ -462,7 +529,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
add pCRow2, pCRow1, LDC
|
||||
add pCRow1, pCRow2, LDC
|
||||
|
||||
fmul v12.2d, v20.2d, alphaV1
|
||||
fmul v12.2d, v20.2d, alphaV0
|
||||
st1 {v12.d}[0], [pCRow2]
|
||||
st1 {v12.d}[1], [pCRow1]
|
||||
|
||||
|
@ -502,18 +569,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE8x2
|
||||
fmov alpha0, alpha
|
||||
add pCRow1, pCRow0, LDC
|
||||
|
||||
fmul v0.2d, v16.2d, alphaV0
|
||||
fmul v1.2d, v17.2d, alphaV1
|
||||
fmul v2.2d, v18.2d, alphaV2
|
||||
fmul v3.2d, v19.2d, alphaV3
|
||||
fmul v1.2d, v17.2d, alphaV0
|
||||
fmul v2.2d, v18.2d, alphaV0
|
||||
fmul v3.2d, v19.2d, alphaV0
|
||||
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
|
||||
|
||||
fmul v4.2d, v20.2d, alphaV0
|
||||
fmul v5.2d, v21.2d, alphaV1
|
||||
fmul v6.2d, v22.2d, alphaV2
|
||||
fmul v7.2d, v23.2d, alphaV3
|
||||
fmul v5.2d, v21.2d, alphaV0
|
||||
fmul v6.2d, v22.2d, alphaV0
|
||||
fmul v7.2d, v23.2d, alphaV0
|
||||
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #64
|
||||
|
@ -541,14 +609,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
fmov alpha0, alpha
|
||||
fmul v8.2d, v16.2d, alphaV0
|
||||
fmul v9.2d, v17.2d, alphaV1
|
||||
fmul v9.2d, v17.2d, alphaV0
|
||||
st1 {v8.2d, v9.2d}, [pCRow0]
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
|
||||
fmul v12.2d, v20.2d, alphaV2
|
||||
fmul v13.2d, v21.2d, alphaV3
|
||||
fmul v12.2d, v20.2d, alphaV0
|
||||
fmul v13.2d, v21.2d, alphaV0
|
||||
st1 {v12.2d, v13.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
@ -573,12 +642,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
fmov alpha0, alpha
|
||||
fmul v8.2d, v16.2d, alphaV0
|
||||
st1 {v8.2d}, [pCRow0]
|
||||
|
||||
add pCRow1 , pCRow0, LDC
|
||||
|
||||
fmul v12.2d, v20.2d, alphaV1
|
||||
fmul v12.2d, v20.2d, alphaV0
|
||||
st1 {v12.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #16
|
||||
|
@ -601,6 +671,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
fmov alpha0, alpha
|
||||
add pCRow1 , pCRow0, LDC
|
||||
|
||||
fmul v8.2d, v16.2d, alphaV0
|
||||
|
@ -636,10 +707,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE8x1
|
||||
fmov alpha0, alpha
|
||||
fmul v0.2d, v16.2d, alphaV0
|
||||
fmul v1.2d, v17.2d, alphaV1
|
||||
fmul v2.2d, v18.2d, alphaV2
|
||||
fmul v3.2d, v19.2d, alphaV3
|
||||
fmul v1.2d, v17.2d, alphaV0
|
||||
fmul v2.2d, v18.2d, alphaV0
|
||||
fmul v3.2d, v19.2d, alphaV0
|
||||
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
|
||||
|
||||
add pCRow0, pCRow0, #64
|
||||
|
@ -665,8 +737,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE4x1
|
||||
fmov alpha0, alpha
|
||||
fmul v8.2d, v16.2d, alphaV0
|
||||
fmul v9.2d, v17.2d, alphaV1
|
||||
fmul v9.2d, v17.2d, alphaV0
|
||||
st1 {v8.2d, v9.2d}, [pCRow0]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
@ -690,6 +763,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE2x1
|
||||
fmov alpha0, alpha
|
||||
fmul v8.2d, v16.2d, alphaV0
|
||||
st1 {v8.2d}, [pCRow0]
|
||||
|
||||
|
@ -713,6 +787,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE1x1
|
||||
fmov alpha0, alpha
|
||||
fmul d8, d16, alpha0
|
||||
str d8, [pCRow0]
|
||||
|
||||
|
@ -739,10 +814,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
stp x26, x27, [sp, #(9 * 16)]
|
||||
str x28, [sp, #(10 * 16)]
|
||||
|
||||
fmov alpha0, d0
|
||||
fmov alpha1, d0
|
||||
fmov alpha2, d0
|
||||
fmov alpha3, d0
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
prfm PLDL1KEEP, [origPA]
|
||||
|
||||
fmov alpha, d0
|
||||
|
||||
lsl LDC, LDC, #3 // ldc = ldc * 8
|
||||
|
||||
|
@ -759,8 +834,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
/******************************************************************************/
|
||||
|
||||
dtrmm_kernel_L4_BEGIN:
|
||||
mov pCRow0, pC // pCRow0 = C
|
||||
add pC, pC, LDC, lsl #2
|
||||
mov pCRow0, pC
|
||||
add pCRow1, pCRow0, LDC
|
||||
add pCRow2, pCRow1, LDC
|
||||
add pCRow3, pCRow2, LDC
|
||||
|
||||
add pC, pCRow3, LDC
|
||||
|
||||
|
||||
#if defined(LEFT)
|
||||
mov tempOffset, offset
|
||||
|
@ -774,6 +854,7 @@ dtrmm_kernel_L4_M8_BEGIN:
|
|||
cmp counterI, #0
|
||||
ble dtrmm_kernel_L4_M4_BEGIN
|
||||
|
||||
.align 5
|
||||
dtrmm_kernel_L4_M8_20:
|
||||
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
|
@ -794,40 +875,64 @@ dtrmm_kernel_L4_M8_20:
|
|||
add tempK, tempOffset, #4
|
||||
#endif
|
||||
|
||||
asr counterL , tempK, #1 // L = K / 2
|
||||
asr counterL , tempK, #3 // L = K / 8
|
||||
cmp counterL , #2 // is there at least 4 to do?
|
||||
blt dtrmm_kernel_L4_M8_32
|
||||
|
||||
KERNEL8x4_I // do one in the K
|
||||
KERNEL8x4_M2 // do another in the K
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_M2
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_M2
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_M2
|
||||
|
||||
subs counterL, counterL, #2 // subtract 2
|
||||
ble dtrmm_kernel_L4_M8_22a
|
||||
.align 5
|
||||
|
||||
.align 5
|
||||
dtrmm_kernel_L4_M8_22:
|
||||
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_M2
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_M2
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_M2
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_M2
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt dtrmm_kernel_L4_M8_22
|
||||
|
||||
|
||||
.align 5
|
||||
dtrmm_kernel_L4_M8_22a:
|
||||
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_M2
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_M2
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_M2
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_E
|
||||
|
||||
b dtrmm_kernel_L4_M8_44
|
||||
|
||||
.align 5
|
||||
dtrmm_kernel_L4_M8_32:
|
||||
|
||||
tst counterL, #1
|
||||
ble dtrmm_kernel_L4_M8_40
|
||||
|
||||
KERNEL8x4_I
|
||||
|
||||
KERNEL8x4_M2
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_M2
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_M2
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_E
|
||||
|
||||
b dtrmm_kernel_L4_M8_44
|
||||
|
@ -838,13 +943,17 @@ dtrmm_kernel_L4_M8_40:
|
|||
|
||||
dtrmm_kernel_L4_M8_44:
|
||||
|
||||
ands counterL , tempK, #1
|
||||
ands counterL , tempK, #7
|
||||
ble dtrmm_kernel_L4_M8_100
|
||||
|
||||
.align 5
|
||||
dtrmm_kernel_L4_M8_46:
|
||||
|
||||
KERNEL8x4_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bne dtrmm_kernel_L4_M8_46
|
||||
|
||||
dtrmm_kernel_L4_M8_100:
|
||||
|
||||
SAVE8x4
|
||||
|
@ -864,6 +973,9 @@ dtrmm_kernel_L4_M8_100:
|
|||
#if defined(LEFT)
|
||||
add tempOffset, tempOffset, #8
|
||||
#endif
|
||||
prfm PLDL1KEEP, [pA]
|
||||
prfm PLDL1KEEP, [pA, #64]
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
|
||||
dtrmm_kernel_L4_M8_END:
|
||||
subs counterI, counterI, #1
|
||||
|
|
|
@ -68,6 +68,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define SHZ 3
|
||||
#endif
|
||||
|
||||
#define A_PRE_SIZE 768
|
||||
#define Y_PRE_SIZE 768
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
.macro SAVE_REGS
|
||||
|
@ -105,36 +108,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v2.4s, v3.4s}, [A_PTR], #32
|
||||
ld1 {v4.4s, v5.4s}, [Y_IPTR], #32
|
||||
fmla v4.4s, v1.4s, v2.4s
|
||||
prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
|
||||
fmla v5.4s, v1.4s, v3.4s
|
||||
st1 {v4.4s, v5.4s}, [Y_OPTR], #32
|
||||
|
||||
ld1 {v6.4s, v7.4s}, [A_PTR], #32
|
||||
ld1 {v8.4s, v9.4s}, [Y_IPTR], #32
|
||||
fmla v8.4s, v1.4s, v6.4s
|
||||
prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
|
||||
fmla v9.4s, v1.4s, v7.4s
|
||||
st1 {v8.4s, v9.4s}, [Y_OPTR], #32
|
||||
#else //DOUBLE
|
||||
ld1 {v2.2d, v3.2d}, [A_PTR], #32
|
||||
ld1 {v4.2d, v5.2d}, [Y_IPTR], #32
|
||||
fmla v4.2d, v1.2d, v2.2d
|
||||
prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
|
||||
fmla v5.2d, v1.2d, v3.2d
|
||||
st1 {v4.2d, v5.2d}, [Y_OPTR], #32
|
||||
|
||||
ld1 {v6.2d, v7.2d}, [A_PTR], #32
|
||||
ld1 {v8.2d, v9.2d}, [Y_IPTR], #32
|
||||
fmla v8.2d, v1.2d, v6.2d
|
||||
prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
|
||||
fmla v9.2d, v1.2d, v7.2d
|
||||
st1 {v8.2d, v9.2d}, [Y_OPTR], #32
|
||||
|
||||
ld1 {v10.2d, v11.2d}, [A_PTR], #32
|
||||
ld1 {v12.2d, v13.2d}, [Y_IPTR], #32
|
||||
fmla v12.2d, v1.2d, v10.2d
|
||||
prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
|
||||
fmla v13.2d, v1.2d, v11.2d
|
||||
st1 {v12.2d, v13.2d}, [Y_OPTR], #32
|
||||
|
||||
ld1 {v14.2d, v15.2d}, [A_PTR], #32
|
||||
ld1 {v16.2d, v17.2d}, [Y_IPTR], #32
|
||||
fmla v16.2d, v1.2d, v14.2d
|
||||
prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
|
||||
fmla v17.2d, v1.2d, v15.2d
|
||||
st1 {v16.2d, v17.2d}, [Y_OPTR], #32
|
||||
#endif
|
||||
|
|
|
@ -41,6 +41,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define J x11 /* loop variable */
|
||||
#define I x12 /* loop variable */
|
||||
|
||||
#define X_PREFETCH_SIZE 768
|
||||
#define A_PREFETCH_SIZE 768
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
@ -112,42 +115,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v5.4s, v6.4s, v7.4s, v8.4s}, [A_PTR], #64
|
||||
ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [X_PTR], #64
|
||||
fmla v1.4s, v5.4s, v9.4s
|
||||
prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
|
||||
fmla v2.4s, v6.4s, v10.4s
|
||||
prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
|
||||
fmla v3.4s, v7.4s, v11.4s
|
||||
ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64
|
||||
fmla v4.4s, v8.4s, v12.4s
|
||||
|
||||
ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64
|
||||
ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [X_PTR], #64
|
||||
fmla v1.4s, v13.4s, v17.4s
|
||||
prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
|
||||
fmla v2.4s, v14.4s, v18.4s
|
||||
prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
|
||||
fmla v3.4s, v15.4s, v19.4s
|
||||
fmla v4.4s, v16.4s, v20.4s
|
||||
#else
|
||||
ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64
|
||||
ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64
|
||||
fmla v1.2d, v5.2d, v9.2d
|
||||
prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
|
||||
fmla v2.2d, v6.2d, v10.2d
|
||||
prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
|
||||
fmla v3.2d, v7.2d, v11.2d
|
||||
fmla v4.2d, v8.2d, v12.2d
|
||||
|
||||
ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64
|
||||
ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64
|
||||
fmla v1.2d, v13.2d, v17.2d
|
||||
prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
|
||||
fmla v2.2d, v14.2d, v18.2d
|
||||
prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
|
||||
fmla v3.2d, v15.2d, v19.2d
|
||||
fmla v4.2d, v16.2d, v20.2d
|
||||
|
||||
ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64
|
||||
ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64
|
||||
fmla v1.2d, v5.2d, v9.2d
|
||||
prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
|
||||
fmla v2.2d, v6.2d, v10.2d
|
||||
prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
|
||||
fmla v3.2d, v7.2d, v11.2d
|
||||
fmla v4.2d, v8.2d, v12.2d
|
||||
|
||||
ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64
|
||||
ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64
|
||||
fmla v1.2d, v13.2d, v17.2d
|
||||
prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
|
||||
fmla v2.2d, v14.2d, v18.2d
|
||||
prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
|
||||
fmla v3.2d, v15.2d, v19.2d
|
||||
fmla v4.2d, v16.2d, v20.2d
|
||||
#endif
|
||||
|
|
|
@ -72,6 +72,148 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
fabs MAXF, MAXF
|
||||
.endm
|
||||
|
||||
// KERNEL_F8: process one block of 8 consecutive elements at X (unit stride).
// Computes the largest absolute value in the block into TMPF, compares it
// with the running maximum MAXF, and conditionally updates MAXF and INDEX.
// INDEX receives the block position Z (the value Z has before it is advanced
// by 8), not the position of the individual element inside the block.
// COND, MAXF, TMPF, INDEX, Z and X are defined outside this macro.
// Clobbers v2, v3 (and v4, v5 when DOUBLE is defined) and the flags.
.macro KERNEL_F8
|
||||
#if !defined(DOUBLE)
|
||||
ldp q2, q3, [X], #32 // load 32 bytes (8 single-precision values), X += 32
|
||||
fabs v2.4s, v2.4s
|
||||
fabs v3.4s, v3.4s
|
||||
fmax v2.4s, v2.4s, v3.4s // lane-wise max of the two absolute-value vectors
|
||||
fmaxv TMPF, v2.4s // reduce the 4 lanes to one scalar maximum
|
||||
fcmp MAXF, TMPF
|
||||
fcsel MAXF, MAXF, TMPF, COND // keep MAXF when COND holds, else take TMPF
|
||||
csel INDEX, INDEX, Z, COND // same selection for the position: keep INDEX or take Z
|
||||
add Z, Z, #8 // advance the position counter past this block
|
||||
#else
|
||||
ldp q2, q3, [X], #32 // first 32 bytes (4 double-precision values)
|
||||
ldp q4, q5, [X], #32 // next 32 bytes; X has advanced 64 bytes in total
|
||||
fabs v2.2d, v2.2d
|
||||
fabs v3.2d, v3.2d
|
||||
fabs v4.2d, v4.2d
|
||||
fabs v5.2d, v5.2d
|
||||
|
||||
|
||||
fmax v2.2d, v2.2d, v3.2d // tree reduction: 4 vectors -> 2 -> 1
|
||||
fmax v4.2d, v4.2d, v5.2d
|
||||
fmax v2.2d, v2.2d, v4.2d
|
||||
fmaxp TMPF, v2.2d // max of the two remaining lanes
|
||||
|
||||
|
||||
fcmp MAXF, TMPF
|
||||
fcsel MAXF, MAXF, TMPF, COND // keep MAXF when COND holds, else take TMPF
|
||||
csel INDEX, INDEX, Z, COND // same selection for the position: keep INDEX or take Z
|
||||
add Z, Z, #8 // advance the position counter past this block
|
||||
#endif
|
||||
PRFM PLDL1KEEP, [X, #1024] // prefetch 1024 bytes ahead of the current X
|
||||
.endm
|
||||
|
||||
// KERNEL_F8_FINALIZE: refine INDEX from a block position to an element
// position. Reloads the 8 elements that start (INDEX - 1) elements past the
// address in x7, takes their absolute values, and compares each one with MAXF
// for exact equality. Candidates are tested from INDEX+7 down to INDEX+0, and
// every match overwrites INDEX, so the lowest matching position is the one
// that remains.
// Expects x7 to hold the base address of the scanned data on entry (the
// unit-stride entry code visible in this file does `mov x7, X` beforehand).
// Assumes INDEX is a block position recorded by KERNEL_F8 -- confirm with caller.
// Clobbers x6, x7, v2-v7 and the flags.
.macro KERNEL_F8_FINALIZE
|
||||
sub x6, INDEX, #1 // x6 = INDEX - 1, an element offset from x7
|
||||
#if !defined(DOUBLE)
|
||||
lsl x6, x6, #2 // scale to bytes (4 bytes per element)
|
||||
add x7, x7, x6
|
||||
ldp q2, q3, [x7] // reload 8 values: v2 = elements 0-3, v3 = elements 4-7
|
||||
fabs v2.4s, v2.4s
|
||||
fabs v3.4s, v3.4s
|
||||
|
||||
|
||||
ins v4.s[0], v3.s[0] // copy each lane of v3 into lane 0 of v4..v7
|
||||
ins v5.s[0], v3.s[1] // so it can be read as scalar s4..s7
|
||||
ins v6.s[0], v3.s[2]
|
||||
ins v7.s[0], v3.s[3]
|
||||
|
||||
|
||||
add x6, INDEX, #7 // candidate position of the last element in the block
|
||||
fcmp MAXF, s7
|
||||
csel INDEX, x6, INDEX, eq // on equality take the candidate position
|
||||
|
||||
|
||||
sub x6, x6, #1 // x6 counts down on its own, independent of INDEX updates
|
||||
fcmp MAXF, s6
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, s5
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, s4
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
|
||||
ins v4.s[0], v2.s[0] // now the lower four elements (lanes of v2)
|
||||
ins v5.s[0], v2.s[1]
|
||||
ins v6.s[0], v2.s[2]
|
||||
ins v7.s[0], v2.s[3]
|
||||
|
||||
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, s7
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, s6
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, s5
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
|
||||
sub x6, x6, #1 // x6 is back at the INDEX value seen on entry
|
||||
fcmp MAXF, s4
|
||||
csel INDEX, x6, INDEX, eq
|
||||
#else
|
||||
add x6, x6, #4 // start with the upper half of the block (elements 4-7)
|
||||
lsl x6, x6, #3 // scale to bytes (8 bytes per element)
|
||||
add x7, x7, x6
|
||||
ldp q2, q3, [x7] // v2 = elements 4-5, v3 = elements 6-7
|
||||
|
||||
|
||||
fabs v2.2d, v2.2d
|
||||
fabs v3.2d, v3.2d
|
||||
|
||||
|
||||
ins v4.d[0], v2.d[0] // spread the four values into lane 0 of v4..v7
|
||||
ins v5.d[0], v2.d[1] // so they can be read as scalars d4..d7
|
||||
ins v6.d[0], v3.d[0]
|
||||
ins v7.d[0], v3.d[1]
|
||||
|
||||
|
||||
add x6, INDEX, #7 // candidate position of the last element in the block
|
||||
fcmp MAXF, d7
|
||||
csel INDEX, x6, INDEX, eq // on equality take the candidate position
|
||||
|
||||
|
||||
sub x6, x6, #1 // x6 counts down on its own, independent of INDEX updates
|
||||
fcmp MAXF, d6
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, d5
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, d4
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
|
||||
sub x7, x7, #32 // step back 32 bytes to the lower half (elements 0-3)
|
||||
ldp q2, q3, [x7]
|
||||
|
||||
|
||||
fabs v2.2d, v2.2d
|
||||
fabs v3.2d, v3.2d
|
||||
|
||||
|
||||
ins v4.d[0], v2.d[0]
|
||||
ins v5.d[0], v2.d[1]
|
||||
ins v6.d[0], v3.d[0]
|
||||
ins v7.d[0], v3.d[1]
|
||||
|
||||
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, d7
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, d6
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, d5
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
|
||||
sub x6, x6, #1 // x6 is back at the INDEX value seen on entry
|
||||
fcmp MAXF, d4
|
||||
csel INDEX, x6, INDEX, eq
|
||||
#endif
|
||||
.endm
|
||||
|
||||
|
||||
.macro KERNEL_S1
|
||||
ld1 TMPVF, [X], INC_X
|
||||
add Z, Z, #1
|
||||
|
@ -92,6 +234,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
cmp INC_X, xzr
|
||||
ble iamax_kernel_zero
|
||||
|
||||
cmp INC_X, #1
|
||||
bne iamax_kernel_S_BEGIN
|
||||
mov x7, X
|
||||
|
||||
iamax_kernel_F_BEGIN:
|
||||
|
||||
INIT_S
|
||||
|
||||
subs N, N, #1
|
||||
ble iamax_kernel_L999
|
||||
|
||||
asr I, N, #3
|
||||
cmp I, xzr
|
||||
beq iamax_kernel_F1
|
||||
|
||||
add Z, Z, #1
|
||||
iamax_kernel_F8:
|
||||
|
||||
KERNEL_F8
|
||||
|
||||
subs I, I, #1
|
||||
bne iamax_kernel_F8
|
||||
|
||||
KERNEL_F8_FINALIZE
|
||||
|
||||
sub Z, Z, #1
|
||||
iamax_kernel_F1:
|
||||
|
||||
ands I, N, #7
|
||||
ble iamax_kernel_L999
|
||||
|
||||
iamax_kernel_F10:
|
||||
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne iamax_kernel_F10
|
||||
|
||||
b iamax_kernel_L999
|
||||
|
||||
iamax_kernel_S_BEGIN:
|
||||
|
||||
INIT_S
|
||||
|
||||
subs N, N, #1
|
||||
|
|
|
@ -78,6 +78,179 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
.endm
|
||||
|
||||
// KERNEL_F8: process one block of 16 consecutive scalars at X (unit stride),
// treated as 8 adjacent pairs. Each pair is reduced to the sum of the absolute
// values of its two members (presumably |re| + |im| of one complex element --
// confirm against the rest of the file). The largest pair sum goes to TMPF and
// is compared with the running maximum MAXF; MAXF and INDEX are updated
// under COND. INDEX receives the block position Z (the value Z has before it
// is advanced by 8), not the position of the individual pair.
// COND, MAXF, TMPF, INDEX, Z and X are defined outside this macro.
// Clobbers v2-v5 (plus v16-v19 when DOUBLE is defined) and the flags.
.macro KERNEL_F8
|
||||
#if !defined(DOUBLE)
|
||||
ldp q2, q3, [X], #32 // 64 bytes in total: 16 single-precision scalars
|
||||
ldp q4, q5, [X], #32
|
||||
|
||||
|
||||
fabs v2.4s, v2.4s
|
||||
fabs v3.4s, v3.4s
|
||||
fabs v4.4s, v4.4s
|
||||
fabs v5.4s, v5.4s
|
||||
|
||||
|
||||
faddp v2.4s, v2.4s, v3.4s // pairwise add: v2 = sums of pairs 0-3
|
||||
faddp v3.4s, v4.4s, v5.4s // v3 = sums of pairs 4-7
|
||||
|
||||
|
||||
fmax v2.4s, v2.4s, v3.4s
|
||||
fmaxv TMPF, v2.4s // largest pair sum in the block
|
||||
fcmp MAXF, TMPF
|
||||
fcsel MAXF, MAXF, TMPF, COND // keep MAXF when COND holds, else take TMPF
|
||||
csel INDEX, INDEX, Z, COND // same selection for the position: keep INDEX or take Z
|
||||
add Z, Z, #8 // one position per pair, 8 pairs per block
|
||||
#else
|
||||
ldp q2, q3, [X], #32 // 128 bytes in total: 16 double-precision scalars
|
||||
ldp q4, q5, [X], #32
|
||||
ldp q16, q17, [X], #32
|
||||
ldp q18, q19, [X], #32
|
||||
|
||||
|
||||
fabs v2.2d, v2.2d
|
||||
fabs v3.2d, v3.2d
|
||||
fabs v4.2d, v4.2d
|
||||
fabs v5.2d, v5.2d
|
||||
fabs v16.2d, v16.2d
|
||||
fabs v17.2d, v17.2d
|
||||
fabs v18.2d, v18.2d
|
||||
fabs v19.2d, v19.2d
|
||||
|
||||
|
||||
faddp v2.2d, v2.2d, v3.2d // v2 = sums of pairs 0-1
|
||||
faddp v3.2d, v4.2d, v5.2d // v3 = sums of pairs 2-3
|
||||
faddp v4.2d, v16.2d, v17.2d // v4 = sums of pairs 4-5
|
||||
faddp v5.2d, v18.2d, v19.2d // v5 = sums of pairs 6-7
|
||||
|
||||
|
||||
fmax v2.2d, v2.2d, v3.2d // tree reduction: 4 vectors -> 2 -> 1
|
||||
fmax v4.2d, v4.2d, v5.2d
|
||||
fmax v2.2d, v2.2d, v4.2d
|
||||
fmaxp TMPF, v2.2d // max of the two remaining lanes
|
||||
|
||||
|
||||
fcmp MAXF, TMPF
|
||||
fcsel MAXF, MAXF, TMPF, COND // keep MAXF when COND holds, else take TMPF
|
||||
csel INDEX, INDEX, Z, COND // same selection for the position: keep INDEX or take Z
|
||||
add Z, Z, #8 // one position per pair, 8 pairs per block
|
||||
#endif
|
||||
PRFM PLDL1KEEP, [X, #1024] // prefetch 1024 bytes ahead of the current X
|
||||
.endm
|
||||
|
||||
// KERNEL_F8_FINALIZE: refine INDEX from a block position to a pair position.
// Reloads the 8 pairs that start (INDEX - 1) pairs past the address in x7,
// recomputes the per-pair sums of absolute values, and compares each sum with
// MAXF for exact equality. Candidates are tested from INDEX+7 down to
// INDEX+0, and every match overwrites INDEX, so the lowest matching position
// is the one that remains.
// Expects x7 to hold the base address of the scanned data on entry (the
// unit-stride entry code visible in this file does `mov x7, X` beforehand).
// Assumes INDEX is a block position recorded by KERNEL_F8 -- confirm with caller.
// Clobbers x6, x7, v2-v5, the flags, and v7/v16-v19 when DOUBLE is defined.
.macro KERNEL_F8_FINALIZE
|
||||
sub x6, INDEX, #1 // x6 = INDEX - 1, a pair offset from x7
|
||||
#if !defined(DOUBLE)
|
||||
lsl x6, x6, #3 // scale to bytes (8 bytes per pair of singles)
|
||||
add x7, x7, x6
|
||||
|
||||
|
||||
ldp q2, q3, [x7] // reload the 64-byte block (8 pairs)
|
||||
ldp q4, q5, [x7, #32]
|
||||
|
||||
|
||||
fabs v2.4s, v2.4s
|
||||
fabs v3.4s, v3.4s
|
||||
fabs v4.4s, v4.4s
|
||||
fabs v5.4s, v5.4s
|
||||
|
||||
|
||||
faddp v2.4s, v2.4s, v3.4s // v2 = sums of pairs 0-3
|
||||
faddp v3.4s, v4.4s, v5.4s // v3 = sums of pairs 4-7
|
||||
|
||||
|
||||
ins v4.s[0], v3.s[3] // move the lane under test into s4 (v4 is free after faddp)
|
||||
add x6, INDEX, #7 // candidate position of the last pair in the block
|
||||
fcmp MAXF, s4
|
||||
csel INDEX, x6, INDEX, eq // on equality take the candidate position
|
||||
|
||||
|
||||
ins v4.s[0], v3.s[2]
|
||||
sub x6, x6, #1 // x6 counts down on its own, independent of INDEX updates
|
||||
fcmp MAXF, s4
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
|
||||
ins v4.s[0], v3.s[1]
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, s4
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
|
||||
ins v4.s[0], v3.s[0]
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, s4
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
|
||||
ins v4.s[0], v2.s[3]
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, s4
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
|
||||
ins v4.s[0], v2.s[2]
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, s4
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
|
||||
ins v4.s[0], v2.s[1]
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, s4
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
|
||||
ins v4.s[0], v2.s[0]
|
||||
sub x6, x6, #1 // x6 is back at the INDEX value seen on entry
|
||||
fcmp MAXF, s4
|
||||
csel INDEX, x6, INDEX, eq
|
||||
#else
|
||||
lsl x6, x6, #4 // scale to bytes (16 bytes per pair of doubles)
|
||||
add x7, x7, x6
|
||||
|
||||
|
||||
ldp q2, q3, [x7] // reload the 128-byte block (8 pairs)
|
||||
ldp q4, q5, [x7, #32]
|
||||
ldp q16, q17, [x7, #64]
|
||||
ldp q18, q19, [x7, #96]
|
||||
|
||||
|
||||
fabs v2.2d, v2.2d
|
||||
fabs v3.2d, v3.2d
|
||||
fabs v4.2d, v4.2d
|
||||
fabs v5.2d, v5.2d
|
||||
fabs v16.2d, v16.2d
|
||||
fabs v17.2d, v17.2d
|
||||
fabs v18.2d, v18.2d
|
||||
fabs v19.2d, v19.2d
|
||||
|
||||
|
||||
faddp v2.2d, v2.2d, v3.2d // v2 = sums of pairs 0-1
|
||||
faddp v3.2d, v4.2d, v5.2d // v3 = sums of pairs 2-3
|
||||
faddp v4.2d, v16.2d, v17.2d // v4 = sums of pairs 4-5
|
||||
faddp v5.2d, v18.2d, v19.2d // v5 = sums of pairs 6-7
|
||||
|
||||
|
||||
ins v7.d[0], v5.d[1] // move the lane under test into d7
|
||||
add x6, INDEX, #7 // candidate position of the last pair in the block
|
||||
fcmp MAXF, d7
|
||||
csel INDEX, x6, INDEX, eq // on equality take the candidate position
|
||||
|
||||
|
||||
ins v7.d[0], v5.d[0]
|
||||
sub x6, x6, #1 // x6 counts down on its own, independent of INDEX updates
|
||||
fcmp MAXF, d7
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
|
||||
ins v7.d[0], v4.d[1]
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, d7
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
|
||||
ins v7.d[0], v4.d[0]
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, d7
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
|
||||
ins v7.d[0], v3.d[1]
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, d7
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
|
||||
ins v7.d[0], v3.d[0]
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, d7
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
|
||||
ins v7.d[0], v2.d[1]
|
||||
sub x6, x6, #1
|
||||
fcmp MAXF, d7
|
||||
csel INDEX, x6, INDEX, eq
|
||||
|
||||
|
||||
ins v7.d[0], v2.d[0]
|
||||
sub x6, x6, #1 // x6 is back at the INDEX value seen on entry
|
||||
fcmp MAXF, d7
|
||||
csel INDEX, x6, INDEX, eq
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro KERNEL_S1
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v1.2s}, [X], INC_X
|
||||
|
@ -107,6 +280,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
cmp INC_X, xzr
|
||||
ble iamax_kernel_zero
|
||||
|
||||
cmp INC_X, #1
|
||||
bne iamax_kernel_S_BEGIN
|
||||
mov x7, X
|
||||
|
||||
|
||||
iamax_kernel_F_BEGIN:
|
||||
|
||||
INIT_S
|
||||
|
||||
subs N, N, #1
|
||||
ble iamax_kernel_L999
|
||||
|
||||
asr I, N, #3
|
||||
cmp I, xzr
|
||||
ble iamax_kernel_F1
|
||||
|
||||
add Z, Z, #1
|
||||
|
||||
iamax_kernel_F8:
|
||||
|
||||
KERNEL_F8
|
||||
|
||||
subs I, I, #1
|
||||
bne iamax_kernel_F8
|
||||
|
||||
KERNEL_F8_FINALIZE
|
||||
|
||||
sub Z, Z, #1
|
||||
iamax_kernel_F1:
|
||||
|
||||
ands I, N, #7
|
||||
ble iamax_kernel_L999
|
||||
|
||||
iamax_kernel_F10:
|
||||
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne iamax_kernel_F10
|
||||
|
||||
b iamax_kernel_L999
|
||||
|
||||
iamax_kernel_S_BEGIN:
|
||||
|
||||
INIT_S
|
||||
|
||||
subs N, N, #1
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -46,20 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define pCRow0 x12
|
||||
#define pCRow1 x13
|
||||
#define pCRow2 x14
|
||||
#define pA x15
|
||||
#define alpha_save_R x16
|
||||
#define alpha_save_I x17
|
||||
#define pCRow3 x15
|
||||
#define pA x16
|
||||
#define alphaR x17
|
||||
#define alphaI x18
|
||||
|
||||
#define alpha0_R d10
|
||||
#define alphaV0_R v10.d[0]
|
||||
#define alpha0_I d11
|
||||
#define alphaV0_I v11.d[0]
|
||||
|
||||
#define alpha1_R d14
|
||||
#define alphaV1_R v14.d[0]
|
||||
#define alpha1_I d15
|
||||
#define alphaV1_I v15.d[0]
|
||||
|
||||
#define A_PRE_SIZE 2560
|
||||
#define B_PRE_SIZE 448
|
||||
#define C_PRE_SIZE 128
|
||||
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
#define OP_rr fmla
|
||||
|
@ -98,10 +97,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
// 12 pCRow0
|
||||
// 13 pCRow1
|
||||
// 14 pCRow2
|
||||
// 15 pA
|
||||
// 16 alpha_save_R
|
||||
// 17 alpha_save_I
|
||||
// 18 must save
|
||||
// 15 pCRow3
|
||||
// 16 pA
|
||||
// 17 alpha_save_R
|
||||
// 18 must save alpha_save_I
|
||||
// 19 must save
|
||||
// 20 must save
|
||||
// 21 must save
|
||||
|
@ -175,12 +174,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.macro KERNEL4x4_I
|
||||
ld2 {v8.2d, v9.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld2 {v10.2d, v11.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld2 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v16.2d, v0.2d, v8.d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.d[0]
|
||||
|
@ -193,16 +188,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
fmul v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v19.16b, v19.16b, v19.16b
|
||||
fmls v19.2d, v2.2d, v9.d[0]
|
||||
#else
|
||||
fmul v19.2d, v2.2d, v9.d[0]
|
||||
#endif
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
|
@ -215,6 +202,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
ld2 {v10.2d, v11.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
fmul v22.2d, v2.2d, v8.d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
|
@ -226,6 +216,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v23.2d, v3.2d, v8.d[1]
|
||||
|
||||
ld2 {v12.2d, v13.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
fmul v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v19.16b, v19.16b, v19.16b
|
||||
fmls v19.2d, v2.2d, v9.d[0]
|
||||
#else
|
||||
fmul v19.2d, v2.2d, v9.d[0]
|
||||
#endif
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
ld2 {v4.2d, v5.2d} , [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v24.2d, v0.2d, v10.d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
|
@ -237,6 +244,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
ld2 {v6.2d, v7.2d} , [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v26.2d, v2.2d, v10.d[0]
|
||||
OP_ii v26.2d, v3.2d, v11.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
|
@ -248,6 +258,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
ld2 {v14.2d, v15.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
fmul v28.2d, v0.2d, v10.d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
|
@ -259,6 +272,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v29.2d, v1.2d, v10.d[1]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
fmul v30.2d, v2.2d, v10.d[1]
|
||||
OP_ii v30.2d, v3.2d, v11.d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
|
@ -270,14 +285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v31.2d, v3.2d, v10.d[1]
|
||||
|
||||
ld2 {v12.2d, v13.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld2 {v14.2d, v15.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld2 {v4.2d, v5.2d} , [pA]
|
||||
add pA, pA, #32
|
||||
ld2 {v6.2d, v7.2d} , [pA]
|
||||
add pA, pA, #32
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_M1
|
||||
|
@ -286,7 +294,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v17.2d, v0.2d, v9.d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
ld2 {v12.2d, v13.2d}, [pB] // For next round
|
||||
ld2 {v12.2d, v13.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v18.2d, v2.2d, v8.d[0]
|
||||
|
@ -294,15 +302,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v19.2d, v2.2d, v9.d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
ld2 {v14.2d, v15.2d}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
ld2 {v4.2d, v5.2d} , [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
ld2 {v4.2d, v5.2d} , [pA] // For next round
|
||||
ld2 {v6.2d, v7.2d} , [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v22.2d, v2.2d, v8.d[1]
|
||||
|
@ -310,22 +318,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v23.2d, v2.2d, v9.d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.d[1]
|
||||
|
||||
ld2 {v6.2d, v7.2d} , [pA] // For next round
|
||||
add pA, pA, #32
|
||||
ld2 {v14.2d, v15.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v24.2d, v0.2d, v10.d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.d[0]
|
||||
OP_ri v25.2d, v0.2d, v11.d[0]
|
||||
OP_ir v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
OP_rr v26.2d, v2.2d, v10.d[0]
|
||||
OP_ii v26.2d, v3.2d, v11.d[0]
|
||||
OP_ri v27.2d, v2.2d, v11.d[0]
|
||||
OP_ir v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
|
||||
OP_rr v28.2d, v0.2d, v10.d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.d[1]
|
||||
|
@ -344,7 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v17.2d, v4.2d, v13.d[0]
|
||||
OP_ir v17.2d, v5.2d, v12.d[0]
|
||||
|
||||
ld2 {v8.2d, v9.2d}, [pB] // For next round
|
||||
ld2 {v8.2d, v9.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v18.2d, v6.2d, v12.d[0]
|
||||
|
@ -352,15 +360,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v19.2d, v6.2d, v13.d[0]
|
||||
OP_ir v19.2d, v7.2d, v12.d[0]
|
||||
|
||||
ld2 {v10.2d, v11.2d}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
ld2 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v20.2d, v4.2d, v12.d[1]
|
||||
OP_ii v20.2d, v5.2d, v13.d[1]
|
||||
OP_ri v21.2d, v4.2d, v13.d[1]
|
||||
OP_ir v21.2d, v5.2d, v12.d[1]
|
||||
|
||||
ld2 {v0.2d, v1.2d}, [pA] // For next round
|
||||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v22.2d, v6.2d, v12.d[1]
|
||||
|
@ -368,22 +376,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v23.2d, v6.2d, v13.d[1]
|
||||
OP_ir v23.2d, v7.2d, v12.d[1]
|
||||
|
||||
ld2 {v2.2d, v3.2d}, [pA] // For next round
|
||||
add pA, pA, #32
|
||||
ld2 {v10.2d, v11.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v24.2d, v4.2d, v14.d[0]
|
||||
OP_ii v24.2d, v5.2d, v15.d[0]
|
||||
OP_ri v25.2d, v4.2d, v15.d[0]
|
||||
OP_ir v25.2d, v5.2d, v14.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
OP_rr v26.2d, v6.2d, v14.d[0]
|
||||
OP_ii v26.2d, v7.2d, v15.d[0]
|
||||
OP_ri v27.2d, v6.2d, v15.d[0]
|
||||
OP_ir v27.2d, v7.2d, v14.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
|
||||
|
||||
OP_rr v28.2d, v4.2d, v14.d[1]
|
||||
OP_ii v28.2d, v5.2d, v15.d[1]
|
||||
|
@ -412,6 +420,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v21.2d, v4.2d, v13.d[1]
|
||||
OP_ir v21.2d, v5.2d, v12.d[1]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
OP_rr v22.2d, v6.2d, v12.d[1]
|
||||
OP_ii v22.2d, v7.2d, v13.d[1]
|
||||
OP_ri v23.2d, v6.2d, v13.d[1]
|
||||
|
@ -422,6 +432,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v25.2d, v4.2d, v15.d[0]
|
||||
OP_ir v25.2d, v5.2d, v14.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
|
||||
|
||||
OP_rr v26.2d, v6.2d, v14.d[0]
|
||||
OP_ii v26.2d, v7.2d, v15.d[0]
|
||||
OP_ri v27.2d, v6.2d, v15.d[0]
|
||||
|
@ -441,33 +453,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.macro KERNEL4x4_SUB
|
||||
ld2 {v8.2d, v9.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld2 {v10.2d, v11.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
ld2 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.2d, v0.2d, v8.d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
OP_rr v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
OP_ri v19.2d, v2.2d, v9.d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
ld2 {v10.2d, v11.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
OP_ri v19.2d, v2.2d, v9.d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
OP_rr v22.2d, v2.2d, v8.d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.d[1]
|
||||
OP_ri v23.2d, v2.2d, v9.d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.d[1]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
OP_rr v24.2d, v0.2d, v10.d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.d[0]
|
||||
OP_ri v25.2d, v0.2d, v11.d[0]
|
||||
|
@ -490,74 +509,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
ld2 {v0.2d, v1.2d}, [pCRow1]
|
||||
ld2 {v0.2d, v1.2d}, [pCRow0]
|
||||
fmla v0.2d, v16.2d, alphaV0_R
|
||||
fmls v0.2d, v17.2d, alphaV0_I
|
||||
fmla v1.2d, v16.2d, alphaV1_I
|
||||
fmla v1.2d, v17.2d, alphaV1_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
ld2 {v2.2d, v3.2d}, [pCRow2]
|
||||
fmla v1.2d, v16.2d, alphaV0_I
|
||||
fmla v1.2d, v17.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow0]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
||||
ld2 {v2.2d, v3.2d}, [pCRow0]
|
||||
fmla v2.2d, v18.2d, alphaV0_R
|
||||
fmls v2.2d, v19.2d, alphaV0_I
|
||||
fmla v3.2d, v18.2d, alphaV1_I
|
||||
fmla v3.2d, v19.2d, alphaV1_R
|
||||
st2 {v2.2d, v3.2d}, [pCRow2]
|
||||
fmla v3.2d, v18.2d, alphaV0_I
|
||||
fmla v3.2d, v19.2d, alphaV0_R
|
||||
st2 {v2.2d, v3.2d}, [pCRow0]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
ld2 {v4.2d, v5.2d}, [pCRow1]
|
||||
fmla v4.2d, v20.2d, alphaV0_R
|
||||
fmls v4.2d, v21.2d, alphaV0_I
|
||||
fmla v5.2d, v20.2d, alphaV1_I
|
||||
fmla v5.2d, v21.2d, alphaV1_R
|
||||
fmla v5.2d, v20.2d, alphaV0_I
|
||||
fmla v5.2d, v21.2d, alphaV0_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
ld2 {v6.2d, v7.2d}, [pCRow2]
|
||||
|
||||
add pCRow1, pCRow1, #32
|
||||
|
||||
ld2 {v6.2d, v7.2d}, [pCRow1]
|
||||
fmla v6.2d, v22.2d, alphaV0_R
|
||||
fmls v6.2d, v23.2d, alphaV0_I
|
||||
fmla v7.2d, v22.2d, alphaV1_I
|
||||
fmla v7.2d, v23.2d, alphaV1_R
|
||||
st2 {v6.2d, v7.2d}, [pCRow2]
|
||||
fmla v7.2d, v22.2d, alphaV0_I
|
||||
fmla v7.2d, v23.2d, alphaV0_R
|
||||
st2 {v6.2d, v7.2d}, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
ld2 {v0.2d, v1.2d}, [pCRow1]
|
||||
add pCRow1, pCRow1, #32
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
ld2 {v0.2d, v1.2d}, [pCRow2]
|
||||
fmla v0.2d, v24.2d, alphaV0_R
|
||||
fmls v0.2d, v25.2d, alphaV0_I
|
||||
fmla v1.2d, v24.2d, alphaV1_I
|
||||
fmla v1.2d, v25.2d, alphaV1_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
fmla v1.2d, v24.2d, alphaV0_I
|
||||
fmla v1.2d, v25.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow2]
|
||||
|
||||
add pCRow2, pCRow2, #32
|
||||
|
||||
ld2 {v2.2d, v3.2d}, [pCRow2]
|
||||
fmla v2.2d, v26.2d, alphaV0_R
|
||||
fmls v2.2d, v27.2d, alphaV0_I
|
||||
fmla v3.2d, v26.2d, alphaV1_I
|
||||
fmla v3.2d, v27.2d, alphaV1_R
|
||||
fmla v3.2d, v26.2d, alphaV0_I
|
||||
fmla v3.2d, v27.2d, alphaV0_R
|
||||
st2 {v2.2d, v3.2d}, [pCRow2]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
add pCRow2, pCRow2, #32
|
||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||
|
||||
ld2 {v4.2d, v5.2d}, [pCRow1]
|
||||
ld2 {v4.2d, v5.2d}, [pCRow3]
|
||||
fmla v4.2d, v28.2d, alphaV0_R
|
||||
fmls v4.2d, v29.2d, alphaV0_I
|
||||
fmla v5.2d, v28.2d, alphaV1_I
|
||||
fmla v5.2d, v29.2d, alphaV1_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
ld2 {v6.2d, v7.2d}, [pCRow2]
|
||||
fmla v5.2d, v28.2d, alphaV0_I
|
||||
fmla v5.2d, v29.2d, alphaV0_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow3]
|
||||
|
||||
add pCRow3, pCRow3, #32
|
||||
|
||||
ld2 {v6.2d, v7.2d}, [pCRow3]
|
||||
fmla v6.2d, v30.2d, alphaV0_R
|
||||
fmls v6.2d, v31.2d, alphaV0_I
|
||||
fmla v7.2d, v30.2d, alphaV1_I
|
||||
fmla v7.2d, v31.2d, alphaV1_R
|
||||
st2 {v6.2d, v7.2d}, [pCRow2]
|
||||
fmla v7.2d, v30.2d, alphaV0_I
|
||||
fmla v7.2d, v31.2d, alphaV0_R
|
||||
st2 {v6.2d, v7.2d}, [pCRow3]
|
||||
|
||||
add pCRow0, pCRow0, #64
|
||||
add pCRow3, pCRow3, #32
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
@ -604,18 +634,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
ld2 {v0.2d, v1.2d}, [pCRow1]
|
||||
fmla v0.2d, v16.2d, alphaV0_R
|
||||
fmls v0.2d, v17.2d, alphaV0_I
|
||||
fmla v1.2d, v16.2d, alphaV1_I
|
||||
fmla v1.2d, v17.2d, alphaV1_R
|
||||
fmla v1.2d, v16.2d, alphaV0_I
|
||||
fmla v1.2d, v17.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
@ -623,8 +651,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v4.2d, v5.2d}, [pCRow1]
|
||||
fmla v4.2d, v20.2d, alphaV0_R
|
||||
fmls v4.2d, v21.2d, alphaV0_I
|
||||
fmla v5.2d, v20.2d, alphaV1_I
|
||||
fmla v5.2d, v21.2d, alphaV1_R
|
||||
fmla v5.2d, v20.2d, alphaV0_I
|
||||
fmla v5.2d, v21.2d, alphaV0_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
@ -632,8 +660,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.2d, v1.2d}, [pCRow1]
|
||||
fmla v0.2d, v24.2d, alphaV0_R
|
||||
fmls v0.2d, v25.2d, alphaV0_I
|
||||
fmla v1.2d, v24.2d, alphaV1_I
|
||||
fmla v1.2d, v25.2d, alphaV1_R
|
||||
fmla v1.2d, v24.2d, alphaV0_I
|
||||
fmla v1.2d, v25.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
@ -641,8 +669,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v4.2d, v5.2d}, [pCRow1]
|
||||
fmla v4.2d, v28.2d, alphaV0_R
|
||||
fmls v4.2d, v29.2d, alphaV0_I
|
||||
fmla v5.2d, v28.2d, alphaV1_I
|
||||
fmla v5.2d, v29.2d, alphaV1_R
|
||||
fmla v5.2d, v28.2d, alphaV0_I
|
||||
fmla v5.2d, v29.2d, alphaV0_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
@ -691,18 +719,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE1x4
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
ld2 {v0.d, v1.d}[0], [pCRow1]
|
||||
fmla d0, d16, alphaV0_R
|
||||
fmls d0, d17, alphaV0_I
|
||||
fmla d1, d16, alphaV1_I
|
||||
fmla d1, d17, alphaV1_R
|
||||
fmla d1, d16, alphaV0_I
|
||||
fmla d1, d17, alphaV0_R
|
||||
st2 {v0.d, v1.d}[0], [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
@ -710,8 +736,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v4.d, v5.d}[0], [pCRow1]
|
||||
fmla d4, d20, alphaV0_R
|
||||
fmls d4, d21, alphaV0_I
|
||||
fmla d5, d20, alphaV1_I
|
||||
fmla d5, d21, alphaV1_R
|
||||
fmla d5, d20, alphaV0_I
|
||||
fmla d5, d21, alphaV0_R
|
||||
st2 {v4.d, v5.d}[0], [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
@ -719,8 +745,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.d, v1.d}[0], [pCRow1]
|
||||
fmla d0, d24, alphaV0_R
|
||||
fmls d0, d25, alphaV0_I
|
||||
fmla d1, d24, alphaV1_I
|
||||
fmla d1, d25, alphaV1_R
|
||||
fmla d1, d24, alphaV0_I
|
||||
fmla d1, d25, alphaV0_R
|
||||
st2 {v0.d, v1.d}[0], [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
@ -728,8 +754,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v4.d, v5.d}[0], [pCRow1]
|
||||
fmla d4, d28, alphaV0_R
|
||||
fmls d4, d29, alphaV0_I
|
||||
fmla d5, d28, alphaV1_I
|
||||
fmla d5, d29, alphaV1_R
|
||||
fmla d5, d28, alphaV0_I
|
||||
fmla d5, d29, alphaV0_R
|
||||
st2 {v4.d, v5.d}[0], [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #16
|
||||
|
@ -778,25 +804,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
ld2 {v0.2d, v1.2d}, [pCRow1]
|
||||
fmla v0.2d, v16.2d, alphaV0_R
|
||||
fmls v0.2d, v17.2d, alphaV0_I
|
||||
fmla v1.2d, v16.2d, alphaV1_I
|
||||
fmla v1.2d, v17.2d, alphaV1_R
|
||||
fmla v1.2d, v16.2d, alphaV0_I
|
||||
fmla v1.2d, v17.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
ld2 {v2.2d, v3.2d}, [pCRow2]
|
||||
fmla v2.2d, v18.2d, alphaV0_R
|
||||
fmls v2.2d, v19.2d, alphaV0_I
|
||||
fmla v3.2d, v18.2d, alphaV1_I
|
||||
fmla v3.2d, v19.2d, alphaV1_R
|
||||
fmla v3.2d, v18.2d, alphaV0_I
|
||||
fmla v3.2d, v19.2d, alphaV0_R
|
||||
st2 {v2.2d, v3.2d}, [pCRow2]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
@ -804,15 +828,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v4.2d, v5.2d}, [pCRow1]
|
||||
fmla v4.2d, v20.2d, alphaV0_R
|
||||
fmls v4.2d, v21.2d, alphaV0_I
|
||||
fmla v5.2d, v20.2d, alphaV1_I
|
||||
fmla v5.2d, v21.2d, alphaV1_R
|
||||
fmla v5.2d, v20.2d, alphaV0_I
|
||||
fmla v5.2d, v21.2d, alphaV0_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
ld2 {v6.2d, v7.2d}, [pCRow2]
|
||||
fmla v6.2d, v22.2d, alphaV0_R
|
||||
fmls v6.2d, v23.2d, alphaV0_I
|
||||
fmla v7.2d, v22.2d, alphaV1_I
|
||||
fmla v7.2d, v23.2d, alphaV1_R
|
||||
fmla v7.2d, v22.2d, alphaV0_I
|
||||
fmla v7.2d, v23.2d, alphaV0_R
|
||||
st2 {v6.2d, v7.2d}, [pCRow2]
|
||||
|
||||
add pCRow0, pCRow0, #64
|
||||
|
@ -845,18 +869,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
ld2 {v0.2d, v1.2d}, [pCRow1]
|
||||
fmla v0.2d, v16.2d, alphaV0_R
|
||||
fmls v0.2d, v17.2d, alphaV0_I
|
||||
fmla v1.2d, v16.2d, alphaV1_I
|
||||
fmla v1.2d, v17.2d, alphaV1_R
|
||||
fmla v1.2d, v16.2d, alphaV0_I
|
||||
fmla v1.2d, v17.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
@ -864,8 +886,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v4.2d, v5.2d}, [pCRow1]
|
||||
fmla v4.2d, v20.2d, alphaV0_R
|
||||
fmls v4.2d, v21.2d, alphaV0_I
|
||||
fmla v5.2d, v20.2d, alphaV1_I
|
||||
fmla v5.2d, v21.2d, alphaV1_R
|
||||
fmla v5.2d, v20.2d, alphaV0_I
|
||||
fmla v5.2d, v21.2d, alphaV0_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
@ -898,18 +920,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
ld2 {v0.d, v1.d}[0], [pCRow1]
|
||||
fmla d0, d16, alphaV0_R
|
||||
fmls d0, d17, alphaV0_I
|
||||
fmla d1, d16, alphaV1_I
|
||||
fmla d1, d17, alphaV1_R
|
||||
fmla d1, d16, alphaV0_I
|
||||
fmla d1, d17, alphaV0_R
|
||||
st2 {v0.d, v1.d}[0], [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
@ -917,8 +937,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v4.d, v5.d}[0], [pCRow1]
|
||||
fmla d4, d20, alphaV0_R
|
||||
fmls d4, d21, alphaV0_I
|
||||
fmla d5, d20, alphaV1_I
|
||||
fmla d5, d21, alphaV1_R
|
||||
fmla d5, d20, alphaV0_I
|
||||
fmla d5, d21, alphaV0_R
|
||||
st2 {v4.d, v5.d}[0], [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #16
|
||||
|
@ -953,25 +973,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE4x1
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
ld2 {v0.2d, v1.2d}, [pCRow1]
|
||||
fmla v0.2d, v16.2d, alphaV0_R
|
||||
fmls v0.2d, v17.2d, alphaV0_I
|
||||
fmla v1.2d, v16.2d, alphaV1_I
|
||||
fmla v1.2d, v17.2d, alphaV1_R
|
||||
fmla v1.2d, v16.2d, alphaV0_I
|
||||
fmla v1.2d, v17.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
ld2 {v2.2d, v3.2d}, [pCRow2]
|
||||
fmla v2.2d, v18.2d, alphaV0_R
|
||||
fmls v2.2d, v19.2d, alphaV0_I
|
||||
fmla v3.2d, v18.2d, alphaV1_I
|
||||
fmla v3.2d, v19.2d, alphaV1_R
|
||||
fmla v3.2d, v18.2d, alphaV0_I
|
||||
fmla v3.2d, v19.2d, alphaV0_R
|
||||
st2 {v2.2d, v3.2d}, [pCRow2]
|
||||
|
||||
add pCRow0, pCRow0, #64
|
||||
|
@ -997,18 +1015,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE2x1
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
ld2 {v0.2d, v1.2d}, [pCRow1]
|
||||
fmla v0.2d, v16.2d, alphaV0_R
|
||||
fmls v0.2d, v17.2d, alphaV0_I
|
||||
fmla v1.2d, v16.2d, alphaV1_I
|
||||
fmla v1.2d, v17.2d, alphaV1_R
|
||||
fmla v1.2d, v16.2d, alphaV0_I
|
||||
fmla v1.2d, v17.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
@ -1035,18 +1051,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE1x1
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
ld2 {v0.d, v1.d}[0], [pCRow1]
|
||||
fmla d0, d16, alphaV0_R
|
||||
fmls d0, d17, alphaV0_I
|
||||
fmla d1, d16, alphaV1_I
|
||||
fmla d1, d17, alphaV1_R
|
||||
fmla d1, d16, alphaV0_I
|
||||
fmla d1, d17, alphaV0_R
|
||||
st2 {v0.d, v1.d}[0], [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #16
|
||||
|
@ -1072,8 +1086,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
stp x26, x27, [sp, #(9 * 16)]
|
||||
str x28, [sp, #(10 * 16)]
|
||||
|
||||
fmov alpha_save_R, d0
|
||||
fmov alpha_save_I, d1
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
prfm PLDL1KEEP, [origPA]
|
||||
|
||||
fmov alphaR, d0
|
||||
fmov alphaI, d1
|
||||
|
||||
lsl LDC, LDC, #4 // ldc = ldc * 2 * 8
|
||||
|
||||
|
@ -1085,8 +1102,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ble zgemm_kernel_L2_BEGIN
|
||||
|
||||
zgemm_kernel_L4_BEGIN:
|
||||
mov pCRow0, pC // pCRow0 = C
|
||||
add pC, pC, LDC, lsl #2
|
||||
mov pCRow0, pC
|
||||
add pCRow1, pCRow0, LDC
|
||||
add pCRow2, pCRow1, LDC
|
||||
add pCRow3, pCRow2, LDC
|
||||
|
||||
add pC, pCRow3, LDC
|
||||
|
||||
mov pA, origPA // pA = start of A array
|
||||
|
||||
zgemm_kernel_L4_M4_BEGIN:
|
||||
|
@ -1096,42 +1118,68 @@ zgemm_kernel_L4_M4_BEGIN:
|
|||
cmp counterI, #0
|
||||
ble zgemm_kernel_L4_M2_BEGIN
|
||||
|
||||
.align 5
|
||||
zgemm_kernel_L4_M4_20:
|
||||
|
||||
mov pB, origPB
|
||||
asr counterL , origK, #1 // L = K / 2
|
||||
cmp counterL , #2 // is there at least 4 to do?
|
||||
asr counterL , origK, #3
|
||||
cmp counterL , #2
|
||||
blt zgemm_kernel_L4_M4_32
|
||||
|
||||
KERNEL4x4_I // do one in the K
|
||||
KERNEL4x4_M2 // do another in the K
|
||||
KERNEL4x4_I
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
|
||||
subs counterL, counterL, #2 // subtract 2
|
||||
ble zgemm_kernel_L4_M4_22a
|
||||
.align 5
|
||||
|
||||
.align 5
|
||||
zgemm_kernel_L4_M4_22:
|
||||
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt zgemm_kernel_L4_M4_22
|
||||
|
||||
|
||||
.align 5
|
||||
zgemm_kernel_L4_M4_22a:
|
||||
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_E
|
||||
|
||||
b zgemm_kernel_L4_M4_44
|
||||
|
||||
.align 5
|
||||
zgemm_kernel_L4_M4_32:
|
||||
|
||||
tst counterL, #1
|
||||
ble zgemm_kernel_L4_M4_40
|
||||
|
||||
KERNEL4x4_I
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_E
|
||||
|
||||
b zgemm_kernel_L4_M4_44
|
||||
|
@ -1143,13 +1191,20 @@ zgemm_kernel_L4_M4_40:
|
|||
|
||||
zgemm_kernel_L4_M4_44:
|
||||
|
||||
ands counterL , origK, #1
|
||||
ands counterL , origK, #7
|
||||
ble zgemm_kernel_L4_M4_100
|
||||
|
||||
.align 5
|
||||
zgemm_kernel_L4_M4_46:
|
||||
KERNEL4x4_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bne zgemm_kernel_L4_M4_46
|
||||
|
||||
zgemm_kernel_L4_M4_100:
|
||||
prfm PLDL1KEEP, [pA]
|
||||
prfm PLDL1KEEP, [pA, #64]
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
|
||||
SAVE4x4
|
||||
|
||||
|
|
|
@ -43,6 +43,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define Y_OPTR x13 /* loop Y vector address */
|
||||
#define X_PTR x14 /* loop X vector address */
|
||||
|
||||
#define A_PRE_SIZE 768
|
||||
#define Y_PRE_SIZE 768
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
@ -50,14 +53,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#if !defined(DOUBLE)
|
||||
#define ALPHA_R s0
|
||||
#define ALPHA_I s1
|
||||
#define ALPHA_R_COPY s7
|
||||
#define ALPHA_I_COPY s8
|
||||
#define SHZ 3
|
||||
#else
|
||||
#define ALPHA_R d0
|
||||
#define ALPHA_I d1
|
||||
#define ALPHA_R_COPY d7
|
||||
#define ALPHA_I_COPY d8
|
||||
#define SHZ 4
|
||||
#endif
|
||||
|
||||
|
@ -95,20 +94,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
|
||||
.macro INIT
|
||||
/********** INIT FOR F4 LOOP **********/
|
||||
fmov ALPHA_R_COPY, ALPHA_R
|
||||
fmov ALPHA_I_COPY, ALPHA_I
|
||||
#if !defined(DOUBLE)
|
||||
ins v7.s[1], v7.s[0] // R(ALPHA), R(ALPHA)
|
||||
ins v8.s[1], v8.s[0] // I(ALPHA), I(ALPHA)
|
||||
ins v7.d[1], v7.d[0]
|
||||
ins v8.d[1], v8.d[0]
|
||||
#else
|
||||
ins v7.d[1], v7.d[0] // R(ALPHA), R(ALPHA)
|
||||
ins v8.d[1], v8.d[0] // I(ALPHA), I(ALPHA)
|
||||
#endif
|
||||
|
||||
/******* INIT FOR F1 AND S1 LOOP ******/
|
||||
#if !defined(DOUBLE)
|
||||
ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA)
|
||||
eor v2.16b, v2.16b, v2.16b
|
||||
|
@ -129,47 +114,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro INIT_LOOP
|
||||
/********** INIT_LOOP FOR F4 LOOP **********/
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v9.2s}, [X_PTR] // [I(X), R(X)]
|
||||
ins v10.s[0], v9.s[1]
|
||||
ins v9.s[1], v9.s[0] // [R(X), R(X)]
|
||||
ins v10.s[1], v10.s[0] // [I(X), I(X)]
|
||||
ins v9.d[1], v9.d[0]
|
||||
ins v10.d[1], v10.d[0]
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
|
||||
fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)]
|
||||
fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)]
|
||||
fmla v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)]
|
||||
#else
|
||||
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
|
||||
fmla v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)]
|
||||
fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)]
|
||||
fmls v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)]
|
||||
#endif
|
||||
#else // CONJ
|
||||
#if !defined(XCONJ)
|
||||
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
|
||||
fmls v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)]
|
||||
fmul v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)]
|
||||
fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)]
|
||||
#else
|
||||
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
|
||||
fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)]
|
||||
eor v12.16b, v12.16b, v12.16b
|
||||
fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)]
|
||||
fmla v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)]
|
||||
#endif
|
||||
#endif // CONJ
|
||||
|
||||
/****** INIT_LOOP FOR F1 AND S1 LOOP ******/
|
||||
ld1 {v2.2s}, [X_PTR] // [I(X), R(X)]
|
||||
ext v3.8b, v2.8b, v2.8b, #4 // [R(X), I(X)]
|
||||
fmul v2.2s, v0.2s, v2.2s
|
||||
fmla v2.2s, v1.2s, v3.2s // [I(TEMP), R(TEMP)]
|
||||
ins v3.s[0], v2.s[1]
|
||||
|
||||
/********** INIT_LOOP FOR F4 LOOP **********/
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
dup v21.4s, v2.s[0] // R[TEMP]
|
||||
dup v22.4s, v2.s[0] // R[TEMP]
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fsub s25, s25, s3
|
||||
dup v23.4s, v25.s[0] // -I[TEMP]
|
||||
dup v24.4s, v3.s[0] // I[TEMP]
|
||||
#else
|
||||
dup v21.4s, v2.s[0] // R[TEMP]
|
||||
dup v22.4s, v2.s[0] // R[TEMP]
|
||||
dup v23.4s, v3.s[0] // I[TEMP]
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fsub s25, s25, s3
|
||||
dup v24.4s, v25.s[0] // -I[TEMP]
|
||||
#endif
|
||||
#else // CONJ
|
||||
#if !defined(XCONJ)
|
||||
dup v21.4s, v2.s[0] // R[TEMP]
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fsub s25, s25, s2
|
||||
dup v22.4s, v25.s[0] // R[TEMP]
|
||||
dup v23.4s, v3.s[0] // I[TEMP]
|
||||
dup v24.4s, v3.s[0] // I[TEMP]
|
||||
#else
|
||||
dup v21.4s, v2.s[0] // R[TEMP]
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fsub s25, s25, s2
|
||||
dup v22.4s, v25.s[0] // R[TEMP]
|
||||
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fsub s25, s25, s3
|
||||
dup v23.4s, v25.s[0] // I[TEMP]
|
||||
dup v24.4s, v25.s[0] // I[TEMP]
|
||||
#endif
|
||||
#endif // CONJ
|
||||
|
||||
|
||||
/****** INIT_LOOP FOR F1 AND S1 LOOP ******/
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
eor v4.16b, v4.16b, v4.16b
|
||||
|
@ -200,45 +191,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif // CONJ
|
||||
|
||||
#else // DOUBLE
|
||||
|
||||
/********** INIT_LOOP FOR F4 LOOP **********/
|
||||
ld1 {v9.2d}, [X_PTR] // [I(X), R(X)]
|
||||
ins v10.d[0], v9.d[1]
|
||||
ins v9.d[1], v9.d[0] // [R(X), R(X)]
|
||||
ins v10.d[1], v10.d[0] // [I(X), I(X)]
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
|
||||
fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)]
|
||||
fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)]
|
||||
fmla v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)]
|
||||
#else
|
||||
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
|
||||
fmla v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)]
|
||||
fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)]
|
||||
fmls v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)]
|
||||
#endif
|
||||
#else // CONJ
|
||||
#if !defined(XCONJ)
|
||||
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
|
||||
fmls v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)]
|
||||
fmul v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)]
|
||||
fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)]
|
||||
#else
|
||||
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
|
||||
fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)]
|
||||
eor v12.16b, v12.16b, v12.16b
|
||||
fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)]
|
||||
fmla v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)]
|
||||
#endif
|
||||
#endif // CONJ
|
||||
|
||||
/****** INIT_LOOP FOR F1 AND S1 LOOP ******/
|
||||
ld1 {v2.2d}, [X_PTR] // [I(X), R(X)]
|
||||
ext v3.16b, v2.16b, v2.16b, #8 // [R(X), I(X)]
|
||||
fmul v2.2d, v0.2d, v2.2d
|
||||
fmla v2.2d, v1.2d, v3.2d // [I(TEMP), R(TEMP)]
|
||||
ins v3.d[0], v2.d[1] // I(TEMP)
|
||||
|
||||
/****** INIT_LOOP FOR F4 LOOP ******/
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
dup v21.2d, v2.d[0] // R[TEMP]
|
||||
dup v22.2d, v2.d[0] // R[TEMP]
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fsub d25, d25, d3
|
||||
dup v23.2d, v25.d[0] // -I[TEMP]
|
||||
dup v24.2d, v3.d[0] // I[TEMP]
|
||||
#else
|
||||
dup v21.2d, v2.d[0] // R[TEMP]
|
||||
dup v22.2d, v2.d[0] // R[TEMP]
|
||||
dup v23.2d, v3.d[0] // I[TEMP]
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fsub d25, d25, d3
|
||||
dup v24.2d, v25.d[0] // -I[TEMP]
|
||||
#endif
|
||||
#else // CONJ
|
||||
#if !defined(XCONJ)
|
||||
dup v21.2d, v2.d[0] // R[TEMP]
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fsub d25, d25, d2
|
||||
dup v22.2d, v25.d[0] // R[TEMP]
|
||||
dup v23.2d, v3.d[0] // I[TEMP]
|
||||
dup v24.2d, v3.d[0] // I[TEMP]
|
||||
#else
|
||||
dup v21.2d, v2.d[0] // R[TEMP]
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fsub d25, d25, d2
|
||||
dup v22.2d, v25.d[0] // R[TEMP]
|
||||
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fsub d25, d25, d3
|
||||
dup v23.2d, v25.d[0] // I[TEMP]
|
||||
dup v24.2d, v25.d[0] // I[TEMP]
|
||||
#endif
|
||||
#endif // CONJ
|
||||
|
||||
|
||||
/****** INIT_LOOP FOR F1 AND S1 LOOP ******/
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
eor v4.16b, v4.16b, v4.16b
|
||||
|
@ -276,91 +274,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
ld2 {v13.4s, v14.4s}, [A_PTR], #32
|
||||
ld2 {v15.4s, v16.4s}, [Y_IPTR], #32
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
|
||||
fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I]
|
||||
fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I]
|
||||
fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R]
|
||||
#else
|
||||
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
|
||||
fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I]
|
||||
fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I]
|
||||
fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R]
|
||||
#endif
|
||||
#else // CONJ
|
||||
#if !defined(XCONJ)
|
||||
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
|
||||
fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I]
|
||||
fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I]
|
||||
fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R]
|
||||
#else
|
||||
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
|
||||
fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I]
|
||||
fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I]
|
||||
fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R]
|
||||
#endif
|
||||
#endif // CONJ
|
||||
|
||||
prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
|
||||
prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
|
||||
|
||||
fmla v15.4s, v21.4s, v13.4s
|
||||
fmla v15.4s, v23.4s, v14.4s
|
||||
fmla v16.4s, v22.4s, v14.4s
|
||||
fmla v16.4s, v24.4s, v13.4s
|
||||
|
||||
st2 {v15.4s, v16.4s}, [Y_OPTR], #32
|
||||
|
||||
#else // DOUBLE
|
||||
|
||||
ld2 {v13.2d, v14.2d}, [A_PTR], #32
|
||||
ld2 {v15.2d, v16.2d}, [Y_IPTR], #32
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
|
||||
fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I]
|
||||
fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I]
|
||||
fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R]
|
||||
#else
|
||||
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
|
||||
fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I]
|
||||
fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I]
|
||||
fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R]
|
||||
#endif
|
||||
#else // CONJ
|
||||
#if !defined(XCONJ)
|
||||
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
|
||||
fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I]
|
||||
fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I]
|
||||
fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R]
|
||||
#else
|
||||
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
|
||||
fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I]
|
||||
fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I]
|
||||
fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R]
|
||||
#endif
|
||||
#endif // CONJ
|
||||
prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
|
||||
|
||||
fmla v15.2d, v21.2d, v13.2d
|
||||
fmla v15.2d, v23.2d, v14.2d
|
||||
fmla v16.2d, v22.2d, v14.2d
|
||||
fmla v16.2d, v24.2d, v13.2d
|
||||
|
||||
st2 {v15.2d, v16.2d}, [Y_OPTR], #32
|
||||
|
||||
ld2 {v17.2d, v18.2d}, [A_PTR], #32
|
||||
ld2 {v19.2d, v20.2d}, [Y_IPTR], #32
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
|
||||
fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I]
|
||||
fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I]
|
||||
fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R]
|
||||
#else
|
||||
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
|
||||
fmla v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I]
|
||||
fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I]
|
||||
fmls v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R]
|
||||
#endif
|
||||
#else // CONJ
|
||||
#if !defined(XCONJ)
|
||||
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
|
||||
fmla v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I]
|
||||
fmls v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I]
|
||||
fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R]
|
||||
#else
|
||||
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
|
||||
fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I]
|
||||
fmls v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I]
|
||||
fmls v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R]
|
||||
#endif
|
||||
#endif // CONJ
|
||||
prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
|
||||
|
||||
fmla v19.2d, v21.2d, v17.2d
|
||||
fmla v19.2d, v23.2d, v18.2d
|
||||
fmla v20.2d, v22.2d, v18.2d
|
||||
fmla v20.2d, v24.2d, v17.2d
|
||||
|
||||
st2 {v19.2d, v20.2d}, [Y_OPTR], #32
|
||||
|
||||
#endif
|
||||
|
@ -445,10 +391,7 @@ zgemv_n_kernel_F_LOOP:
|
|||
|
||||
zgemv_n_kernel_F4:
|
||||
|
||||
KERNEL_F1
|
||||
KERNEL_F1
|
||||
KERNEL_F1
|
||||
KERNEL_F1
|
||||
KERNEL_F4
|
||||
|
||||
subs I, I, #1
|
||||
bne zgemv_n_kernel_F4
|
||||
|
|
|
@ -41,6 +41,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define J x11 /* loop variable */
|
||||
#define I x12 /* loop variable */
|
||||
|
||||
#define A_PRE_SIZE 768
|
||||
#define X_PRE_SIZE 768
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
@ -139,6 +142,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
ld2 {v11.4s, v12.4s}, [X_PTR], #32
|
||||
ld2 {v13.4s, v14.4s}, [A_PTR], #32
|
||||
prfm PLDL1STRM, [X_PTR, #X_PRE_SIZE]
|
||||
prfm PLDL1STRM, [A_PTR, #A_PRE_SIZE]
|
||||
|
||||
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
|
||||
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
|
||||
|
@ -155,7 +160,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#else // DOUBLE
|
||||
ld2 {v11.2d, v12.2d}, [X_PTR], #32
|
||||
ld2 {v13.2d, v14.2d}, [A_PTR], #32
|
||||
prfm PLDL1STRM, [X_PTR, #512]
|
||||
prfm PLDL1STRM, [X_PTR, #X_PRE_SIZE]
|
||||
|
||||
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
|
||||
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
|
||||
|
@ -171,7 +176,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
ld2 {v17.2d, v18.2d}, [X_PTR], #32
|
||||
ld2 {v19.2d, v20.2d}, [A_PTR], #32
|
||||
prfm PLDL1STRM, [A_PTR, #512]
|
||||
prfm PLDL1STRM, [A_PTR, #A_PRE_SIZE]
|
||||
|
||||
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
|
||||
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
|
||||
|
|
|
@ -46,23 +46,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define pCRow0 x12
|
||||
#define pCRow1 x13
|
||||
#define pCRow2 x14
|
||||
#define pA x15
|
||||
#define alpha_save_R x16
|
||||
#define alpha_save_I x17
|
||||
#define temp x18
|
||||
#define tempOffset x19
|
||||
#define tempK x20
|
||||
#define pCRow3 x15
|
||||
#define pA x16
|
||||
#define alphaR x17
|
||||
#define alphaI x18
|
||||
#define temp x19
|
||||
#define tempOffset x20
|
||||
#define tempK x21
|
||||
|
||||
#define alpha0_R d10
|
||||
#define alphaV0_R v10.d[0]
|
||||
#define alpha0_I d11
|
||||
#define alphaV0_I v11.d[0]
|
||||
|
||||
#define alpha1_R d14
|
||||
#define alphaV1_R v14.d[0]
|
||||
#define alpha1_I d15
|
||||
#define alphaV1_I v15.d[0]
|
||||
|
||||
#define A_PRE_SIZE 2560
|
||||
#define B_PRE_SIZE 448
|
||||
#define C_PRE_SIZE 128
|
||||
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
#define OP_rr fmla
|
||||
|
@ -93,7 +92,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
// 04 origPB
|
||||
// 05 pC
|
||||
// 06 origLDC -> LDC
|
||||
// 07 offset
|
||||
// 07 offset -> temp
|
||||
// 08 counterL
|
||||
// 09 counterI
|
||||
// 10 counterJ
|
||||
|
@ -101,13 +100,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
// 12 pCRow0
|
||||
// 13 pCRow1
|
||||
// 14 pCRow2
|
||||
// 15 pA
|
||||
// 16 alpha_save_R
|
||||
// 17 alpha_save_I
|
||||
// 18 must save temp
|
||||
// 19 must save tempOffset
|
||||
// 20 must save tempK
|
||||
// 21 must save
|
||||
// 15 pCRow3
|
||||
// 16 pA
|
||||
// 17 alpha_save_R
|
||||
// 18 must save alpha_save_I
|
||||
// 19 must save temp
|
||||
// 20 must save tempOffset
|
||||
// 21 must save tempK
|
||||
// 22 must save
|
||||
// 23 must save
|
||||
// 24 must save
|
||||
|
@ -178,12 +177,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.macro KERNEL4x4_I
|
||||
ld2 {v8.2d, v9.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld2 {v10.2d, v11.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld2 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v16.2d, v0.2d, v8.d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.d[0]
|
||||
|
@ -196,16 +191,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
fmul v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v19.16b, v19.16b, v19.16b
|
||||
fmls v19.2d, v2.2d, v9.d[0]
|
||||
#else
|
||||
fmul v19.2d, v2.2d, v9.d[0]
|
||||
#endif
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
|
@ -218,6 +205,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
ld2 {v10.2d, v11.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
fmul v22.2d, v2.2d, v8.d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
|
@ -229,6 +219,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v23.2d, v3.2d, v8.d[1]
|
||||
|
||||
ld2 {v12.2d, v13.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
fmul v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v19.16b, v19.16b, v19.16b
|
||||
fmls v19.2d, v2.2d, v9.d[0]
|
||||
#else
|
||||
fmul v19.2d, v2.2d, v9.d[0]
|
||||
#endif
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
ld2 {v4.2d, v5.2d} , [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v24.2d, v0.2d, v10.d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
|
@ -240,6 +247,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
ld2 {v6.2d, v7.2d} , [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v26.2d, v2.2d, v10.d[0]
|
||||
OP_ii v26.2d, v3.2d, v11.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
|
@ -251,6 +261,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
ld2 {v14.2d, v15.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
fmul v28.2d, v0.2d, v10.d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
|
@ -262,6 +275,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v29.2d, v1.2d, v10.d[1]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
fmul v30.2d, v2.2d, v10.d[1]
|
||||
OP_ii v30.2d, v3.2d, v11.d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
|
@ -273,14 +288,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
OP_ir v31.2d, v3.2d, v10.d[1]
|
||||
|
||||
ld2 {v12.2d, v13.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld2 {v14.2d, v15.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld2 {v4.2d, v5.2d} , [pA]
|
||||
add pA, pA, #32
|
||||
ld2 {v6.2d, v7.2d} , [pA]
|
||||
add pA, pA, #32
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_M1
|
||||
|
@ -289,7 +297,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v17.2d, v0.2d, v9.d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
ld2 {v12.2d, v13.2d}, [pB] // For next round
|
||||
ld2 {v12.2d, v13.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v18.2d, v2.2d, v8.d[0]
|
||||
|
@ -297,15 +305,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v19.2d, v2.2d, v9.d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
ld2 {v14.2d, v15.2d}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
ld2 {v4.2d, v5.2d} , [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
ld2 {v4.2d, v5.2d} , [pA] // For next round
|
||||
ld2 {v6.2d, v7.2d} , [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v22.2d, v2.2d, v8.d[1]
|
||||
|
@ -313,22 +321,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v23.2d, v2.2d, v9.d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.d[1]
|
||||
|
||||
ld2 {v6.2d, v7.2d} , [pA] // For next round
|
||||
add pA, pA, #32
|
||||
ld2 {v14.2d, v15.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v24.2d, v0.2d, v10.d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.d[0]
|
||||
OP_ri v25.2d, v0.2d, v11.d[0]
|
||||
OP_ir v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
OP_rr v26.2d, v2.2d, v10.d[0]
|
||||
OP_ii v26.2d, v3.2d, v11.d[0]
|
||||
OP_ri v27.2d, v2.2d, v11.d[0]
|
||||
OP_ir v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
|
||||
OP_rr v28.2d, v0.2d, v10.d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.d[1]
|
||||
|
@ -347,7 +355,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v17.2d, v4.2d, v13.d[0]
|
||||
OP_ir v17.2d, v5.2d, v12.d[0]
|
||||
|
||||
ld2 {v8.2d, v9.2d}, [pB] // For next round
|
||||
ld2 {v8.2d, v9.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v18.2d, v6.2d, v12.d[0]
|
||||
|
@ -355,15 +363,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v19.2d, v6.2d, v13.d[0]
|
||||
OP_ir v19.2d, v7.2d, v12.d[0]
|
||||
|
||||
ld2 {v10.2d, v11.2d}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
ld2 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v20.2d, v4.2d, v12.d[1]
|
||||
OP_ii v20.2d, v5.2d, v13.d[1]
|
||||
OP_ri v21.2d, v4.2d, v13.d[1]
|
||||
OP_ir v21.2d, v5.2d, v12.d[1]
|
||||
|
||||
ld2 {v0.2d, v1.2d}, [pA] // For next round
|
||||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v22.2d, v6.2d, v12.d[1]
|
||||
|
@ -371,22 +379,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v23.2d, v6.2d, v13.d[1]
|
||||
OP_ir v23.2d, v7.2d, v12.d[1]
|
||||
|
||||
ld2 {v2.2d, v3.2d}, [pA] // For next round
|
||||
add pA, pA, #32
|
||||
ld2 {v10.2d, v11.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v24.2d, v4.2d, v14.d[0]
|
||||
OP_ii v24.2d, v5.2d, v15.d[0]
|
||||
OP_ri v25.2d, v4.2d, v15.d[0]
|
||||
OP_ir v25.2d, v5.2d, v14.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
OP_rr v26.2d, v6.2d, v14.d[0]
|
||||
OP_ii v26.2d, v7.2d, v15.d[0]
|
||||
OP_ri v27.2d, v6.2d, v15.d[0]
|
||||
OP_ir v27.2d, v7.2d, v14.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
|
||||
|
||||
OP_rr v28.2d, v4.2d, v14.d[1]
|
||||
OP_ii v28.2d, v5.2d, v15.d[1]
|
||||
|
@ -415,6 +423,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v21.2d, v4.2d, v13.d[1]
|
||||
OP_ir v21.2d, v5.2d, v12.d[1]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
OP_rr v22.2d, v6.2d, v12.d[1]
|
||||
OP_ii v22.2d, v7.2d, v13.d[1]
|
||||
OP_ri v23.2d, v6.2d, v13.d[1]
|
||||
|
@ -425,6 +435,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri v25.2d, v4.2d, v15.d[0]
|
||||
OP_ir v25.2d, v5.2d, v14.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
|
||||
|
||||
OP_rr v26.2d, v6.2d, v14.d[0]
|
||||
OP_ii v26.2d, v7.2d, v15.d[0]
|
||||
OP_ri v27.2d, v6.2d, v15.d[0]
|
||||
|
@ -444,33 +456,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.macro KERNEL4x4_SUB
|
||||
ld2 {v8.2d, v9.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld2 {v10.2d, v11.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
ld2 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.2d, v0.2d, v8.d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
OP_rr v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
OP_ri v19.2d, v2.2d, v9.d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
ld2 {v10.2d, v11.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
OP_ri v19.2d, v2.2d, v9.d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
OP_rr v22.2d, v2.2d, v8.d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.d[1]
|
||||
OP_ri v23.2d, v2.2d, v9.d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.d[1]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
OP_rr v24.2d, v0.2d, v10.d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.d[0]
|
||||
OP_ri v25.2d, v0.2d, v11.d[0]
|
||||
|
@ -493,66 +512,77 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
fmul v0.2d, v16.2d, alphaV0_R
|
||||
fmls v0.2d, v17.2d, alphaV0_I
|
||||
fmul v1.2d, v16.2d, alphaV1_I
|
||||
fmla v1.2d, v17.2d, alphaV1_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
fmul v1.2d, v16.2d, alphaV0_I
|
||||
fmla v1.2d, v17.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow0]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
||||
fmul v2.2d, v18.2d, alphaV0_R
|
||||
fmls v2.2d, v19.2d, alphaV0_I
|
||||
fmul v3.2d, v18.2d, alphaV1_I
|
||||
fmla v3.2d, v19.2d, alphaV1_R
|
||||
st2 {v2.2d, v3.2d}, [pCRow2]
|
||||
fmul v3.2d, v18.2d, alphaV0_I
|
||||
fmla v3.2d, v19.2d, alphaV0_R
|
||||
st2 {v2.2d, v3.2d}, [pCRow0]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
fmul v4.2d, v20.2d, alphaV0_R
|
||||
fmls v4.2d, v21.2d, alphaV0_I
|
||||
fmul v5.2d, v20.2d, alphaV1_I
|
||||
fmla v5.2d, v21.2d, alphaV1_R
|
||||
fmul v5.2d, v20.2d, alphaV0_I
|
||||
fmla v5.2d, v21.2d, alphaV0_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
|
||||
add pCRow1, pCRow1, #32
|
||||
|
||||
fmul v6.2d, v22.2d, alphaV0_R
|
||||
fmls v6.2d, v23.2d, alphaV0_I
|
||||
fmul v7.2d, v22.2d, alphaV1_I
|
||||
fmla v7.2d, v23.2d, alphaV1_R
|
||||
st2 {v6.2d, v7.2d}, [pCRow2]
|
||||
fmul v7.2d, v22.2d, alphaV0_I
|
||||
fmla v7.2d, v23.2d, alphaV0_R
|
||||
st2 {v6.2d, v7.2d}, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, #32
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
fmul v0.2d, v24.2d, alphaV0_R
|
||||
fmls v0.2d, v25.2d, alphaV0_I
|
||||
fmul v1.2d, v24.2d, alphaV1_I
|
||||
fmla v1.2d, v25.2d, alphaV1_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
fmul v1.2d, v24.2d, alphaV0_I
|
||||
fmla v1.2d, v25.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow2]
|
||||
|
||||
add pCRow2, pCRow2, #32
|
||||
|
||||
fmul v2.2d, v26.2d, alphaV0_R
|
||||
fmls v2.2d, v27.2d, alphaV0_I
|
||||
fmul v3.2d, v26.2d, alphaV1_I
|
||||
fmla v3.2d, v27.2d, alphaV1_R
|
||||
fmul v3.2d, v26.2d, alphaV0_I
|
||||
fmla v3.2d, v27.2d, alphaV0_R
|
||||
st2 {v2.2d, v3.2d}, [pCRow2]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
add pCRow2, pCRow2, #32
|
||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||
|
||||
fmul v4.2d, v28.2d, alphaV0_R
|
||||
fmls v4.2d, v29.2d, alphaV0_I
|
||||
fmul v5.2d, v28.2d, alphaV1_I
|
||||
fmla v5.2d, v29.2d, alphaV1_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
fmul v5.2d, v28.2d, alphaV0_I
|
||||
fmla v5.2d, v29.2d, alphaV0_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow3]
|
||||
|
||||
add pCRow3, pCRow3, #32
|
||||
|
||||
fmul v6.2d, v30.2d, alphaV0_R
|
||||
fmls v6.2d, v31.2d, alphaV0_I
|
||||
fmul v7.2d, v30.2d, alphaV1_I
|
||||
fmla v7.2d, v31.2d, alphaV1_R
|
||||
st2 {v6.2d, v7.2d}, [pCRow2]
|
||||
fmul v7.2d, v30.2d, alphaV0_I
|
||||
fmla v7.2d, v31.2d, alphaV0_R
|
||||
st2 {v6.2d, v7.2d}, [pCRow3]
|
||||
|
||||
add pCRow0, pCRow0, #64
|
||||
add pCRow3, pCRow3, #32
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
@ -599,41 +629,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
fmul v0.2d, v16.2d, alphaV0_R
|
||||
fmls v0.2d, v17.2d, alphaV0_I
|
||||
fmul v1.2d, v16.2d, alphaV1_I
|
||||
fmla v1.2d, v17.2d, alphaV1_R
|
||||
fmul v1.2d, v16.2d, alphaV0_I
|
||||
fmla v1.2d, v17.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
||||
fmul v4.2d, v20.2d, alphaV0_R
|
||||
fmls v4.2d, v21.2d, alphaV0_I
|
||||
fmul v5.2d, v20.2d, alphaV1_I
|
||||
fmla v5.2d, v21.2d, alphaV1_R
|
||||
fmul v5.2d, v20.2d, alphaV0_I
|
||||
fmla v5.2d, v21.2d, alphaV0_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
||||
fmul v0.2d, v24.2d, alphaV0_R
|
||||
fmls v0.2d, v25.2d, alphaV0_I
|
||||
fmul v1.2d, v24.2d, alphaV1_I
|
||||
fmla v1.2d, v25.2d, alphaV1_R
|
||||
fmul v1.2d, v24.2d, alphaV0_I
|
||||
fmla v1.2d, v25.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
||||
fmul v4.2d, v28.2d, alphaV0_R
|
||||
fmls v4.2d, v29.2d, alphaV0_I
|
||||
fmul v5.2d, v28.2d, alphaV1_I
|
||||
fmla v5.2d, v29.2d, alphaV1_R
|
||||
fmul v5.2d, v28.2d, alphaV0_I
|
||||
fmla v5.2d, v29.2d, alphaV0_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
@ -682,41 +710,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE1x4
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
fmul d0, d16, alphaV0_R
|
||||
fmls d0, d17, alphaV0_I
|
||||
fmul d1, d16, alphaV1_I
|
||||
fmla d1, d17, alphaV1_R
|
||||
fmul d1, d16, alphaV0_I
|
||||
fmla d1, d17, alphaV0_R
|
||||
st2 {v0.d, v1.d}[0], [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
||||
fmul d4, d20, alphaV0_R
|
||||
fmls d4, d21, alphaV0_I
|
||||
fmul d5, d20, alphaV1_I
|
||||
fmla d5, d21, alphaV1_R
|
||||
fmul d5, d20, alphaV0_I
|
||||
fmla d5, d21, alphaV0_R
|
||||
st2 {v4.d, v5.d}[0], [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
||||
fmul d0, d24, alphaV0_R
|
||||
fmls d0, d25, alphaV0_I
|
||||
fmul d1, d24, alphaV1_I
|
||||
fmla d1, d25, alphaV1_R
|
||||
fmul d1, d24, alphaV0_I
|
||||
fmla d1, d25, alphaV0_R
|
||||
st2 {v0.d, v1.d}[0], [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
||||
fmul d4, d28, alphaV0_R
|
||||
fmls d4, d29, alphaV0_I
|
||||
fmul d5, d28, alphaV1_I
|
||||
fmla d5, d29, alphaV1_R
|
||||
fmul d5, d28, alphaV0_I
|
||||
fmla d5, d29, alphaV0_R
|
||||
st2 {v4.d, v5.d}[0], [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #16
|
||||
|
@ -765,37 +791,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
fmul v0.2d, v16.2d, alphaV0_R
|
||||
fmls v0.2d, v17.2d, alphaV0_I
|
||||
fmul v1.2d, v16.2d, alphaV1_I
|
||||
fmla v1.2d, v17.2d, alphaV1_R
|
||||
fmul v1.2d, v16.2d, alphaV0_I
|
||||
fmla v1.2d, v17.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
fmul v2.2d, v18.2d, alphaV0_R
|
||||
fmls v2.2d, v19.2d, alphaV0_I
|
||||
fmul v3.2d, v18.2d, alphaV1_I
|
||||
fmla v3.2d, v19.2d, alphaV1_R
|
||||
fmul v3.2d, v18.2d, alphaV0_I
|
||||
fmla v3.2d, v19.2d, alphaV0_R
|
||||
st2 {v2.2d, v3.2d}, [pCRow2]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
||||
fmul v4.2d, v20.2d, alphaV0_R
|
||||
fmls v4.2d, v21.2d, alphaV0_I
|
||||
fmul v5.2d, v20.2d, alphaV1_I
|
||||
fmla v5.2d, v21.2d, alphaV1_R
|
||||
fmul v5.2d, v20.2d, alphaV0_I
|
||||
fmla v5.2d, v21.2d, alphaV0_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
fmul v6.2d, v22.2d, alphaV0_R
|
||||
fmls v6.2d, v23.2d, alphaV0_I
|
||||
fmul v7.2d, v22.2d, alphaV1_I
|
||||
fmla v7.2d, v23.2d, alphaV1_R
|
||||
fmul v7.2d, v22.2d, alphaV0_I
|
||||
fmla v7.2d, v23.2d, alphaV0_R
|
||||
st2 {v6.2d, v7.2d}, [pCRow2]
|
||||
|
||||
add pCRow0, pCRow0, #64
|
||||
|
@ -828,25 +852,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
fmul v0.2d, v16.2d, alphaV0_R
|
||||
fmls v0.2d, v17.2d, alphaV0_I
|
||||
fmul v1.2d, v16.2d, alphaV1_I
|
||||
fmla v1.2d, v17.2d, alphaV1_R
|
||||
fmul v1.2d, v16.2d, alphaV0_I
|
||||
fmla v1.2d, v17.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
||||
fmul v4.2d, v20.2d, alphaV0_R
|
||||
fmls v4.2d, v21.2d, alphaV0_I
|
||||
fmul v5.2d, v20.2d, alphaV1_I
|
||||
fmla v5.2d, v21.2d, alphaV1_R
|
||||
fmul v5.2d, v20.2d, alphaV0_I
|
||||
fmla v5.2d, v21.2d, alphaV0_R
|
||||
st2 {v4.2d, v5.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
@ -879,25 +901,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
fmul d0, d16, alphaV0_R
|
||||
fmls d0, d17, alphaV0_I
|
||||
fmul d1, d16, alphaV1_I
|
||||
fmla d1, d17, alphaV1_R
|
||||
fmul d1, d16, alphaV0_I
|
||||
fmla d1, d17, alphaV0_R
|
||||
st2 {v0.d, v1.d}[0], [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, LDC
|
||||
|
||||
fmul d4, d20, alphaV0_R
|
||||
fmls d4, d21, alphaV0_I
|
||||
fmul d5, d20, alphaV1_I
|
||||
fmla d5, d21, alphaV1_R
|
||||
fmul d5, d20, alphaV0_I
|
||||
fmla d5, d21, alphaV0_R
|
||||
st2 {v4.d, v5.d}[0], [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #16
|
||||
|
@ -932,23 +952,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE4x1
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
fmul v0.2d, v16.2d, alphaV0_R
|
||||
fmls v0.2d, v17.2d, alphaV0_I
|
||||
fmul v1.2d, v16.2d, alphaV1_I
|
||||
fmla v1.2d, v17.2d, alphaV1_R
|
||||
fmul v1.2d, v16.2d, alphaV0_I
|
||||
fmla v1.2d, v17.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
add pCRow2, pCRow1, #32
|
||||
fmul v2.2d, v18.2d, alphaV0_R
|
||||
fmls v2.2d, v19.2d, alphaV0_I
|
||||
fmul v3.2d, v18.2d, alphaV1_I
|
||||
fmla v3.2d, v19.2d, alphaV1_R
|
||||
fmul v3.2d, v18.2d, alphaV0_I
|
||||
fmla v3.2d, v19.2d, alphaV0_R
|
||||
st2 {v2.2d, v3.2d}, [pCRow2]
|
||||
|
||||
add pCRow0, pCRow0, #64
|
||||
|
@ -974,17 +992,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE2x1
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
fmul v0.2d, v16.2d, alphaV0_R
|
||||
fmls v0.2d, v17.2d, alphaV0_I
|
||||
fmul v1.2d, v16.2d, alphaV1_I
|
||||
fmla v1.2d, v17.2d, alphaV1_R
|
||||
fmul v1.2d, v16.2d, alphaV0_I
|
||||
fmla v1.2d, v17.2d, alphaV0_R
|
||||
st2 {v0.2d, v1.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
@ -1011,17 +1027,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE1x1
|
||||
fmov alpha0_R, alpha_save_R
|
||||
fmov alpha0_I, alpha_save_I
|
||||
fmov alpha1_R, alpha0_R
|
||||
fmov alpha1_I, alpha0_I
|
||||
fmov alpha0_R, alphaR
|
||||
fmov alpha0_I, alphaI
|
||||
|
||||
mov pCRow1, pCRow0
|
||||
|
||||
fmul d0, d16, alphaV0_R
|
||||
fmls d0, d17, alphaV0_I
|
||||
fmul d1, d16, alphaV1_I
|
||||
fmla d1, d17, alphaV1_R
|
||||
fmul d1, d16, alphaV0_I
|
||||
fmla d1, d17, alphaV0_R
|
||||
st2 {v0.d, v1.d}[0], [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #16
|
||||
|
@ -1047,8 +1061,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
stp x26, x27, [sp, #(9 * 16)]
|
||||
str x28, [sp, #(10 * 16)]
|
||||
|
||||
fmov alpha_save_R, d0
|
||||
fmov alpha_save_I, d1
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
prfm PLDL1KEEP, [origPA]
|
||||
|
||||
fmov alphaR, d0
|
||||
fmov alphaI, d1
|
||||
|
||||
lsl LDC, LDC, #4 // ldc = ldc * 2 * 8
|
||||
|
||||
|
@ -1064,8 +1081,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ble ztrmm_kernel_L2_BEGIN
|
||||
|
||||
ztrmm_kernel_L4_BEGIN:
|
||||
mov pCRow0, pC // pCRow0 = C
|
||||
add pC, pC, LDC, lsl #2
|
||||
mov pCRow0, pC
|
||||
add pCRow1, pCRow0, LDC
|
||||
add pCRow2, pCRow1, LDC
|
||||
add pCRow3, pCRow2, LDC
|
||||
|
||||
add pC, pCRow3, LDC
|
||||
|
||||
|
||||
#if defined(LEFT)
|
||||
mov tempOffset, offset
|
||||
|
@ -1079,6 +1101,7 @@ ztrmm_kernel_L4_M4_BEGIN:
|
|||
cmp counterI, #0
|
||||
ble ztrmm_kernel_L4_M2_BEGIN
|
||||
|
||||
.align 5
|
||||
ztrmm_kernel_L4_M4_20:
|
||||
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
|
@ -1098,39 +1121,64 @@ ztrmm_kernel_L4_M4_20:
|
|||
add tempK, tempOffset, #4
|
||||
#endif
|
||||
|
||||
asr counterL , tempK, #1 // L = K / 2
|
||||
cmp counterL , #2 // is there at least 4 to do?
|
||||
asr counterL , tempK, #3
|
||||
cmp counterL , #2
|
||||
blt ztrmm_kernel_L4_M4_32
|
||||
|
||||
KERNEL4x4_I // do one in the K
|
||||
KERNEL4x4_M2 // do another in the K
|
||||
KERNEL4x4_I
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
|
||||
subs counterL, counterL, #2
|
||||
ble ztrmm_kernel_L4_M4_22a
|
||||
.align 5
|
||||
|
||||
.align 5
|
||||
ztrmm_kernel_L4_M4_22:
|
||||
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt ztrmm_kernel_L4_M4_22
|
||||
|
||||
|
||||
.align 5
|
||||
ztrmm_kernel_L4_M4_22a:
|
||||
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_E
|
||||
|
||||
b ztrmm_kernel_L4_M4_44
|
||||
|
||||
.align 5
|
||||
ztrmm_kernel_L4_M4_32:
|
||||
|
||||
tst counterL, #1
|
||||
ble ztrmm_kernel_L4_M4_40
|
||||
|
||||
KERNEL4x4_I
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_E
|
||||
|
||||
b ztrmm_kernel_L4_M4_44
|
||||
|
@ -1142,12 +1190,16 @@ ztrmm_kernel_L4_M4_40:
|
|||
|
||||
ztrmm_kernel_L4_M4_44:
|
||||
|
||||
ands counterL , tempK, #1
|
||||
ands counterL , tempK, #7
|
||||
ble ztrmm_kernel_L4_M4_100
|
||||
|
||||
.align 5
|
||||
ztrmm_kernel_L4_M4_46:
|
||||
KERNEL4x4_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bne ztrmm_kernel_L4_M4_46
|
||||
|
||||
ztrmm_kernel_L4_M4_100:
|
||||
|
||||
SAVE4x4
|
||||
|
@ -1167,6 +1219,10 @@ ztrmm_kernel_L4_M4_100:
|
|||
add tempOffset, tempOffset, #4
|
||||
#endif
|
||||
|
||||
prfm PLDL1KEEP, [pA]
|
||||
prfm PLDL1KEEP, [pA, #64]
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
|
||||
ztrmm_kernel_L4_M4_END:
|
||||
subs counterI, counterI, #1
|
||||
bne ztrmm_kernel_L4_M4_20
|
||||
|
|
6
param.h
6
param.h
|
@ -2341,13 +2341,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 4
|
||||
#define SGEMM_DEFAULT_UNROLL_M 16
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define DGEMM_DEFAULT_UNROLL_M 4
|
||||
#define DGEMM_DEFAULT_UNROLL_M 8
|
||||
#define DGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define CGEMM_DEFAULT_UNROLL_M 4
|
||||
#define CGEMM_DEFAULT_UNROLL_M 8
|
||||
#define CGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define ZGEMM_DEFAULT_UNROLL_M 4
|
||||
|
|
Loading…
Reference in New Issue