Merge branch 'develop'
This commit is contained in:
commit
12ab1804b6
|
|
@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.4)
|
|||
project(OpenBLAS)
|
||||
set(OpenBLAS_MAJOR_VERSION 0)
|
||||
set(OpenBLAS_MINOR_VERSION 2)
|
||||
set(OpenBLAS_PATCH_VERSION 17)
|
||||
set(OpenBLAS_PATCH_VERSION 18)
|
||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||
|
||||
enable_language(ASM)
|
||||
|
|
|
|||
|
|
@ -147,5 +147,6 @@ In chronological order:
|
|||
* [2016-03-14] Additional functional Assembly Kernels for Cortex-A57
|
||||
* [2016-03-14] Optimize Dgemm 4x4 for Cortex-A57
|
||||
|
||||
* [Your name or handle] <[email or website]>
|
||||
* [Date] [Brief summary of your changes]
|
||||
* theoractice <https://github.com/theoractice/>
|
||||
* [2016-03-20] Fix compiler error in VisualStudio with CMake
|
||||
* [2016-03-22] Fix access violation on Windows while static linking
|
||||
|
|
|
|||
|
|
@ -1,4 +1,22 @@
|
|||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.2.18
|
||||
12-Apr-2016
|
||||
common:
|
||||
* If you set the MAKE_NB_JOBS flag to a value less than or equal to zero,
|
||||
make will run without -j.
|
||||
|
||||
x86/x86_64:
|
||||
* Support building Visual Studio static library. (#813, Thanks, theoractice)
|
||||
* Fix bugs to pass buildbot CI tests (http://build.openblas.net)
|
||||
|
||||
ARM:
|
||||
* Provide DGEMM 8x4 kernel for Cortex-A57 (Thanks, Ashwin Sekhar T K)
|
||||
|
||||
POWER:
|
||||
* Optimize S and C BLAS3 on Power8
|
||||
* Optimize BLAS2/1 on Power8
|
||||
|
||||
====================================================================
|
||||
Version 0.2.17
|
||||
20-Mar-2016
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@
|
|||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.2.17
|
||||
VERSION = 0.2.18
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||
|
|
@ -112,7 +112,10 @@ NO_AFFINITY = 1
|
|||
# NO_PARALLEL_MAKE = 1
|
||||
|
||||
# Force number of make jobs. The default is the number of logical CPU of the host.
|
||||
# This is particularly useful when using distcc
|
||||
# This is particularly useful when using distcc.
|
||||
# A zero or negative value will disable adding a -j flag to make, allowing use of a parent
|
||||
# make -j value. This is useful to call OpenBLAS make from another project's
|
||||
# makefile
|
||||
# MAKE_NB_JOBS = 2
|
||||
|
||||
# If you would like to know minute performance report of GotoBLAS.
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
version: 0.2.15.{build}
|
||||
version: 0.2.18.{build}
|
||||
|
||||
#environment:
|
||||
|
||||
|
|
|
|||
|
|
@ -33,6 +33,10 @@ LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread
|
|||
# Apple vecLib
|
||||
LIBVECLIB = -framework Accelerate
|
||||
|
||||
ESSL=/opt/ibm/lib
|
||||
#LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
|
||||
LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
|
||||
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
|
||||
goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
||||
|
|
@ -44,6 +48,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
|||
ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \
|
||||
sger.goto dger.goto cger.goto zger.goto \
|
||||
sdot.goto ddot.goto \
|
||||
srot.goto drot.goto \
|
||||
saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \
|
||||
scopy.goto dcopy.goto ccopy.goto zcopy.goto \
|
||||
sswap.goto dswap.goto cswap.goto zswap.goto \
|
||||
|
|
@ -151,6 +156,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
|||
ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \
|
||||
sger.goto dger.goto cger.goto zger.goto \
|
||||
sdot.goto ddot.goto cdot.goto zdot.goto \
|
||||
srot.goto drot.goto \
|
||||
saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \
|
||||
scopy.goto dcopy.goto ccopy.goto zcopy.goto \
|
||||
sswap.goto dswap.goto cswap.goto zswap.goto \
|
||||
|
|
@ -253,7 +259,9 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
|
|||
|
||||
endif
|
||||
|
||||
|
||||
essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \
|
||||
cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \
|
||||
slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl
|
||||
|
||||
veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \
|
||||
scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \
|
||||
|
|
@ -306,6 +314,9 @@ slinpack.mkl : slinpack.$(SUFFIX)
|
|||
slinpack.veclib : slinpack.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
slinpack.essl : slinpack.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Dlinpack ####################################################
|
||||
dlinpack.goto : dlinpack.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
|
@ -322,6 +333,9 @@ dlinpack.mkl : dlinpack.$(SUFFIX)
|
|||
dlinpack.veclib : dlinpack.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
dlinpack.essl : dlinpack.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Clinpack ####################################################
|
||||
|
||||
clinpack.goto : clinpack.$(SUFFIX) ../$(LIBNAME)
|
||||
|
|
@ -339,6 +353,9 @@ clinpack.mkl : clinpack.$(SUFFIX)
|
|||
clinpack.veclib : clinpack.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
clinpack.essl : clinpack.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Zlinpack ####################################################
|
||||
|
||||
zlinpack.goto : zlinpack.$(SUFFIX) ../$(LIBNAME)
|
||||
|
|
@ -356,6 +373,9 @@ zlinpack.mkl : zlinpack.$(SUFFIX)
|
|||
zlinpack.veclib : zlinpack.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
zlinpack.essl : zlinpack.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Scholesky ###################################################
|
||||
|
||||
scholesky.goto : scholesky.$(SUFFIX) ../$(LIBNAME)
|
||||
|
|
@ -441,6 +461,9 @@ sgemm.mkl : sgemm.$(SUFFIX)
|
|||
sgemm.veclib : sgemm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
sgemm.essl : sgemm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Dgemm ####################################################
|
||||
dgemm.goto : dgemm.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
|
@ -457,6 +480,9 @@ dgemm.mkl : dgemm.$(SUFFIX)
|
|||
dgemm.veclib : dgemm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
dgemm.essl : dgemm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Cgemm ####################################################
|
||||
|
||||
cgemm.goto : cgemm.$(SUFFIX) ../$(LIBNAME)
|
||||
|
|
@ -474,6 +500,9 @@ cgemm.mkl : cgemm.$(SUFFIX)
|
|||
cgemm.veclib : cgemm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
cgemm.essl : cgemm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Zgemm ####################################################
|
||||
|
||||
zgemm.goto : zgemm.$(SUFFIX) ../$(LIBNAME)
|
||||
|
|
@ -491,6 +520,9 @@ zgemm.mkl : zgemm.$(SUFFIX)
|
|||
zgemm.veclib : zgemm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
zgemm.essl : zgemm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Ssymm ####################################################
|
||||
ssymm.goto : ssymm.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
|
@ -573,6 +605,9 @@ strmm.mkl : strmm.$(SUFFIX)
|
|||
strmm.veclib : strmm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
strmm.essl : strmm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Dtrmm ####################################################
|
||||
dtrmm.goto : dtrmm.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
|
@ -589,6 +624,9 @@ dtrmm.mkl : dtrmm.$(SUFFIX)
|
|||
dtrmm.veclib : dtrmm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
dtrmm.essl : dtrmm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Ctrmm ####################################################
|
||||
|
||||
ctrmm.goto : ctrmm.$(SUFFIX) ../$(LIBNAME)
|
||||
|
|
@ -606,6 +644,9 @@ ctrmm.mkl : ctrmm.$(SUFFIX)
|
|||
ctrmm.veclib : ctrmm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
ctrmm.essl : ctrmm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Ztrmm ####################################################
|
||||
|
||||
ztrmm.goto : ztrmm.$(SUFFIX) ../$(LIBNAME)
|
||||
|
|
@ -623,6 +664,9 @@ ztrmm.mkl : ztrmm.$(SUFFIX)
|
|||
ztrmm.veclib : ztrmm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
ztrmm.essl : ztrmm.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Strsm ####################################################
|
||||
strsm.goto : strsm.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
|
@ -1413,6 +1457,39 @@ zdot.mkl : zdot-intel.$(SUFFIX)
|
|||
zdot.veclib : zdot-intel.$(SUFFIX)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Srot ####################################################
|
||||
srot.goto : srot.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
||||
srot.acml : srot.$(SUFFIX)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
srot.atlas : srot.$(SUFFIX)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
srot.mkl : srot.$(SUFFIX)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
srot.veclib : srot.$(SUFFIX)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Drot ####################################################
|
||||
drot.goto : drot.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
||||
drot.acml : drot.$(SUFFIX)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
drot.atlas : drot.$(SUFFIX)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
drot.mkl : drot.$(SUFFIX)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
drot.veclib : drot.$(SUFFIX)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
|
||||
##################################### Saxpy ####################################################
|
||||
saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
|
@ -2124,6 +2201,13 @@ cgesv.$(SUFFIX) : gesv.c
|
|||
zgesv.$(SUFFIX) : gesv.c
|
||||
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
|
||||
|
||||
srot.$(SUFFIX) : rot.c
|
||||
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
|
||||
|
||||
drot.$(SUFFIX) : rot.c
|
||||
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -2137,7 +2221,7 @@ smallscaling: smallscaling.c ../$(LIBNAME)
|
|||
$(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm
|
||||
|
||||
clean ::
|
||||
@rm -f *.goto *.mkl *.acml *.atlas *.veclib
|
||||
@rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl
|
||||
|
||||
include $(TOPDIR)/Makefile.tail
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,197 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#ifdef __CYGWIN32__
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include "common.h"
|
||||
|
||||
|
||||
/* Clear any earlier definition of ROT before (re)defining it below. */
#undef ROT

/* ROT expands to the double-precision routine (drot) when DOUBLE is defined
   and to the single-precision routine (srot) otherwise. */
#ifdef DOUBLE
#define ROT BLASFUNC(drot)
#else
#define ROT BLASFUNC(srot)
#endif
|
||||
|
||||
|
||||
#if defined(__WIN32__) || defined(__WIN64__)
|
||||
|
||||
#ifndef DELTA_EPOCH_IN_MICROSECS
|
||||
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
|
||||
#endif
|
||||
|
||||
int gettimeofday(struct timeval *tv, void *tz){
|
||||
|
||||
FILETIME ft;
|
||||
unsigned __int64 tmpres = 0;
|
||||
static int tzflag;
|
||||
|
||||
if (NULL != tv)
|
||||
{
|
||||
GetSystemTimeAsFileTime(&ft);
|
||||
|
||||
tmpres |= ft.dwHighDateTime;
|
||||
tmpres <<= 32;
|
||||
tmpres |= ft.dwLowDateTime;
|
||||
|
||||
/*converting file time to unix epoch*/
|
||||
tmpres /= 10; /*convert into microseconds*/
|
||||
tmpres -= DELTA_EPOCH_IN_MICROSECS;
|
||||
tv->tv_sec = (long)(tmpres / 1000000UL);
|
||||
tv->tv_usec = (long)(tmpres % 1000000UL);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
|
||||
|
||||
/*
 * Allocate memory from a private System V shared-memory segment requested
 * with SHM_HUGETLB.
 *
 * The segment length is (size + HUGE_PAGESIZE) masked down to a multiple of
 * HUGE_PAGESIZE. The segment is marked for removal (IPC_RMID) immediately
 * after it has been attached.
 *
 * Returns the attached address. If shmget() or shmat() fails, a message is
 * printed to stdout and the process exits with status 1.
 */
static void *huge_malloc(BLASLONG size){

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

  void *mapped;
  int segment;

  segment = shmget(IPC_PRIVATE,
                   (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                   SHM_HUGETLB | IPC_CREAT | 0600);
  if (segment < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mapped = shmat(segment, NULL, SHM_RND);
  if ((BLASLONG)mapped == -1) {
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  shmctl(segment, IPC_RMID, 0);

  return mapped;
}
|
||||
|
||||
#define malloc huge_malloc
|
||||
|
||||
#endif
|
||||
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *x, *y;
|
||||
// FLOAT result;
|
||||
blasint m, i;
|
||||
blasint inc_x=1,inc_y=1;
|
||||
FLOAT c[1] = { 2.0 };
|
||||
FLOAT s[1] = { 2.0 };
|
||||
int loops = 1;
|
||||
int l;
|
||||
char *p;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
|
||||
struct timeval start, stop;
|
||||
double time1,timeg;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
|
||||
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
|
||||
|
||||
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
#ifdef linux
|
||||
srandom(getpid());
|
||||
#endif
|
||||
|
||||
fprintf(stderr, " SIZE Flops\n");
|
||||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
|
||||
timeg=0;
|
||||
|
||||
fprintf(stderr, " %6d : ", (int)m);
|
||||
|
||||
|
||||
for (l=0; l<loops; l++)
|
||||
{
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
|
||||
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
|
||||
ROT (&m, x, &inc_x, y, &inc_y, c, s);
|
||||
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
|
||||
timeg += time1;
|
||||
|
||||
}
|
||||
|
||||
timeg /= loops;
|
||||
|
||||
fprintf(stderr,
|
||||
" %10.2f MFlops\n",
|
||||
COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6);
|
||||
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
@ -798,7 +798,7 @@ Lmcount$lazy_ptr:
|
|||
#elif defined(PPC440FP2)
|
||||
#define BUFFER_SIZE ( 16 << 20)
|
||||
#elif defined(POWER8)
|
||||
#define BUFFER_SIZE ( 64 << 20)
|
||||
#define BUFFER_SIZE ( 32 << 20)
|
||||
#else
|
||||
#define BUFFER_SIZE ( 16 << 20)
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -62,7 +62,7 @@ static void __inline blas_lock(volatile BLASULONG *address){
|
|||
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
// use intrinsic instead of inline assembly
|
||||
ret = _InterlockedExchange(address, 1);
|
||||
ret = _InterlockedExchange((volatile LONG *)address, 1);
|
||||
// inline assembly
|
||||
/*__asm {
|
||||
mov eax, address
|
||||
|
|
|
|||
|
|
@ -1452,6 +1452,31 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser
|
|||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/*
|
||||
This is to allow static linking.
|
||||
Code adapted from Google performance tools:
|
||||
https://gperftools.googlecode.com/git-history/perftools-1.0/src/windows/port.cc
|
||||
Reference:
|
||||
https://sourceware.org/ml/pthreads-win32/2008/msg00028.html
|
||||
http://ci.boost.org/svn-trac/browser/trunk/libs/thread/src/win32/tss_pe.cpp
|
||||
*/
|
||||
/* Termination hook: calls gotoblas_quit() and then reports 0. Its address is
   stored in the .CRT$XTU data segment further down. */
static int on_process_term(void)
{
  gotoblas_quit();

  return 0;
}
|
||||
#ifdef _WIN64
|
||||
#pragma comment(linker, "/INCLUDE:_tls_used")
|
||||
#else
|
||||
#pragma comment(linker, "/INCLUDE:__tls_used")
|
||||
#endif
|
||||
#pragma data_seg(push, old_seg)
|
||||
#pragma data_seg(".CRT$XLB")
|
||||
static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
|
||||
#pragma data_seg(".CRT$XTU")
|
||||
static int(*p_process_term)(void) = on_process_term;
|
||||
#pragma data_seg(pop, old_seg)
|
||||
#endif
|
||||
|
||||
#if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))
|
||||
|
|
|
|||
|
|
@ -1013,7 +1013,12 @@ int main(int argc, char *argv[]){
|
|||
#endif
|
||||
|
||||
#ifdef MAKE_NB_JOBS
|
||||
#if MAKE_NB_JOBS > 0
|
||||
printf("MAKE += -j %d\n", MAKE_NB_JOBS);
|
||||
#else
|
||||
// Let make use parent -j argument or -j1 if there
|
||||
// is no make parent
|
||||
#endif
|
||||
#elif NO_PARALLEL_MAKE==1
|
||||
printf("MAKE += -j 1\n");
|
||||
#else
|
||||
|
|
|
|||
|
|
@ -64,10 +64,13 @@ int main(int argc, char **argv) {
|
|||
|
||||
|
||||
if ((argc >= 2) && (*argv[1] == '1')) {
|
||||
|
||||
#if defined(ARCH_X86) || defined(ARCH_X86_64)
|
||||
printf("#define SLOCAL_BUFFER_SIZE\t%ld\n", (SGEMM_DEFAULT_Q * SGEMM_DEFAULT_UNROLL_N * 4 * 1 * sizeof(float)));
|
||||
printf("#define DLOCAL_BUFFER_SIZE\t%ld\n", (DGEMM_DEFAULT_Q * DGEMM_DEFAULT_UNROLL_N * 2 * 1 * sizeof(double)));
|
||||
printf("#define CLOCAL_BUFFER_SIZE\t%ld\n", (CGEMM_DEFAULT_Q * CGEMM_DEFAULT_UNROLL_N * 4 * 2 * sizeof(float)));
|
||||
printf("#define ZLOCAL_BUFFER_SIZE\t%ld\n", (ZGEMM_DEFAULT_Q * ZGEMM_DEFAULT_UNROLL_N * 2 * 2 * sizeof(double)));
|
||||
#endif
|
||||
|
||||
#ifdef USE64BITINT
|
||||
printf("#define USE64BITINT\n");
|
||||
|
|
|
|||
|
|
@ -179,93 +179,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v2.4s, v3.4s}, [ppA]
|
||||
add ppA, ppA, #32
|
||||
|
||||
fmul v16.4s, v0.4s, v8.4s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.4s[0]
|
||||
fmul v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.s[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v17.16b, v17.16b, v17.16b
|
||||
fmls v17.4s, v0.4s, v9.4s[0]
|
||||
fmls v17.4s, v0.4s, v9.s[0]
|
||||
#else
|
||||
fmul v17.4s, v0.4s, v9.4s[0]
|
||||
fmul v17.4s, v0.4s, v9.s[0]
|
||||
#endif
|
||||
OP_ir v17.4s, v1.4s, v8.4s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
fmul v20.4s, v0.4s, v8.4s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.4s[1]
|
||||
fmul v20.4s, v0.4s, v8.s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.s[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v21.16b, v21.16b, v21.16b
|
||||
fmls v21.4s, v0.4s, v9.4s[1]
|
||||
fmls v21.4s, v0.4s, v9.s[1]
|
||||
#else
|
||||
fmul v21.4s, v0.4s, v9.4s[1]
|
||||
fmul v21.4s, v0.4s, v9.s[1]
|
||||
#endif
|
||||
OP_ir v21.4s, v1.4s, v8.4s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.s[1]
|
||||
|
||||
fmul v24.4s, v0.4s, v8.4s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.4s[2]
|
||||
fmul v24.4s, v0.4s, v8.s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.s[2]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fmls v25.4s, v0.4s, v9.4s[2]
|
||||
fmls v25.4s, v0.4s, v9.s[2]
|
||||
#else
|
||||
fmul v25.4s, v0.4s, v9.4s[2]
|
||||
fmul v25.4s, v0.4s, v9.s[2]
|
||||
#endif
|
||||
OP_ir v25.4s, v1.4s, v8.4s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.s[2]
|
||||
|
||||
fmul v28.4s, v0.4s, v8.4s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.4s[3]
|
||||
fmul v28.4s, v0.4s, v8.s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.s[3]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v29.16b, v29.16b, v29.16b
|
||||
fmls v29.4s, v0.4s, v9.4s[3]
|
||||
fmls v29.4s, v0.4s, v9.s[3]
|
||||
#else
|
||||
fmul v29.4s, v0.4s, v9.4s[3]
|
||||
fmul v29.4s, v0.4s, v9.s[3]
|
||||
#endif
|
||||
OP_ir v29.4s, v1.4s, v8.4s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.s[3]
|
||||
|
||||
fmul v18.4s, v2.4s, v8.4s[0]
|
||||
OP_ii v18.4s, v3.4s, v9.4s[0]
|
||||
fmul v18.4s, v2.4s, v8.s[0]
|
||||
OP_ii v18.4s, v3.4s, v9.s[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v19.16b, v19.16b, v19.16b
|
||||
fmls v19.4s, v2.4s, v9.4s[0]
|
||||
fmls v19.4s, v2.4s, v9.s[0]
|
||||
#else
|
||||
fmul v19.4s, v2.4s, v9.4s[0]
|
||||
fmul v19.4s, v2.4s, v9.s[0]
|
||||
#endif
|
||||
OP_ir v19.4s, v3.4s, v8.4s[0]
|
||||
OP_ir v19.4s, v3.4s, v8.s[0]
|
||||
|
||||
fmul v22.4s, v2.4s, v8.4s[1]
|
||||
OP_ii v22.4s, v3.4s, v9.4s[1]
|
||||
fmul v22.4s, v2.4s, v8.s[1]
|
||||
OP_ii v22.4s, v3.4s, v9.s[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v23.16b, v23.16b, v23.16b
|
||||
fmls v23.4s, v2.4s, v9.4s[1]
|
||||
fmls v23.4s, v2.4s, v9.s[1]
|
||||
#else
|
||||
fmul v23.4s, v2.4s, v9.4s[1]
|
||||
fmul v23.4s, v2.4s, v9.s[1]
|
||||
#endif
|
||||
OP_ir v23.4s, v3.4s, v8.4s[1]
|
||||
OP_ir v23.4s, v3.4s, v8.s[1]
|
||||
|
||||
fmul v26.4s, v2.4s, v8.4s[2]
|
||||
OP_ii v26.4s, v3.4s, v9.4s[2]
|
||||
fmul v26.4s, v2.4s, v8.s[2]
|
||||
OP_ii v26.4s, v3.4s, v9.s[2]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v27.16b, v27.16b, v27.16b
|
||||
fmls v27.4s, v2.4s, v9.4s[2]
|
||||
fmls v27.4s, v2.4s, v9.s[2]
|
||||
#else
|
||||
fmul v27.4s, v2.4s, v9.4s[2]
|
||||
fmul v27.4s, v2.4s, v9.s[2]
|
||||
#endif
|
||||
OP_ir v27.4s, v3.4s, v8.4s[2]
|
||||
OP_ir v27.4s, v3.4s, v8.s[2]
|
||||
|
||||
fmul v30.4s, v2.4s, v8.4s[3]
|
||||
OP_ii v30.4s, v3.4s, v9.4s[3]
|
||||
fmul v30.4s, v2.4s, v8.s[3]
|
||||
OP_ii v30.4s, v3.4s, v9.s[3]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v31.16b, v31.16b, v31.16b
|
||||
fmls v31.4s, v2.4s, v9.4s[3]
|
||||
fmls v31.4s, v2.4s, v9.s[3]
|
||||
#else
|
||||
fmul v31.4s, v2.4s, v9.4s[3]
|
||||
fmul v31.4s, v2.4s, v9.s[3]
|
||||
#endif
|
||||
OP_ir v31.4s, v3.4s, v8.4s[3]
|
||||
OP_ir v31.4s, v3.4s, v8.s[3]
|
||||
|
||||
ld2 {v12.4s, v13.4s}, [pB]
|
||||
add pB, pB, #32
|
||||
|
|
@ -276,159 +276,159 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL8x4_M1
|
||||
OP_rr v16.4s, v0.4s, v8.4s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.4s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.4s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.4s[0]
|
||||
OP_rr v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
ld2 {v12.4s, v13.4s}, [pB] // for next round
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v20.4s, v0.4s, v8.4s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.4s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.4s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.4s[1]
|
||||
OP_rr v20.4s, v0.4s, v8.s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.s[1]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
|
||||
OP_rr v24.4s, v0.4s, v8.4s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.4s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.4s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.4s[2]
|
||||
OP_rr v24.4s, v0.4s, v8.s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.s[2]
|
||||
|
||||
ld2 {v4.4s, v5.4s} , [pA] // for next round
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v28.4s, v0.4s, v8.4s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.4s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.4s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.4s[3]
|
||||
OP_rr v28.4s, v0.4s, v8.s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.s[3]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
|
||||
OP_rr v18.4s, v2.4s, v8.4s[0]
|
||||
OP_ii v18.4s, v3.4s, v9.4s[0]
|
||||
OP_ri v19.4s, v2.4s, v9.4s[0]
|
||||
OP_ir v19.4s, v3.4s, v8.4s[0]
|
||||
OP_rr v18.4s, v2.4s, v8.s[0]
|
||||
OP_ii v18.4s, v3.4s, v9.s[0]
|
||||
OP_ri v19.4s, v2.4s, v9.s[0]
|
||||
OP_ir v19.4s, v3.4s, v8.s[0]
|
||||
|
||||
ld2 {v6.4s, v7.4s} , [ppA] // for next round
|
||||
add ppA, ppA, #32
|
||||
|
||||
OP_rr v22.4s, v2.4s, v8.4s[1]
|
||||
OP_ii v22.4s, v3.4s, v9.4s[1]
|
||||
OP_ri v23.4s, v2.4s, v9.4s[1]
|
||||
OP_ir v23.4s, v3.4s, v8.4s[1]
|
||||
OP_rr v22.4s, v2.4s, v8.s[1]
|
||||
OP_ii v22.4s, v3.4s, v9.s[1]
|
||||
OP_ri v23.4s, v2.4s, v9.s[1]
|
||||
OP_ir v23.4s, v3.4s, v8.s[1]
|
||||
|
||||
prfm PLDL1KEEP, [ppA, #512]
|
||||
|
||||
OP_rr v26.4s, v2.4s, v8.4s[2]
|
||||
OP_ii v26.4s, v3.4s, v9.4s[2]
|
||||
OP_ri v27.4s, v2.4s, v9.4s[2]
|
||||
OP_ir v27.4s, v3.4s, v8.4s[2]
|
||||
OP_rr v26.4s, v2.4s, v8.s[2]
|
||||
OP_ii v26.4s, v3.4s, v9.s[2]
|
||||
OP_ri v27.4s, v2.4s, v9.s[2]
|
||||
OP_ir v27.4s, v3.4s, v8.s[2]
|
||||
|
||||
OP_rr v30.4s, v2.4s, v8.4s[3]
|
||||
OP_ii v30.4s, v3.4s, v9.4s[3]
|
||||
OP_ri v31.4s, v2.4s, v9.4s[3]
|
||||
OP_ir v31.4s, v3.4s, v8.4s[3]
|
||||
OP_rr v30.4s, v2.4s, v8.s[3]
|
||||
OP_ii v30.4s, v3.4s, v9.s[3]
|
||||
OP_ri v31.4s, v2.4s, v9.s[3]
|
||||
OP_ir v31.4s, v3.4s, v8.s[3]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_M2
|
||||
OP_rr v16.4s, v4.4s, v12.4s[0]
|
||||
OP_ii v16.4s, v5.4s, v13.4s[0]
|
||||
OP_ri v17.4s, v4.4s, v13.4s[0]
|
||||
OP_ir v17.4s, v5.4s, v12.4s[0]
|
||||
OP_rr v16.4s, v4.4s, v12.s[0]
|
||||
OP_ii v16.4s, v5.4s, v13.s[0]
|
||||
OP_ri v17.4s, v4.4s, v13.s[0]
|
||||
OP_ir v17.4s, v5.4s, v12.s[0]
|
||||
|
||||
ld2 {v8.4s, v9.4s}, [pB] // for next round
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v20.4s, v4.4s, v12.4s[1]
|
||||
OP_ii v20.4s, v5.4s, v13.4s[1]
|
||||
OP_ri v21.4s, v4.4s, v13.4s[1]
|
||||
OP_ir v21.4s, v5.4s, v12.4s[1]
|
||||
OP_rr v20.4s, v4.4s, v12.s[1]
|
||||
OP_ii v20.4s, v5.4s, v13.s[1]
|
||||
OP_ri v21.4s, v4.4s, v13.s[1]
|
||||
OP_ir v21.4s, v5.4s, v12.s[1]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
|
||||
OP_rr v24.4s, v4.4s, v12.4s[2]
|
||||
OP_ii v24.4s, v5.4s, v13.4s[2]
|
||||
OP_ri v25.4s, v4.4s, v13.4s[2]
|
||||
OP_ir v25.4s, v5.4s, v12.4s[2]
|
||||
OP_rr v24.4s, v4.4s, v12.s[2]
|
||||
OP_ii v24.4s, v5.4s, v13.s[2]
|
||||
OP_ri v25.4s, v4.4s, v13.s[2]
|
||||
OP_ir v25.4s, v5.4s, v12.s[2]
|
||||
|
||||
ld2 {v0.4s, v1.4s}, [pA] // for next round
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v28.4s, v4.4s, v12.4s[3]
|
||||
OP_ii v28.4s, v5.4s, v13.4s[3]
|
||||
OP_ri v29.4s, v4.4s, v13.4s[3]
|
||||
OP_ir v29.4s, v5.4s, v12.4s[3]
|
||||
OP_rr v28.4s, v4.4s, v12.s[3]
|
||||
OP_ii v28.4s, v5.4s, v13.s[3]
|
||||
OP_ri v29.4s, v4.4s, v13.s[3]
|
||||
OP_ir v29.4s, v5.4s, v12.s[3]
|
||||
|
||||
prfm PLDL1KEEP, [ppA, #512]
|
||||
|
||||
OP_rr v18.4s, v6.4s, v12.4s[0]
|
||||
OP_ii v18.4s, v7.4s, v13.4s[0]
|
||||
OP_ri v19.4s, v6.4s, v13.4s[0]
|
||||
OP_ir v19.4s, v7.4s, v12.4s[0]
|
||||
OP_rr v18.4s, v6.4s, v12.s[0]
|
||||
OP_ii v18.4s, v7.4s, v13.s[0]
|
||||
OP_ri v19.4s, v6.4s, v13.s[0]
|
||||
OP_ir v19.4s, v7.4s, v12.s[0]
|
||||
|
||||
ld2 {v2.4s, v3.4s}, [ppA] // for next round
|
||||
add ppA, ppA, #32
|
||||
|
||||
OP_rr v22.4s, v6.4s, v12.4s[1]
|
||||
OP_ii v22.4s, v7.4s, v13.4s[1]
|
||||
OP_ri v23.4s, v6.4s, v13.4s[1]
|
||||
OP_ir v23.4s, v7.4s, v12.4s[1]
|
||||
OP_rr v22.4s, v6.4s, v12.s[1]
|
||||
OP_ii v22.4s, v7.4s, v13.s[1]
|
||||
OP_ri v23.4s, v6.4s, v13.s[1]
|
||||
OP_ir v23.4s, v7.4s, v12.s[1]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
|
||||
OP_rr v26.4s, v6.4s, v12.4s[2]
|
||||
OP_ii v26.4s, v7.4s, v13.4s[2]
|
||||
OP_ri v27.4s, v6.4s, v13.4s[2]
|
||||
OP_ir v27.4s, v7.4s, v12.4s[2]
|
||||
OP_rr v26.4s, v6.4s, v12.s[2]
|
||||
OP_ii v26.4s, v7.4s, v13.s[2]
|
||||
OP_ri v27.4s, v6.4s, v13.s[2]
|
||||
OP_ir v27.4s, v7.4s, v12.s[2]
|
||||
|
||||
OP_rr v30.4s, v6.4s, v12.4s[3]
|
||||
OP_ii v30.4s, v7.4s, v13.4s[3]
|
||||
OP_ri v31.4s, v6.4s, v13.4s[3]
|
||||
OP_ir v31.4s, v7.4s, v12.4s[3]
|
||||
OP_rr v30.4s, v6.4s, v12.s[3]
|
||||
OP_ii v30.4s, v7.4s, v13.s[3]
|
||||
OP_ri v31.4s, v6.4s, v13.s[3]
|
||||
OP_ir v31.4s, v7.4s, v12.s[3]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_E
|
||||
OP_rr v16.4s, v4.4s, v12.4s[0]
|
||||
OP_ii v16.4s, v5.4s, v13.4s[0]
|
||||
OP_ri v17.4s, v4.4s, v13.4s[0]
|
||||
OP_ir v17.4s, v5.4s, v12.4s[0]
|
||||
OP_rr v16.4s, v4.4s, v12.s[0]
|
||||
OP_ii v16.4s, v5.4s, v13.s[0]
|
||||
OP_ri v17.4s, v4.4s, v13.s[0]
|
||||
OP_ir v17.4s, v5.4s, v12.s[0]
|
||||
|
||||
OP_rr v20.4s, v4.4s, v12.4s[1]
|
||||
OP_ii v20.4s, v5.4s, v13.4s[1]
|
||||
OP_ri v21.4s, v4.4s, v13.4s[1]
|
||||
OP_ir v21.4s, v5.4s, v12.4s[1]
|
||||
OP_rr v20.4s, v4.4s, v12.s[1]
|
||||
OP_ii v20.4s, v5.4s, v13.s[1]
|
||||
OP_ri v21.4s, v4.4s, v13.s[1]
|
||||
OP_ir v21.4s, v5.4s, v12.s[1]
|
||||
|
||||
OP_rr v24.4s, v4.4s, v12.4s[2]
|
||||
OP_ii v24.4s, v5.4s, v13.4s[2]
|
||||
OP_ri v25.4s, v4.4s, v13.4s[2]
|
||||
OP_ir v25.4s, v5.4s, v12.4s[2]
|
||||
OP_rr v24.4s, v4.4s, v12.s[2]
|
||||
OP_ii v24.4s, v5.4s, v13.s[2]
|
||||
OP_ri v25.4s, v4.4s, v13.s[2]
|
||||
OP_ir v25.4s, v5.4s, v12.s[2]
|
||||
|
||||
OP_rr v28.4s, v4.4s, v12.4s[3]
|
||||
OP_ii v28.4s, v5.4s, v13.4s[3]
|
||||
OP_ri v29.4s, v4.4s, v13.4s[3]
|
||||
OP_ir v29.4s, v5.4s, v12.4s[3]
|
||||
OP_rr v28.4s, v4.4s, v12.s[3]
|
||||
OP_ii v28.4s, v5.4s, v13.s[3]
|
||||
OP_ri v29.4s, v4.4s, v13.s[3]
|
||||
OP_ir v29.4s, v5.4s, v12.s[3]
|
||||
|
||||
OP_rr v18.4s, v6.4s, v12.4s[0]
|
||||
OP_ii v18.4s, v7.4s, v13.4s[0]
|
||||
OP_ri v19.4s, v6.4s, v13.4s[0]
|
||||
OP_ir v19.4s, v7.4s, v12.4s[0]
|
||||
OP_rr v18.4s, v6.4s, v12.s[0]
|
||||
OP_ii v18.4s, v7.4s, v13.s[0]
|
||||
OP_ri v19.4s, v6.4s, v13.s[0]
|
||||
OP_ir v19.4s, v7.4s, v12.s[0]
|
||||
|
||||
OP_rr v22.4s, v6.4s, v12.4s[1]
|
||||
OP_ii v22.4s, v7.4s, v13.4s[1]
|
||||
OP_ri v23.4s, v6.4s, v13.4s[1]
|
||||
OP_ir v23.4s, v7.4s, v12.4s[1]
|
||||
OP_rr v22.4s, v6.4s, v12.s[1]
|
||||
OP_ii v22.4s, v7.4s, v13.s[1]
|
||||
OP_ri v23.4s, v6.4s, v13.s[1]
|
||||
OP_ir v23.4s, v7.4s, v12.s[1]
|
||||
|
||||
OP_rr v26.4s, v6.4s, v12.4s[2]
|
||||
OP_ii v26.4s, v7.4s, v13.4s[2]
|
||||
OP_ri v27.4s, v6.4s, v13.4s[2]
|
||||
OP_ir v27.4s, v7.4s, v12.4s[2]
|
||||
OP_rr v26.4s, v6.4s, v12.s[2]
|
||||
OP_ii v26.4s, v7.4s, v13.s[2]
|
||||
OP_ri v27.4s, v6.4s, v13.s[2]
|
||||
OP_ir v27.4s, v7.4s, v12.s[2]
|
||||
|
||||
OP_rr v30.4s, v6.4s, v12.4s[3]
|
||||
OP_ii v30.4s, v7.4s, v13.4s[3]
|
||||
OP_ri v31.4s, v6.4s, v13.4s[3]
|
||||
OP_ir v31.4s, v7.4s, v12.4s[3]
|
||||
OP_rr v30.4s, v6.4s, v12.s[3]
|
||||
OP_ii v30.4s, v7.4s, v13.s[3]
|
||||
OP_ri v31.4s, v6.4s, v13.s[3]
|
||||
OP_ir v31.4s, v7.4s, v12.s[3]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_SUB
|
||||
|
|
@ -437,48 +437,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.4s, v1.4s}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.4s, v0.4s, v8.4s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.4s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.4s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.4s[0]
|
||||
OP_rr v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
OP_rr v20.4s, v0.4s, v8.4s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.4s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.4s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.4s[1]
|
||||
OP_rr v20.4s, v0.4s, v8.s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.s[1]
|
||||
|
||||
ld2 {v2.4s, v3.4s}, [ppA]
|
||||
add ppA, ppA, #32
|
||||
|
||||
OP_rr v24.4s, v0.4s, v8.4s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.4s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.4s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.4s[2]
|
||||
OP_rr v24.4s, v0.4s, v8.s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.s[2]
|
||||
|
||||
OP_rr v28.4s, v0.4s, v8.4s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.4s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.4s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.4s[3]
|
||||
OP_rr v28.4s, v0.4s, v8.s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.s[3]
|
||||
|
||||
OP_rr v18.4s, v2.4s, v8.4s[0]
|
||||
OP_ii v18.4s, v3.4s, v9.4s[0]
|
||||
OP_ri v19.4s, v2.4s, v9.4s[0]
|
||||
OP_ir v19.4s, v3.4s, v8.4s[0]
|
||||
OP_rr v18.4s, v2.4s, v8.s[0]
|
||||
OP_ii v18.4s, v3.4s, v9.s[0]
|
||||
OP_ri v19.4s, v2.4s, v9.s[0]
|
||||
OP_ir v19.4s, v3.4s, v8.s[0]
|
||||
|
||||
OP_rr v22.4s, v2.4s, v8.4s[1]
|
||||
OP_ii v22.4s, v3.4s, v9.4s[1]
|
||||
OP_ri v23.4s, v2.4s, v9.4s[1]
|
||||
OP_ir v23.4s, v3.4s, v8.4s[1]
|
||||
OP_rr v22.4s, v2.4s, v8.s[1]
|
||||
OP_ii v22.4s, v3.4s, v9.s[1]
|
||||
OP_ri v23.4s, v2.4s, v9.s[1]
|
||||
OP_ir v23.4s, v3.4s, v8.s[1]
|
||||
|
||||
OP_rr v26.4s, v2.4s, v8.4s[2]
|
||||
OP_ii v26.4s, v3.4s, v9.4s[2]
|
||||
OP_ri v27.4s, v2.4s, v9.4s[2]
|
||||
OP_ir v27.4s, v3.4s, v8.4s[2]
|
||||
OP_rr v26.4s, v2.4s, v8.s[2]
|
||||
OP_ii v26.4s, v3.4s, v9.s[2]
|
||||
OP_ri v27.4s, v2.4s, v9.s[2]
|
||||
OP_ir v27.4s, v3.4s, v8.s[2]
|
||||
|
||||
OP_rr v30.4s, v2.4s, v8.4s[3]
|
||||
OP_ii v30.4s, v3.4s, v9.4s[3]
|
||||
OP_ri v31.4s, v2.4s, v9.4s[3]
|
||||
OP_ir v31.4s, v3.4s, v8.4s[3]
|
||||
OP_rr v30.4s, v2.4s, v8.s[3]
|
||||
OP_ii v30.4s, v3.4s, v9.s[3]
|
||||
OP_ri v31.4s, v2.4s, v9.s[3]
|
||||
OP_ir v31.4s, v3.4s, v8.s[3]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x4
|
||||
|
|
@ -578,25 +578,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.4s, v1.4s}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.4s, v0.4s, v8.4s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.4s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.4s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.4s[0]
|
||||
OP_rr v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
OP_rr v20.4s, v0.4s, v8.4s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.4s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.4s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.4s[1]
|
||||
OP_rr v20.4s, v0.4s, v8.s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.s[1]
|
||||
|
||||
OP_rr v24.4s, v0.4s, v8.4s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.4s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.4s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.4s[2]
|
||||
OP_rr v24.4s, v0.4s, v8.s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.s[2]
|
||||
|
||||
OP_rr v28.4s, v0.4s, v8.4s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.4s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.4s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.4s[3]
|
||||
OP_rr v28.4s, v0.4s, v8.s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.s[3]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
|
|
@ -658,25 +658,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.2s, v1.2s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
OP_rr v16.2s, v0.2s, v8.4s[0]
|
||||
OP_ii v16.2s, v1.2s, v9.4s[0]
|
||||
OP_ri v17.2s, v0.2s, v9.4s[0]
|
||||
OP_ir v17.2s, v1.2s, v8.4s[0]
|
||||
OP_rr v16.2s, v0.2s, v8.s[0]
|
||||
OP_ii v16.2s, v1.2s, v9.s[0]
|
||||
OP_ri v17.2s, v0.2s, v9.s[0]
|
||||
OP_ir v17.2s, v1.2s, v8.s[0]
|
||||
|
||||
OP_rr v20.2s, v0.2s, v8.4s[1]
|
||||
OP_ii v20.2s, v1.2s, v9.4s[1]
|
||||
OP_ri v21.2s, v0.2s, v9.4s[1]
|
||||
OP_ir v21.2s, v1.2s, v8.4s[1]
|
||||
OP_rr v20.2s, v0.2s, v8.s[1]
|
||||
OP_ii v20.2s, v1.2s, v9.s[1]
|
||||
OP_ri v21.2s, v0.2s, v9.s[1]
|
||||
OP_ir v21.2s, v1.2s, v8.s[1]
|
||||
|
||||
OP_rr v24.2s, v0.2s, v8.4s[2]
|
||||
OP_ii v24.2s, v1.2s, v9.4s[2]
|
||||
OP_ri v25.2s, v0.2s, v9.4s[2]
|
||||
OP_ir v25.2s, v1.2s, v8.4s[2]
|
||||
OP_rr v24.2s, v0.2s, v8.s[2]
|
||||
OP_ii v24.2s, v1.2s, v9.s[2]
|
||||
OP_ri v25.2s, v0.2s, v9.s[2]
|
||||
OP_ir v25.2s, v1.2s, v8.s[2]
|
||||
|
||||
OP_rr v28.2s, v0.2s, v8.4s[3]
|
||||
OP_ii v28.2s, v1.2s, v9.4s[3]
|
||||
OP_ri v29.2s, v0.2s, v9.4s[3]
|
||||
OP_ir v29.2s, v1.2s, v8.4s[3]
|
||||
OP_rr v28.2s, v0.2s, v8.s[3]
|
||||
OP_ii v28.2s, v1.2s, v9.s[3]
|
||||
OP_ri v29.2s, v0.2s, v9.s[3]
|
||||
OP_ir v29.2s, v1.2s, v8.s[3]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
|
|
@ -738,25 +738,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.s, v1.s}[0], [pA]
|
||||
add pA, pA, #8
|
||||
|
||||
OP_rr s16, s0, v8.4s[0]
|
||||
OP_ii s16, s1, v9.4s[0]
|
||||
OP_ri s17, s0, v9.4s[0]
|
||||
OP_ir s17, s1, v8.4s[0]
|
||||
OP_rr s16, s0, v8.s[0]
|
||||
OP_ii s16, s1, v9.s[0]
|
||||
OP_ri s17, s0, v9.s[0]
|
||||
OP_ir s17, s1, v8.s[0]
|
||||
|
||||
OP_rr s20, s0, v8.4s[1]
|
||||
OP_ii s20, s1, v9.4s[1]
|
||||
OP_ri s21, s0, v9.4s[1]
|
||||
OP_ir s21, s1, v8.4s[1]
|
||||
OP_rr s20, s0, v8.s[1]
|
||||
OP_ii s20, s1, v9.s[1]
|
||||
OP_ri s21, s0, v9.s[1]
|
||||
OP_ir s21, s1, v8.s[1]
|
||||
|
||||
OP_rr s24, s0, v8.4s[2]
|
||||
OP_ii s24, s1, v9.4s[2]
|
||||
OP_ri s25, s0, v9.4s[2]
|
||||
OP_ir s25, s1, v8.4s[2]
|
||||
OP_rr s24, s0, v8.s[2]
|
||||
OP_ii s24, s1, v9.s[2]
|
||||
OP_ri s25, s0, v9.s[2]
|
||||
OP_ir s25, s1, v8.s[2]
|
||||
|
||||
OP_rr s28, s0, v8.4s[3]
|
||||
OP_ii s28, s1, v9.4s[3]
|
||||
OP_ri s29, s0, v9.4s[3]
|
||||
OP_ir s29, s1, v8.4s[3]
|
||||
OP_rr s28, s0, v8.s[3]
|
||||
OP_ii s28, s1, v9.s[3]
|
||||
OP_ri s29, s0, v9.s[3]
|
||||
OP_ir s29, s1, v8.s[3]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x4
|
||||
|
|
@ -814,15 +814,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.4s, v1.4s}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.4s, v0.4s, v8.2s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.2s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.2s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.2s[0]
|
||||
OP_rr v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
OP_rr v20.4s, v0.4s, v8.2s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.2s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.2s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.2s[1]
|
||||
OP_rr v20.4s, v0.4s, v8.s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
|
|
@ -862,15 +862,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.2s, v1.2s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
OP_rr v16.2s, v0.2s, v8.2s[0]
|
||||
OP_ii v16.2s, v1.2s, v9.2s[0]
|
||||
OP_ri v17.2s, v0.2s, v9.2s[0]
|
||||
OP_ir v17.2s, v1.2s, v8.2s[0]
|
||||
OP_rr v16.2s, v0.2s, v8.s[0]
|
||||
OP_ii v16.2s, v1.2s, v9.s[0]
|
||||
OP_ri v17.2s, v0.2s, v9.s[0]
|
||||
OP_ir v17.2s, v1.2s, v8.s[0]
|
||||
|
||||
OP_rr v20.2s, v0.2s, v8.2s[1]
|
||||
OP_ii v20.2s, v1.2s, v9.2s[1]
|
||||
OP_ri v21.2s, v0.2s, v9.2s[1]
|
||||
OP_ir v21.2s, v1.2s, v8.2s[1]
|
||||
OP_rr v20.2s, v0.2s, v8.s[1]
|
||||
OP_ii v20.2s, v1.2s, v9.s[1]
|
||||
OP_ri v21.2s, v0.2s, v9.s[1]
|
||||
OP_ir v21.2s, v1.2s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
|
|
@ -910,15 +910,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.s, v1.s}[0], [pA]
|
||||
add pA, pA, #8
|
||||
|
||||
OP_rr s16, s0, v8.2s[0]
|
||||
OP_ii s16, s1, v9.2s[0]
|
||||
OP_ri s17, s0, v9.2s[0]
|
||||
OP_ir s17, s1, v8.2s[0]
|
||||
OP_rr s16, s0, v8.s[0]
|
||||
OP_ii s16, s1, v9.s[0]
|
||||
OP_ri s17, s0, v9.s[0]
|
||||
OP_ir s17, s1, v8.s[0]
|
||||
|
||||
OP_rr s20, s0, v8.2s[1]
|
||||
OP_ii s20, s1, v9.2s[1]
|
||||
OP_ri s21, s0, v9.2s[1]
|
||||
OP_ir s21, s1, v8.2s[1]
|
||||
OP_rr s20, s0, v8.s[1]
|
||||
OP_ii s20, s1, v9.s[1]
|
||||
OP_ri s21, s0, v9.s[1]
|
||||
OP_ir s21, s1, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
|
|
|
|||
|
|
@ -178,93 +178,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v2.4s, v3.4s}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v16.4s, v0.4s, v8.4s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.4s[0]
|
||||
fmul v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.s[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v17.16b, v17.16b, v17.16b
|
||||
fmls v17.4s, v0.4s, v9.4s[0]
|
||||
fmls v17.4s, v0.4s, v9.s[0]
|
||||
#else
|
||||
fmul v17.4s, v0.4s, v9.4s[0]
|
||||
fmul v17.4s, v0.4s, v9.s[0]
|
||||
#endif
|
||||
OP_ir v17.4s, v1.4s, v8.4s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
fmul v18.4s, v2.4s, v8.4s[0]
|
||||
OP_ii v18.4s, v3.4s, v9.4s[0]
|
||||
fmul v18.4s, v2.4s, v8.s[0]
|
||||
OP_ii v18.4s, v3.4s, v9.s[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v19.16b, v19.16b, v19.16b
|
||||
fmls v19.4s, v2.4s, v9.4s[0]
|
||||
fmls v19.4s, v2.4s, v9.s[0]
|
||||
#else
|
||||
fmul v19.4s, v2.4s, v9.4s[0]
|
||||
fmul v19.4s, v2.4s, v9.s[0]
|
||||
#endif
|
||||
OP_ir v19.4s, v3.4s, v8.4s[0]
|
||||
OP_ir v19.4s, v3.4s, v8.s[0]
|
||||
|
||||
fmul v20.4s, v0.4s, v8.4s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.4s[1]
|
||||
fmul v20.4s, v0.4s, v8.s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.s[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v21.16b, v21.16b, v21.16b
|
||||
fmls v21.4s, v0.4s, v9.4s[1]
|
||||
fmls v21.4s, v0.4s, v9.s[1]
|
||||
#else
|
||||
fmul v21.4s, v0.4s, v9.4s[1]
|
||||
fmul v21.4s, v0.4s, v9.s[1]
|
||||
#endif
|
||||
OP_ir v21.4s, v1.4s, v8.4s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.s[1]
|
||||
|
||||
fmul v22.4s, v2.4s, v8.4s[1]
|
||||
OP_ii v22.4s, v3.4s, v9.4s[1]
|
||||
fmul v22.4s, v2.4s, v8.s[1]
|
||||
OP_ii v22.4s, v3.4s, v9.s[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v23.16b, v23.16b, v23.16b
|
||||
fmls v23.4s, v2.4s, v9.4s[1]
|
||||
fmls v23.4s, v2.4s, v9.s[1]
|
||||
#else
|
||||
fmul v23.4s, v2.4s, v9.4s[1]
|
||||
fmul v23.4s, v2.4s, v9.s[1]
|
||||
#endif
|
||||
OP_ir v23.4s, v3.4s, v8.4s[1]
|
||||
OP_ir v23.4s, v3.4s, v8.s[1]
|
||||
|
||||
fmul v24.4s, v0.4s, v8.4s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.4s[2]
|
||||
fmul v24.4s, v0.4s, v8.s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.s[2]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fmls v25.4s, v0.4s, v9.4s[2]
|
||||
fmls v25.4s, v0.4s, v9.s[2]
|
||||
#else
|
||||
fmul v25.4s, v0.4s, v9.4s[2]
|
||||
fmul v25.4s, v0.4s, v9.s[2]
|
||||
#endif
|
||||
OP_ir v25.4s, v1.4s, v8.4s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.s[2]
|
||||
|
||||
fmul v26.4s, v2.4s, v8.4s[2]
|
||||
OP_ii v26.4s, v3.4s, v9.4s[2]
|
||||
fmul v26.4s, v2.4s, v8.s[2]
|
||||
OP_ii v26.4s, v3.4s, v9.s[2]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v27.16b, v27.16b, v27.16b
|
||||
fmls v27.4s, v2.4s, v9.4s[2]
|
||||
fmls v27.4s, v2.4s, v9.s[2]
|
||||
#else
|
||||
fmul v27.4s, v2.4s, v9.4s[2]
|
||||
fmul v27.4s, v2.4s, v9.s[2]
|
||||
#endif
|
||||
OP_ir v27.4s, v3.4s, v8.4s[2]
|
||||
OP_ir v27.4s, v3.4s, v8.s[2]
|
||||
|
||||
fmul v28.4s, v0.4s, v8.4s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.4s[3]
|
||||
fmul v28.4s, v0.4s, v8.s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.s[3]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v29.16b, v29.16b, v29.16b
|
||||
fmls v29.4s, v0.4s, v9.4s[3]
|
||||
fmls v29.4s, v0.4s, v9.s[3]
|
||||
#else
|
||||
fmul v29.4s, v0.4s, v9.4s[3]
|
||||
fmul v29.4s, v0.4s, v9.s[3]
|
||||
#endif
|
||||
OP_ir v29.4s, v1.4s, v8.4s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.s[3]
|
||||
|
||||
fmul v30.4s, v2.4s, v8.4s[3]
|
||||
OP_ii v30.4s, v3.4s, v9.4s[3]
|
||||
fmul v30.4s, v2.4s, v8.s[3]
|
||||
OP_ii v30.4s, v3.4s, v9.s[3]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v31.16b, v31.16b, v31.16b
|
||||
fmls v31.4s, v2.4s, v9.4s[3]
|
||||
fmls v31.4s, v2.4s, v9.s[3]
|
||||
#else
|
||||
fmul v31.4s, v2.4s, v9.4s[3]
|
||||
fmul v31.4s, v2.4s, v9.s[3]
|
||||
#endif
|
||||
OP_ir v31.4s, v3.4s, v8.4s[3]
|
||||
OP_ir v31.4s, v3.4s, v8.s[3]
|
||||
|
||||
ld2 {v12.4s, v13.4s}, [pB]
|
||||
add pB, pB, #32
|
||||
|
|
@ -275,45 +275,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL8x4_M1
|
||||
OP_rr v16.4s, v0.4s, v8.4s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.4s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.4s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.4s[0]
|
||||
OP_rr v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
OP_rr v18.4s, v2.4s, v8.4s[0]
|
||||
OP_ii v18.4s, v3.4s, v9.4s[0]
|
||||
OP_ri v19.4s, v2.4s, v9.4s[0]
|
||||
OP_ir v19.4s, v3.4s, v8.4s[0]
|
||||
OP_rr v18.4s, v2.4s, v8.s[0]
|
||||
OP_ii v18.4s, v3.4s, v9.s[0]
|
||||
OP_ri v19.4s, v2.4s, v9.s[0]
|
||||
OP_ir v19.4s, v3.4s, v8.s[0]
|
||||
|
||||
OP_rr v20.4s, v0.4s, v8.4s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.4s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.4s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.4s[1]
|
||||
OP_rr v20.4s, v0.4s, v8.s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.s[1]
|
||||
|
||||
OP_rr v22.4s, v2.4s, v8.4s[1]
|
||||
OP_ii v22.4s, v3.4s, v9.4s[1]
|
||||
OP_ri v23.4s, v2.4s, v9.4s[1]
|
||||
OP_ir v23.4s, v3.4s, v8.4s[1]
|
||||
OP_rr v22.4s, v2.4s, v8.s[1]
|
||||
OP_ii v22.4s, v3.4s, v9.s[1]
|
||||
OP_ri v23.4s, v2.4s, v9.s[1]
|
||||
OP_ir v23.4s, v3.4s, v8.s[1]
|
||||
|
||||
OP_rr v24.4s, v0.4s, v8.4s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.4s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.4s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.4s[2]
|
||||
OP_rr v24.4s, v0.4s, v8.s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.s[2]
|
||||
|
||||
OP_rr v26.4s, v2.4s, v8.4s[2]
|
||||
OP_ii v26.4s, v3.4s, v9.4s[2]
|
||||
OP_ri v27.4s, v2.4s, v9.4s[2]
|
||||
OP_ir v27.4s, v3.4s, v8.4s[2]
|
||||
OP_rr v26.4s, v2.4s, v8.s[2]
|
||||
OP_ii v26.4s, v3.4s, v9.s[2]
|
||||
OP_ri v27.4s, v2.4s, v9.s[2]
|
||||
OP_ir v27.4s, v3.4s, v8.s[2]
|
||||
|
||||
OP_rr v28.4s, v0.4s, v8.4s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.4s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.4s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.4s[3]
|
||||
OP_rr v28.4s, v0.4s, v8.s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.s[3]
|
||||
|
||||
OP_rr v30.4s, v2.4s, v8.4s[3]
|
||||
OP_ii v30.4s, v3.4s, v9.4s[3]
|
||||
OP_ri v31.4s, v2.4s, v9.4s[3]
|
||||
OP_ir v31.4s, v3.4s, v8.4s[3]
|
||||
OP_rr v30.4s, v2.4s, v8.s[3]
|
||||
OP_ii v30.4s, v3.4s, v9.s[3]
|
||||
OP_ri v31.4s, v2.4s, v9.s[3]
|
||||
OP_ir v31.4s, v3.4s, v8.s[3]
|
||||
|
||||
ld2 {v12.4s, v13.4s}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
|
|
@ -324,45 +324,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL8x4_M2
|
||||
OP_rr v16.4s, v4.4s, v12.4s[0]
|
||||
OP_ii v16.4s, v5.4s, v13.4s[0]
|
||||
OP_ri v17.4s, v4.4s, v13.4s[0]
|
||||
OP_ir v17.4s, v5.4s, v12.4s[0]
|
||||
OP_rr v16.4s, v4.4s, v12.s[0]
|
||||
OP_ii v16.4s, v5.4s, v13.s[0]
|
||||
OP_ri v17.4s, v4.4s, v13.s[0]
|
||||
OP_ir v17.4s, v5.4s, v12.s[0]
|
||||
|
||||
OP_rr v18.4s, v6.4s, v12.4s[0]
|
||||
OP_ii v18.4s, v7.4s, v13.4s[0]
|
||||
OP_ri v19.4s, v6.4s, v13.4s[0]
|
||||
OP_ir v19.4s, v7.4s, v12.4s[0]
|
||||
OP_rr v18.4s, v6.4s, v12.s[0]
|
||||
OP_ii v18.4s, v7.4s, v13.s[0]
|
||||
OP_ri v19.4s, v6.4s, v13.s[0]
|
||||
OP_ir v19.4s, v7.4s, v12.s[0]
|
||||
|
||||
OP_rr v20.4s, v4.4s, v12.4s[1]
|
||||
OP_ii v20.4s, v5.4s, v13.4s[1]
|
||||
OP_ri v21.4s, v4.4s, v13.4s[1]
|
||||
OP_ir v21.4s, v5.4s, v12.4s[1]
|
||||
OP_rr v20.4s, v4.4s, v12.s[1]
|
||||
OP_ii v20.4s, v5.4s, v13.s[1]
|
||||
OP_ri v21.4s, v4.4s, v13.s[1]
|
||||
OP_ir v21.4s, v5.4s, v12.s[1]
|
||||
|
||||
OP_rr v22.4s, v6.4s, v12.4s[1]
|
||||
OP_ii v22.4s, v7.4s, v13.4s[1]
|
||||
OP_ri v23.4s, v6.4s, v13.4s[1]
|
||||
OP_ir v23.4s, v7.4s, v12.4s[1]
|
||||
OP_rr v22.4s, v6.4s, v12.s[1]
|
||||
OP_ii v22.4s, v7.4s, v13.s[1]
|
||||
OP_ri v23.4s, v6.4s, v13.s[1]
|
||||
OP_ir v23.4s, v7.4s, v12.s[1]
|
||||
|
||||
OP_rr v24.4s, v4.4s, v12.4s[2]
|
||||
OP_ii v24.4s, v5.4s, v13.4s[2]
|
||||
OP_ri v25.4s, v4.4s, v13.4s[2]
|
||||
OP_ir v25.4s, v5.4s, v12.4s[2]
|
||||
OP_rr v24.4s, v4.4s, v12.s[2]
|
||||
OP_ii v24.4s, v5.4s, v13.s[2]
|
||||
OP_ri v25.4s, v4.4s, v13.s[2]
|
||||
OP_ir v25.4s, v5.4s, v12.s[2]
|
||||
|
||||
OP_rr v26.4s, v6.4s, v12.4s[2]
|
||||
OP_ii v26.4s, v7.4s, v13.4s[2]
|
||||
OP_ri v27.4s, v6.4s, v13.4s[2]
|
||||
OP_ir v27.4s, v7.4s, v12.4s[2]
|
||||
OP_rr v26.4s, v6.4s, v12.s[2]
|
||||
OP_ii v26.4s, v7.4s, v13.s[2]
|
||||
OP_ri v27.4s, v6.4s, v13.s[2]
|
||||
OP_ir v27.4s, v7.4s, v12.s[2]
|
||||
|
||||
OP_rr v28.4s, v4.4s, v12.4s[3]
|
||||
OP_ii v28.4s, v5.4s, v13.4s[3]
|
||||
OP_ri v29.4s, v4.4s, v13.4s[3]
|
||||
OP_ir v29.4s, v5.4s, v12.4s[3]
|
||||
OP_rr v28.4s, v4.4s, v12.s[3]
|
||||
OP_ii v28.4s, v5.4s, v13.s[3]
|
||||
OP_ri v29.4s, v4.4s, v13.s[3]
|
||||
OP_ir v29.4s, v5.4s, v12.s[3]
|
||||
|
||||
OP_rr v30.4s, v6.4s, v12.4s[3]
|
||||
OP_ii v30.4s, v7.4s, v13.4s[3]
|
||||
OP_ri v31.4s, v6.4s, v13.4s[3]
|
||||
OP_ir v31.4s, v7.4s, v12.4s[3]
|
||||
OP_rr v30.4s, v6.4s, v12.s[3]
|
||||
OP_ii v30.4s, v7.4s, v13.s[3]
|
||||
OP_ri v31.4s, v6.4s, v13.s[3]
|
||||
OP_ir v31.4s, v7.4s, v12.s[3]
|
||||
|
||||
ld2 {v8.4s, v9.4s}, [pB]
|
||||
add pB, pB, #32
|
||||
|
|
@ -373,45 +373,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL8x4_E
|
||||
OP_rr v16.4s, v4.4s, v12.4s[0]
|
||||
OP_ii v16.4s, v5.4s, v13.4s[0]
|
||||
OP_ri v17.4s, v4.4s, v13.4s[0]
|
||||
OP_ir v17.4s, v5.4s, v12.4s[0]
|
||||
OP_rr v16.4s, v4.4s, v12.s[0]
|
||||
OP_ii v16.4s, v5.4s, v13.s[0]
|
||||
OP_ri v17.4s, v4.4s, v13.s[0]
|
||||
OP_ir v17.4s, v5.4s, v12.s[0]
|
||||
|
||||
OP_rr v18.4s, v6.4s, v12.4s[0]
|
||||
OP_ii v18.4s, v7.4s, v13.4s[0]
|
||||
OP_ri v19.4s, v6.4s, v13.4s[0]
|
||||
OP_ir v19.4s, v7.4s, v12.4s[0]
|
||||
OP_rr v18.4s, v6.4s, v12.s[0]
|
||||
OP_ii v18.4s, v7.4s, v13.s[0]
|
||||
OP_ri v19.4s, v6.4s, v13.s[0]
|
||||
OP_ir v19.4s, v7.4s, v12.s[0]
|
||||
|
||||
OP_rr v20.4s, v4.4s, v12.4s[1]
|
||||
OP_ii v20.4s, v5.4s, v13.4s[1]
|
||||
OP_ri v21.4s, v4.4s, v13.4s[1]
|
||||
OP_ir v21.4s, v5.4s, v12.4s[1]
|
||||
OP_rr v20.4s, v4.4s, v12.s[1]
|
||||
OP_ii v20.4s, v5.4s, v13.s[1]
|
||||
OP_ri v21.4s, v4.4s, v13.s[1]
|
||||
OP_ir v21.4s, v5.4s, v12.s[1]
|
||||
|
||||
OP_rr v22.4s, v6.4s, v12.4s[1]
|
||||
OP_ii v22.4s, v7.4s, v13.4s[1]
|
||||
OP_ri v23.4s, v6.4s, v13.4s[1]
|
||||
OP_ir v23.4s, v7.4s, v12.4s[1]
|
||||
OP_rr v22.4s, v6.4s, v12.s[1]
|
||||
OP_ii v22.4s, v7.4s, v13.s[1]
|
||||
OP_ri v23.4s, v6.4s, v13.s[1]
|
||||
OP_ir v23.4s, v7.4s, v12.s[1]
|
||||
|
||||
OP_rr v24.4s, v4.4s, v12.4s[2]
|
||||
OP_ii v24.4s, v5.4s, v13.4s[2]
|
||||
OP_ri v25.4s, v4.4s, v13.4s[2]
|
||||
OP_ir v25.4s, v5.4s, v12.4s[2]
|
||||
OP_rr v24.4s, v4.4s, v12.s[2]
|
||||
OP_ii v24.4s, v5.4s, v13.s[2]
|
||||
OP_ri v25.4s, v4.4s, v13.s[2]
|
||||
OP_ir v25.4s, v5.4s, v12.s[2]
|
||||
|
||||
OP_rr v26.4s, v6.4s, v12.4s[2]
|
||||
OP_ii v26.4s, v7.4s, v13.4s[2]
|
||||
OP_ri v27.4s, v6.4s, v13.4s[2]
|
||||
OP_ir v27.4s, v7.4s, v12.4s[2]
|
||||
OP_rr v26.4s, v6.4s, v12.s[2]
|
||||
OP_ii v26.4s, v7.4s, v13.s[2]
|
||||
OP_ri v27.4s, v6.4s, v13.s[2]
|
||||
OP_ir v27.4s, v7.4s, v12.s[2]
|
||||
|
||||
OP_rr v28.4s, v4.4s, v12.4s[3]
|
||||
OP_ii v28.4s, v5.4s, v13.4s[3]
|
||||
OP_ri v29.4s, v4.4s, v13.4s[3]
|
||||
OP_ir v29.4s, v5.4s, v12.4s[3]
|
||||
OP_rr v28.4s, v4.4s, v12.s[3]
|
||||
OP_ii v28.4s, v5.4s, v13.s[3]
|
||||
OP_ri v29.4s, v4.4s, v13.s[3]
|
||||
OP_ir v29.4s, v5.4s, v12.s[3]
|
||||
|
||||
OP_rr v30.4s, v6.4s, v12.4s[3]
|
||||
OP_ii v30.4s, v7.4s, v13.4s[3]
|
||||
OP_ri v31.4s, v6.4s, v13.4s[3]
|
||||
OP_ir v31.4s, v7.4s, v12.4s[3]
|
||||
OP_rr v30.4s, v6.4s, v12.s[3]
|
||||
OP_ii v30.4s, v7.4s, v13.s[3]
|
||||
OP_ri v31.4s, v6.4s, v13.s[3]
|
||||
OP_ir v31.4s, v7.4s, v12.s[3]
|
||||
|
||||
.endm
|
||||
|
||||
|
|
@ -423,45 +423,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v2.4s, v3.4s}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.4s, v0.4s, v8.4s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.4s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.4s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.4s[0]
|
||||
OP_rr v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
OP_rr v18.4s, v2.4s, v8.4s[0]
|
||||
OP_ii v18.4s, v3.4s, v9.4s[0]
|
||||
OP_ri v19.4s, v2.4s, v9.4s[0]
|
||||
OP_ir v19.4s, v3.4s, v8.4s[0]
|
||||
OP_rr v18.4s, v2.4s, v8.s[0]
|
||||
OP_ii v18.4s, v3.4s, v9.s[0]
|
||||
OP_ri v19.4s, v2.4s, v9.s[0]
|
||||
OP_ir v19.4s, v3.4s, v8.s[0]
|
||||
|
||||
OP_rr v20.4s, v0.4s, v8.4s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.4s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.4s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.4s[1]
|
||||
OP_rr v20.4s, v0.4s, v8.s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.s[1]
|
||||
|
||||
OP_rr v22.4s, v2.4s, v8.4s[1]
|
||||
OP_ii v22.4s, v3.4s, v9.4s[1]
|
||||
OP_ri v23.4s, v2.4s, v9.4s[1]
|
||||
OP_ir v23.4s, v3.4s, v8.4s[1]
|
||||
OP_rr v22.4s, v2.4s, v8.s[1]
|
||||
OP_ii v22.4s, v3.4s, v9.s[1]
|
||||
OP_ri v23.4s, v2.4s, v9.s[1]
|
||||
OP_ir v23.4s, v3.4s, v8.s[1]
|
||||
|
||||
OP_rr v24.4s, v0.4s, v8.4s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.4s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.4s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.4s[2]
|
||||
OP_rr v24.4s, v0.4s, v8.s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.s[2]
|
||||
|
||||
OP_rr v26.4s, v2.4s, v8.4s[2]
|
||||
OP_ii v26.4s, v3.4s, v9.4s[2]
|
||||
OP_ri v27.4s, v2.4s, v9.4s[2]
|
||||
OP_ir v27.4s, v3.4s, v8.4s[2]
|
||||
OP_rr v26.4s, v2.4s, v8.s[2]
|
||||
OP_ii v26.4s, v3.4s, v9.s[2]
|
||||
OP_ri v27.4s, v2.4s, v9.s[2]
|
||||
OP_ir v27.4s, v3.4s, v8.s[2]
|
||||
|
||||
OP_rr v28.4s, v0.4s, v8.4s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.4s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.4s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.4s[3]
|
||||
OP_rr v28.4s, v0.4s, v8.s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.s[3]
|
||||
|
||||
OP_rr v30.4s, v2.4s, v8.4s[3]
|
||||
OP_ii v30.4s, v3.4s, v9.4s[3]
|
||||
OP_ri v31.4s, v2.4s, v9.4s[3]
|
||||
OP_ir v31.4s, v3.4s, v8.4s[3]
|
||||
OP_rr v30.4s, v2.4s, v8.s[3]
|
||||
OP_ii v30.4s, v3.4s, v9.s[3]
|
||||
OP_ri v31.4s, v2.4s, v9.s[3]
|
||||
OP_ir v31.4s, v3.4s, v8.s[3]
|
||||
|
||||
.endm
|
||||
|
||||
|
|
@ -560,49 +560,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.4s, v1.4s}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v16.4s, v0.4s, v8.4s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.4s[0]
|
||||
fmul v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.s[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v17.16b, v17.16b, v17.16b
|
||||
fmls v17.4s, v0.4s, v9.4s[0]
|
||||
fmls v17.4s, v0.4s, v9.s[0]
|
||||
#else
|
||||
fmul v17.4s, v0.4s, v9.4s[0]
|
||||
fmul v17.4s, v0.4s, v9.s[0]
|
||||
#endif
|
||||
OP_ir v17.4s, v1.4s, v8.4s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
fmul v20.4s, v0.4s, v8.4s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.4s[1]
|
||||
fmul v20.4s, v0.4s, v8.s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.s[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v21.16b, v21.16b, v21.16b
|
||||
fmls v21.4s, v0.4s, v9.4s[1]
|
||||
fmls v21.4s, v0.4s, v9.s[1]
|
||||
#else
|
||||
fmul v21.4s, v0.4s, v9.4s[1]
|
||||
fmul v21.4s, v0.4s, v9.s[1]
|
||||
#endif
|
||||
OP_ir v21.4s, v1.4s, v8.4s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.s[1]
|
||||
|
||||
fmul v24.4s, v0.4s, v8.4s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.4s[2]
|
||||
fmul v24.4s, v0.4s, v8.s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.s[2]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fmls v25.4s, v0.4s, v9.4s[2]
|
||||
fmls v25.4s, v0.4s, v9.s[2]
|
||||
#else
|
||||
fmul v25.4s, v0.4s, v9.4s[2]
|
||||
fmul v25.4s, v0.4s, v9.s[2]
|
||||
#endif
|
||||
OP_ir v25.4s, v1.4s, v8.4s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.s[2]
|
||||
|
||||
fmul v28.4s, v0.4s, v8.4s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.4s[3]
|
||||
fmul v28.4s, v0.4s, v8.s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.s[3]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v29.16b, v29.16b, v29.16b
|
||||
fmls v29.4s, v0.4s, v9.4s[3]
|
||||
fmls v29.4s, v0.4s, v9.s[3]
|
||||
#else
|
||||
fmul v29.4s, v0.4s, v9.4s[3]
|
||||
fmul v29.4s, v0.4s, v9.s[3]
|
||||
#endif
|
||||
OP_ir v29.4s, v1.4s, v8.4s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.s[3]
|
||||
|
||||
ld2 {v12.4s, v13.4s}, [pB]
|
||||
add pB, pB, #32
|
||||
|
|
@ -611,85 +611,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL4x4_M1
|
||||
OP_rr v16.4s, v0.4s, v8.4s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.4s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.4s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.4s[0]
|
||||
OP_rr v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
ld2 {v12.4s, v13.4s}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v20.4s, v0.4s, v8.4s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.4s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.4s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.4s[1]
|
||||
OP_rr v20.4s, v0.4s, v8.s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.s[1]
|
||||
|
||||
ld2 {v4.4s, v5.4s}, [pA] // For next round
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v24.4s, v0.4s, v8.4s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.4s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.4s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.4s[2]
|
||||
OP_rr v24.4s, v0.4s, v8.s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.s[2]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
|
||||
OP_rr v28.4s, v0.4s, v8.4s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.4s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.4s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.4s[3]
|
||||
OP_rr v28.4s, v0.4s, v8.s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.s[3]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_M2
|
||||
OP_rr v16.4s, v4.4s, v12.4s[0]
|
||||
OP_ii v16.4s, v5.4s, v13.4s[0]
|
||||
OP_ri v17.4s, v4.4s, v13.4s[0]
|
||||
OP_ir v17.4s, v5.4s, v12.4s[0]
|
||||
OP_rr v16.4s, v4.4s, v12.s[0]
|
||||
OP_ii v16.4s, v5.4s, v13.s[0]
|
||||
OP_ri v17.4s, v4.4s, v13.s[0]
|
||||
OP_ir v17.4s, v5.4s, v12.s[0]
|
||||
|
||||
ld2 {v8.4s, v9.4s}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v20.4s, v4.4s, v12.4s[1]
|
||||
OP_ii v20.4s, v5.4s, v13.4s[1]
|
||||
OP_ri v21.4s, v4.4s, v13.4s[1]
|
||||
OP_ir v21.4s, v5.4s, v12.4s[1]
|
||||
OP_rr v20.4s, v4.4s, v12.s[1]
|
||||
OP_ii v20.4s, v5.4s, v13.s[1]
|
||||
OP_ri v21.4s, v4.4s, v13.s[1]
|
||||
OP_ir v21.4s, v5.4s, v12.s[1]
|
||||
|
||||
ld2 {v0.4s, v1.4s}, [pA] // For next round
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v24.4s, v4.4s, v12.4s[2]
|
||||
OP_ii v24.4s, v5.4s, v13.4s[2]
|
||||
OP_ri v25.4s, v4.4s, v13.4s[2]
|
||||
OP_ir v25.4s, v5.4s, v12.4s[2]
|
||||
OP_rr v24.4s, v4.4s, v12.s[2]
|
||||
OP_ii v24.4s, v5.4s, v13.s[2]
|
||||
OP_ri v25.4s, v4.4s, v13.s[2]
|
||||
OP_ir v25.4s, v5.4s, v12.s[2]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
|
||||
OP_rr v28.4s, v4.4s, v12.4s[3]
|
||||
OP_ii v28.4s, v5.4s, v13.4s[3]
|
||||
OP_ri v29.4s, v4.4s, v13.4s[3]
|
||||
OP_ir v29.4s, v5.4s, v12.4s[3]
|
||||
OP_rr v28.4s, v4.4s, v12.s[3]
|
||||
OP_ii v28.4s, v5.4s, v13.s[3]
|
||||
OP_ri v29.4s, v4.4s, v13.s[3]
|
||||
OP_ir v29.4s, v5.4s, v12.s[3]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_E
|
||||
OP_rr v16.4s, v4.4s, v12.4s[0]
|
||||
OP_ii v16.4s, v5.4s, v13.4s[0]
|
||||
OP_ri v17.4s, v4.4s, v13.4s[0]
|
||||
OP_ir v17.4s, v5.4s, v12.4s[0]
|
||||
OP_rr v16.4s, v4.4s, v12.s[0]
|
||||
OP_ii v16.4s, v5.4s, v13.s[0]
|
||||
OP_ri v17.4s, v4.4s, v13.s[0]
|
||||
OP_ir v17.4s, v5.4s, v12.s[0]
|
||||
|
||||
OP_rr v20.4s, v4.4s, v12.4s[1]
|
||||
OP_ii v20.4s, v5.4s, v13.4s[1]
|
||||
OP_ri v21.4s, v4.4s, v13.4s[1]
|
||||
OP_ir v21.4s, v5.4s, v12.4s[1]
|
||||
OP_rr v20.4s, v4.4s, v12.s[1]
|
||||
OP_ii v20.4s, v5.4s, v13.s[1]
|
||||
OP_ri v21.4s, v4.4s, v13.s[1]
|
||||
OP_ir v21.4s, v5.4s, v12.s[1]
|
||||
|
||||
OP_rr v24.4s, v4.4s, v12.4s[2]
|
||||
OP_ii v24.4s, v5.4s, v13.4s[2]
|
||||
OP_ri v25.4s, v4.4s, v13.4s[2]
|
||||
OP_ir v25.4s, v5.4s, v12.4s[2]
|
||||
OP_rr v24.4s, v4.4s, v12.s[2]
|
||||
OP_ii v24.4s, v5.4s, v13.s[2]
|
||||
OP_ri v25.4s, v4.4s, v13.s[2]
|
||||
OP_ir v25.4s, v5.4s, v12.s[2]
|
||||
|
||||
OP_rr v28.4s, v4.4s, v12.4s[3]
|
||||
OP_ii v28.4s, v5.4s, v13.4s[3]
|
||||
OP_ri v29.4s, v4.4s, v13.4s[3]
|
||||
OP_ir v29.4s, v5.4s, v12.4s[3]
|
||||
OP_rr v28.4s, v4.4s, v12.s[3]
|
||||
OP_ii v28.4s, v5.4s, v13.s[3]
|
||||
OP_ri v29.4s, v4.4s, v13.s[3]
|
||||
OP_ir v29.4s, v5.4s, v12.s[3]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_SUB
|
||||
|
|
@ -698,25 +698,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.4s, v1.4s}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.4s, v0.4s, v8.4s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.4s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.4s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.4s[0]
|
||||
OP_rr v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
OP_rr v20.4s, v0.4s, v8.4s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.4s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.4s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.4s[1]
|
||||
OP_rr v20.4s, v0.4s, v8.s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.s[1]
|
||||
|
||||
OP_rr v24.4s, v0.4s, v8.4s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.4s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.4s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.4s[2]
|
||||
OP_rr v24.4s, v0.4s, v8.s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.s[2]
|
||||
|
||||
OP_rr v28.4s, v0.4s, v8.4s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.4s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.4s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.4s[3]
|
||||
OP_rr v28.4s, v0.4s, v8.s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.s[3]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
|
|
@ -778,25 +778,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.2s, v1.2s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
OP_rr v16.2s, v0.2s, v8.4s[0]
|
||||
OP_ii v16.2s, v1.2s, v9.4s[0]
|
||||
OP_ri v17.2s, v0.2s, v9.4s[0]
|
||||
OP_ir v17.2s, v1.2s, v8.4s[0]
|
||||
OP_rr v16.2s, v0.2s, v8.s[0]
|
||||
OP_ii v16.2s, v1.2s, v9.s[0]
|
||||
OP_ri v17.2s, v0.2s, v9.s[0]
|
||||
OP_ir v17.2s, v1.2s, v8.s[0]
|
||||
|
||||
OP_rr v20.2s, v0.2s, v8.4s[1]
|
||||
OP_ii v20.2s, v1.2s, v9.4s[1]
|
||||
OP_ri v21.2s, v0.2s, v9.4s[1]
|
||||
OP_ir v21.2s, v1.2s, v8.4s[1]
|
||||
OP_rr v20.2s, v0.2s, v8.s[1]
|
||||
OP_ii v20.2s, v1.2s, v9.s[1]
|
||||
OP_ri v21.2s, v0.2s, v9.s[1]
|
||||
OP_ir v21.2s, v1.2s, v8.s[1]
|
||||
|
||||
OP_rr v24.2s, v0.2s, v8.4s[2]
|
||||
OP_ii v24.2s, v1.2s, v9.4s[2]
|
||||
OP_ri v25.2s, v0.2s, v9.4s[2]
|
||||
OP_ir v25.2s, v1.2s, v8.4s[2]
|
||||
OP_rr v24.2s, v0.2s, v8.s[2]
|
||||
OP_ii v24.2s, v1.2s, v9.s[2]
|
||||
OP_ri v25.2s, v0.2s, v9.s[2]
|
||||
OP_ir v25.2s, v1.2s, v8.s[2]
|
||||
|
||||
OP_rr v28.2s, v0.2s, v8.4s[3]
|
||||
OP_ii v28.2s, v1.2s, v9.4s[3]
|
||||
OP_ri v29.2s, v0.2s, v9.4s[3]
|
||||
OP_ir v29.2s, v1.2s, v8.4s[3]
|
||||
OP_rr v28.2s, v0.2s, v8.s[3]
|
||||
OP_ii v28.2s, v1.2s, v9.s[3]
|
||||
OP_ri v29.2s, v0.2s, v9.s[3]
|
||||
OP_ir v29.2s, v1.2s, v8.s[3]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
|
|
@ -858,25 +858,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.s, v1.s}[0], [pA]
|
||||
add pA, pA, #8
|
||||
|
||||
OP_rr s16, s0, v8.4s[0]
|
||||
OP_ii s16, s1, v9.4s[0]
|
||||
OP_ri s17, s0, v9.4s[0]
|
||||
OP_ir s17, s1, v8.4s[0]
|
||||
OP_rr s16, s0, v8.s[0]
|
||||
OP_ii s16, s1, v9.s[0]
|
||||
OP_ri s17, s0, v9.s[0]
|
||||
OP_ir s17, s1, v8.s[0]
|
||||
|
||||
OP_rr s20, s0, v8.4s[1]
|
||||
OP_ii s20, s1, v9.4s[1]
|
||||
OP_ri s21, s0, v9.4s[1]
|
||||
OP_ir s21, s1, v8.4s[1]
|
||||
OP_rr s20, s0, v8.s[1]
|
||||
OP_ii s20, s1, v9.s[1]
|
||||
OP_ri s21, s0, v9.s[1]
|
||||
OP_ir s21, s1, v8.s[1]
|
||||
|
||||
OP_rr s24, s0, v8.4s[2]
|
||||
OP_ii s24, s1, v9.4s[2]
|
||||
OP_ri s25, s0, v9.4s[2]
|
||||
OP_ir s25, s1, v8.4s[2]
|
||||
OP_rr s24, s0, v8.s[2]
|
||||
OP_ii s24, s1, v9.s[2]
|
||||
OP_ri s25, s0, v9.s[2]
|
||||
OP_ir s25, s1, v8.s[2]
|
||||
|
||||
OP_rr s28, s0, v8.4s[3]
|
||||
OP_ii s28, s1, v9.4s[3]
|
||||
OP_ri s29, s0, v9.4s[3]
|
||||
OP_ir s29, s1, v8.4s[3]
|
||||
OP_rr s28, s0, v8.s[3]
|
||||
OP_ii s28, s1, v9.s[3]
|
||||
OP_ri s29, s0, v9.s[3]
|
||||
OP_ir s29, s1, v8.s[3]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x4
|
||||
|
|
@ -940,25 +940,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v2.4s, v3.4s}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.4s, v0.4s, v8.2s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.2s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.2s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.2s[0]
|
||||
OP_rr v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
OP_rr v18.4s, v2.4s, v8.2s[0]
|
||||
OP_ii v18.4s, v3.4s, v9.2s[0]
|
||||
OP_ri v19.4s, v2.4s, v9.2s[0]
|
||||
OP_ir v19.4s, v3.4s, v8.2s[0]
|
||||
OP_rr v18.4s, v2.4s, v8.s[0]
|
||||
OP_ii v18.4s, v3.4s, v9.s[0]
|
||||
OP_ri v19.4s, v2.4s, v9.s[0]
|
||||
OP_ir v19.4s, v3.4s, v8.s[0]
|
||||
|
||||
OP_rr v20.4s, v0.4s, v8.2s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.2s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.2s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.2s[1]
|
||||
OP_rr v20.4s, v0.4s, v8.s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.s[1]
|
||||
|
||||
OP_rr v22.4s, v2.4s, v8.2s[1]
|
||||
OP_ii v22.4s, v3.4s, v9.2s[1]
|
||||
OP_ri v23.4s, v2.4s, v9.2s[1]
|
||||
OP_ir v23.4s, v3.4s, v8.2s[1]
|
||||
OP_rr v22.4s, v2.4s, v8.s[1]
|
||||
OP_ii v22.4s, v3.4s, v9.s[1]
|
||||
OP_ri v23.4s, v2.4s, v9.s[1]
|
||||
OP_ir v23.4s, v3.4s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x2
|
||||
|
|
@ -1016,15 +1016,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.4s, v1.4s}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.4s, v0.4s, v8.2s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.2s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.2s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.2s[0]
|
||||
OP_rr v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
OP_rr v20.4s, v0.4s, v8.2s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.2s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.2s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.2s[1]
|
||||
OP_rr v20.4s, v0.4s, v8.s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
|
|
@ -1064,15 +1064,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.2s, v1.2s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
OP_rr v16.2s, v0.2s, v8.2s[0]
|
||||
OP_ii v16.2s, v1.2s, v9.2s[0]
|
||||
OP_ri v17.2s, v0.2s, v9.2s[0]
|
||||
OP_ir v17.2s, v1.2s, v8.2s[0]
|
||||
OP_rr v16.2s, v0.2s, v8.s[0]
|
||||
OP_ii v16.2s, v1.2s, v9.s[0]
|
||||
OP_ri v17.2s, v0.2s, v9.s[0]
|
||||
OP_ir v17.2s, v1.2s, v8.s[0]
|
||||
|
||||
OP_rr v20.2s, v0.2s, v8.2s[1]
|
||||
OP_ii v20.2s, v1.2s, v9.2s[1]
|
||||
OP_ri v21.2s, v0.2s, v9.2s[1]
|
||||
OP_ir v21.2s, v1.2s, v8.2s[1]
|
||||
OP_rr v20.2s, v0.2s, v8.s[1]
|
||||
OP_ii v20.2s, v1.2s, v9.s[1]
|
||||
OP_ri v21.2s, v0.2s, v9.s[1]
|
||||
OP_ir v21.2s, v1.2s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
|
|
@ -1112,15 +1112,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.s, v1.s}[0], [pA]
|
||||
add pA, pA, #8
|
||||
|
||||
OP_rr s16, s0, v8.2s[0]
|
||||
OP_ii s16, s1, v9.2s[0]
|
||||
OP_ri s17, s0, v9.2s[0]
|
||||
OP_ir s17, s1, v8.2s[0]
|
||||
OP_rr s16, s0, v8.s[0]
|
||||
OP_ii s16, s1, v9.s[0]
|
||||
OP_ri s17, s0, v9.s[0]
|
||||
OP_ir s17, s1, v8.s[0]
|
||||
|
||||
OP_rr s20, s0, v8.2s[1]
|
||||
OP_ii s20, s1, v9.2s[1]
|
||||
OP_ri s21, s0, v9.2s[1]
|
||||
OP_ir s21, s1, v8.2s[1]
|
||||
OP_rr s20, s0, v8.s[1]
|
||||
OP_ii s20, s1, v9.s[1]
|
||||
OP_ri s21, s0, v9.s[1]
|
||||
OP_ir s21, s1, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
|
|
@ -1162,15 +1162,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v2.4s, v3.4s}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.4s, v0.4s, v8.4s[0]
|
||||
OP_ii v16.4s, v1.4s, v8.4s[1]
|
||||
OP_ri v17.4s, v0.4s, v8.4s[1]
|
||||
OP_ir v17.4s, v1.4s, v8.4s[0]
|
||||
OP_rr v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v8.s[1]
|
||||
OP_ri v17.4s, v0.4s, v8.s[1]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
OP_rr v18.4s, v2.4s, v8.4s[0]
|
||||
OP_ii v18.4s, v3.4s, v8.4s[1]
|
||||
OP_ri v19.4s, v2.4s, v8.4s[1]
|
||||
OP_ir v19.4s, v3.4s, v8.4s[0]
|
||||
OP_rr v18.4s, v2.4s, v8.s[0]
|
||||
OP_ii v18.4s, v3.4s, v8.s[1]
|
||||
OP_ri v19.4s, v2.4s, v8.s[1]
|
||||
OP_ir v19.4s, v3.4s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x1
|
||||
|
|
|
|||
|
|
@ -170,49 +170,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.4s, v1.4s}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v16.4s, v0.4s, v8.4s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.4s[0]
|
||||
fmul v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.s[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v17.16b, v17.16b, v17.16b
|
||||
fmls v17.4s, v0.4s, v9.4s[0]
|
||||
fmls v17.4s, v0.4s, v9.s[0]
|
||||
#else
|
||||
fmul v17.4s, v0.4s, v9.4s[0]
|
||||
fmul v17.4s, v0.4s, v9.s[0]
|
||||
#endif
|
||||
OP_ir v17.4s, v1.4s, v8.4s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
fmul v20.4s, v0.4s, v8.4s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.4s[1]
|
||||
fmul v20.4s, v0.4s, v8.s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.s[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v21.16b, v21.16b, v21.16b
|
||||
fmls v21.4s, v0.4s, v9.4s[1]
|
||||
fmls v21.4s, v0.4s, v9.s[1]
|
||||
#else
|
||||
fmul v21.4s, v0.4s, v9.4s[1]
|
||||
fmul v21.4s, v0.4s, v9.s[1]
|
||||
#endif
|
||||
OP_ir v21.4s, v1.4s, v8.4s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.s[1]
|
||||
|
||||
fmul v24.4s, v0.4s, v8.4s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.4s[2]
|
||||
fmul v24.4s, v0.4s, v8.s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.s[2]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fmls v25.4s, v0.4s, v9.4s[2]
|
||||
fmls v25.4s, v0.4s, v9.s[2]
|
||||
#else
|
||||
fmul v25.4s, v0.4s, v9.4s[2]
|
||||
fmul v25.4s, v0.4s, v9.s[2]
|
||||
#endif
|
||||
OP_ir v25.4s, v1.4s, v8.4s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.s[2]
|
||||
|
||||
fmul v28.4s, v0.4s, v8.4s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.4s[3]
|
||||
fmul v28.4s, v0.4s, v8.s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.s[3]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v29.16b, v29.16b, v29.16b
|
||||
fmls v29.4s, v0.4s, v9.4s[3]
|
||||
fmls v29.4s, v0.4s, v9.s[3]
|
||||
#else
|
||||
fmul v29.4s, v0.4s, v9.4s[3]
|
||||
fmul v29.4s, v0.4s, v9.s[3]
|
||||
#endif
|
||||
OP_ir v29.4s, v1.4s, v8.4s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.s[3]
|
||||
|
||||
ld2 {v12.4s, v13.4s}, [pB]
|
||||
add pB, pB, #32
|
||||
|
|
@ -221,85 +221,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL4x4_M1
|
||||
OP_rr v16.4s, v0.4s, v8.4s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.4s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.4s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.4s[0]
|
||||
OP_rr v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
ld2 {v12.4s, v13.4s}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v20.4s, v0.4s, v8.4s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.4s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.4s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.4s[1]
|
||||
OP_rr v20.4s, v0.4s, v8.s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.s[1]
|
||||
|
||||
ld2 {v4.4s, v5.4s}, [pA] // For next round
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v24.4s, v0.4s, v8.4s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.4s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.4s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.4s[2]
|
||||
OP_rr v24.4s, v0.4s, v8.s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.s[2]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
|
||||
OP_rr v28.4s, v0.4s, v8.4s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.4s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.4s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.4s[3]
|
||||
OP_rr v28.4s, v0.4s, v8.s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.s[3]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_M2
|
||||
OP_rr v16.4s, v4.4s, v12.4s[0]
|
||||
OP_ii v16.4s, v5.4s, v13.4s[0]
|
||||
OP_ri v17.4s, v4.4s, v13.4s[0]
|
||||
OP_ir v17.4s, v5.4s, v12.4s[0]
|
||||
OP_rr v16.4s, v4.4s, v12.s[0]
|
||||
OP_ii v16.4s, v5.4s, v13.s[0]
|
||||
OP_ri v17.4s, v4.4s, v13.s[0]
|
||||
OP_ir v17.4s, v5.4s, v12.s[0]
|
||||
|
||||
ld2 {v8.4s, v9.4s}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v20.4s, v4.4s, v12.4s[1]
|
||||
OP_ii v20.4s, v5.4s, v13.4s[1]
|
||||
OP_ri v21.4s, v4.4s, v13.4s[1]
|
||||
OP_ir v21.4s, v5.4s, v12.4s[1]
|
||||
OP_rr v20.4s, v4.4s, v12.s[1]
|
||||
OP_ii v20.4s, v5.4s, v13.s[1]
|
||||
OP_ri v21.4s, v4.4s, v13.s[1]
|
||||
OP_ir v21.4s, v5.4s, v12.s[1]
|
||||
|
||||
ld2 {v0.4s, v1.4s}, [pA] // For next round
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v24.4s, v4.4s, v12.4s[2]
|
||||
OP_ii v24.4s, v5.4s, v13.4s[2]
|
||||
OP_ri v25.4s, v4.4s, v13.4s[2]
|
||||
OP_ir v25.4s, v5.4s, v12.4s[2]
|
||||
OP_rr v24.4s, v4.4s, v12.s[2]
|
||||
OP_ii v24.4s, v5.4s, v13.s[2]
|
||||
OP_ri v25.4s, v4.4s, v13.s[2]
|
||||
OP_ir v25.4s, v5.4s, v12.s[2]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
|
||||
OP_rr v28.4s, v4.4s, v12.4s[3]
|
||||
OP_ii v28.4s, v5.4s, v13.4s[3]
|
||||
OP_ri v29.4s, v4.4s, v13.4s[3]
|
||||
OP_ir v29.4s, v5.4s, v12.4s[3]
|
||||
OP_rr v28.4s, v4.4s, v12.s[3]
|
||||
OP_ii v28.4s, v5.4s, v13.s[3]
|
||||
OP_ri v29.4s, v4.4s, v13.s[3]
|
||||
OP_ir v29.4s, v5.4s, v12.s[3]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_E
|
||||
OP_rr v16.4s, v4.4s, v12.4s[0]
|
||||
OP_ii v16.4s, v5.4s, v13.4s[0]
|
||||
OP_ri v17.4s, v4.4s, v13.4s[0]
|
||||
OP_ir v17.4s, v5.4s, v12.4s[0]
|
||||
OP_rr v16.4s, v4.4s, v12.s[0]
|
||||
OP_ii v16.4s, v5.4s, v13.s[0]
|
||||
OP_ri v17.4s, v4.4s, v13.s[0]
|
||||
OP_ir v17.4s, v5.4s, v12.s[0]
|
||||
|
||||
OP_rr v20.4s, v4.4s, v12.4s[1]
|
||||
OP_ii v20.4s, v5.4s, v13.4s[1]
|
||||
OP_ri v21.4s, v4.4s, v13.4s[1]
|
||||
OP_ir v21.4s, v5.4s, v12.4s[1]
|
||||
OP_rr v20.4s, v4.4s, v12.s[1]
|
||||
OP_ii v20.4s, v5.4s, v13.s[1]
|
||||
OP_ri v21.4s, v4.4s, v13.s[1]
|
||||
OP_ir v21.4s, v5.4s, v12.s[1]
|
||||
|
||||
OP_rr v24.4s, v4.4s, v12.4s[2]
|
||||
OP_ii v24.4s, v5.4s, v13.4s[2]
|
||||
OP_ri v25.4s, v4.4s, v13.4s[2]
|
||||
OP_ir v25.4s, v5.4s, v12.4s[2]
|
||||
OP_rr v24.4s, v4.4s, v12.s[2]
|
||||
OP_ii v24.4s, v5.4s, v13.s[2]
|
||||
OP_ri v25.4s, v4.4s, v13.s[2]
|
||||
OP_ir v25.4s, v5.4s, v12.s[2]
|
||||
|
||||
OP_rr v28.4s, v4.4s, v12.4s[3]
|
||||
OP_ii v28.4s, v5.4s, v13.4s[3]
|
||||
OP_ri v29.4s, v4.4s, v13.4s[3]
|
||||
OP_ir v29.4s, v5.4s, v12.4s[3]
|
||||
OP_rr v28.4s, v4.4s, v12.s[3]
|
||||
OP_ii v28.4s, v5.4s, v13.s[3]
|
||||
OP_ri v29.4s, v4.4s, v13.s[3]
|
||||
OP_ir v29.4s, v5.4s, v12.s[3]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_SUB
|
||||
|
|
@ -308,25 +308,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.4s, v1.4s}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.4s, v0.4s, v8.4s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.4s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.4s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.4s[0]
|
||||
OP_rr v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
OP_rr v20.4s, v0.4s, v8.4s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.4s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.4s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.4s[1]
|
||||
OP_rr v20.4s, v0.4s, v8.s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.s[1]
|
||||
|
||||
OP_rr v24.4s, v0.4s, v8.4s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.4s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.4s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.4s[2]
|
||||
OP_rr v24.4s, v0.4s, v8.s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.s[2]
|
||||
|
||||
OP_rr v28.4s, v0.4s, v8.4s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.4s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.4s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.4s[3]
|
||||
OP_rr v28.4s, v0.4s, v8.s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.s[3]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
|
|
@ -384,25 +384,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.2s, v1.2s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
OP_rr v16.2s, v0.2s, v8.4s[0]
|
||||
OP_ii v16.2s, v1.2s, v9.4s[0]
|
||||
OP_ri v17.2s, v0.2s, v9.4s[0]
|
||||
OP_ir v17.2s, v1.2s, v8.4s[0]
|
||||
OP_rr v16.2s, v0.2s, v8.s[0]
|
||||
OP_ii v16.2s, v1.2s, v9.s[0]
|
||||
OP_ri v17.2s, v0.2s, v9.s[0]
|
||||
OP_ir v17.2s, v1.2s, v8.s[0]
|
||||
|
||||
OP_rr v20.2s, v0.2s, v8.4s[1]
|
||||
OP_ii v20.2s, v1.2s, v9.4s[1]
|
||||
OP_ri v21.2s, v0.2s, v9.4s[1]
|
||||
OP_ir v21.2s, v1.2s, v8.4s[1]
|
||||
OP_rr v20.2s, v0.2s, v8.s[1]
|
||||
OP_ii v20.2s, v1.2s, v9.s[1]
|
||||
OP_ri v21.2s, v0.2s, v9.s[1]
|
||||
OP_ir v21.2s, v1.2s, v8.s[1]
|
||||
|
||||
OP_rr v24.2s, v0.2s, v8.4s[2]
|
||||
OP_ii v24.2s, v1.2s, v9.4s[2]
|
||||
OP_ri v25.2s, v0.2s, v9.4s[2]
|
||||
OP_ir v25.2s, v1.2s, v8.4s[2]
|
||||
OP_rr v24.2s, v0.2s, v8.s[2]
|
||||
OP_ii v24.2s, v1.2s, v9.s[2]
|
||||
OP_ri v25.2s, v0.2s, v9.s[2]
|
||||
OP_ir v25.2s, v1.2s, v8.s[2]
|
||||
|
||||
OP_rr v28.2s, v0.2s, v8.4s[3]
|
||||
OP_ii v28.2s, v1.2s, v9.4s[3]
|
||||
OP_ri v29.2s, v0.2s, v9.4s[3]
|
||||
OP_ir v29.2s, v1.2s, v8.4s[3]
|
||||
OP_rr v28.2s, v0.2s, v8.s[3]
|
||||
OP_ii v28.2s, v1.2s, v9.s[3]
|
||||
OP_ri v29.2s, v0.2s, v9.s[3]
|
||||
OP_ir v29.2s, v1.2s, v8.s[3]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
|
|
@ -460,25 +460,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.s, v1.s}[0], [pA]
|
||||
add pA, pA, #8
|
||||
|
||||
OP_rr s16, s0, v8.4s[0]
|
||||
OP_ii s16, s1, v9.4s[0]
|
||||
OP_ri s17, s0, v9.4s[0]
|
||||
OP_ir s17, s1, v8.4s[0]
|
||||
OP_rr s16, s0, v8.s[0]
|
||||
OP_ii s16, s1, v9.s[0]
|
||||
OP_ri s17, s0, v9.s[0]
|
||||
OP_ir s17, s1, v8.s[0]
|
||||
|
||||
OP_rr s20, s0, v8.4s[1]
|
||||
OP_ii s20, s1, v9.4s[1]
|
||||
OP_ri s21, s0, v9.4s[1]
|
||||
OP_ir s21, s1, v8.4s[1]
|
||||
OP_rr s20, s0, v8.s[1]
|
||||
OP_ii s20, s1, v9.s[1]
|
||||
OP_ri s21, s0, v9.s[1]
|
||||
OP_ir s21, s1, v8.s[1]
|
||||
|
||||
OP_rr s24, s0, v8.4s[2]
|
||||
OP_ii s24, s1, v9.4s[2]
|
||||
OP_ri s25, s0, v9.4s[2]
|
||||
OP_ir s25, s1, v8.4s[2]
|
||||
OP_rr s24, s0, v8.s[2]
|
||||
OP_ii s24, s1, v9.s[2]
|
||||
OP_ri s25, s0, v9.s[2]
|
||||
OP_ir s25, s1, v8.s[2]
|
||||
|
||||
OP_rr s28, s0, v8.4s[3]
|
||||
OP_ii s28, s1, v9.4s[3]
|
||||
OP_ri s29, s0, v9.4s[3]
|
||||
OP_ir s29, s1, v8.4s[3]
|
||||
OP_rr s28, s0, v8.s[3]
|
||||
OP_ii s28, s1, v9.s[3]
|
||||
OP_ri s29, s0, v9.s[3]
|
||||
OP_ir s29, s1, v8.s[3]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x4
|
||||
|
|
@ -532,15 +532,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.4s, v1.4s}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.4s, v0.4s, v8.2s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.2s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.2s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.2s[0]
|
||||
OP_rr v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
OP_rr v20.4s, v0.4s, v8.2s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.2s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.2s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.2s[1]
|
||||
OP_rr v20.4s, v0.4s, v8.s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
|
|
@ -578,15 +578,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.2s, v1.2s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
OP_rr v16.2s, v0.2s, v8.2s[0]
|
||||
OP_ii v16.2s, v1.2s, v9.2s[0]
|
||||
OP_ri v17.2s, v0.2s, v9.2s[0]
|
||||
OP_ir v17.2s, v1.2s, v8.2s[0]
|
||||
OP_rr v16.2s, v0.2s, v8.s[0]
|
||||
OP_ii v16.2s, v1.2s, v9.s[0]
|
||||
OP_ri v17.2s, v0.2s, v9.s[0]
|
||||
OP_ir v17.2s, v1.2s, v8.s[0]
|
||||
|
||||
OP_rr v20.2s, v0.2s, v8.2s[1]
|
||||
OP_ii v20.2s, v1.2s, v9.2s[1]
|
||||
OP_ri v21.2s, v0.2s, v9.2s[1]
|
||||
OP_ir v21.2s, v1.2s, v8.2s[1]
|
||||
OP_rr v20.2s, v0.2s, v8.s[1]
|
||||
OP_ii v20.2s, v1.2s, v9.s[1]
|
||||
OP_ri v21.2s, v0.2s, v9.s[1]
|
||||
OP_ir v21.2s, v1.2s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
|
|
@ -624,15 +624,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.s, v1.s}[0], [pA]
|
||||
add pA, pA, #8
|
||||
|
||||
OP_rr s16, s0, v8.2s[0]
|
||||
OP_ii s16, s1, v9.2s[0]
|
||||
OP_ri s17, s0, v9.2s[0]
|
||||
OP_ir s17, s1, v8.2s[0]
|
||||
OP_rr s16, s0, v8.s[0]
|
||||
OP_ii s16, s1, v9.s[0]
|
||||
OP_ri s17, s0, v9.s[0]
|
||||
OP_ir s17, s1, v8.s[0]
|
||||
|
||||
OP_rr s20, s0, v8.2s[1]
|
||||
OP_ii s20, s1, v9.2s[1]
|
||||
OP_ri s21, s0, v9.2s[1]
|
||||
OP_ir s21, s1, v8.2s[1]
|
||||
OP_rr s20, s0, v8.s[1]
|
||||
OP_ii s20, s1, v9.s[1]
|
||||
OP_ri s21, s0, v9.s[1]
|
||||
OP_ir s21, s1, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
|
|
|
|||
|
|
@ -180,93 +180,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v2.4s, v3.4s}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v16.4s, v0.4s, v8.4s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.4s[0]
|
||||
fmul v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.s[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v17.16b, v17.16b, v17.16b
|
||||
fmls v17.4s, v0.4s, v9.4s[0]
|
||||
fmls v17.4s, v0.4s, v9.s[0]
|
||||
#else
|
||||
fmul v17.4s, v0.4s, v9.4s[0]
|
||||
fmul v17.4s, v0.4s, v9.s[0]
|
||||
#endif
|
||||
OP_ir v17.4s, v1.4s, v8.4s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
fmul v18.4s, v2.4s, v8.4s[0]
|
||||
OP_ii v18.4s, v3.4s, v9.4s[0]
|
||||
fmul v18.4s, v2.4s, v8.s[0]
|
||||
OP_ii v18.4s, v3.4s, v9.s[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v19.16b, v19.16b, v19.16b
|
||||
fmls v19.4s, v2.4s, v9.4s[0]
|
||||
fmls v19.4s, v2.4s, v9.s[0]
|
||||
#else
|
||||
fmul v19.4s, v2.4s, v9.4s[0]
|
||||
fmul v19.4s, v2.4s, v9.s[0]
|
||||
#endif
|
||||
OP_ir v19.4s, v3.4s, v8.4s[0]
|
||||
OP_ir v19.4s, v3.4s, v8.s[0]
|
||||
|
||||
fmul v20.4s, v0.4s, v8.4s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.4s[1]
|
||||
fmul v20.4s, v0.4s, v8.s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.s[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v21.16b, v21.16b, v21.16b
|
||||
fmls v21.4s, v0.4s, v9.4s[1]
|
||||
fmls v21.4s, v0.4s, v9.s[1]
|
||||
#else
|
||||
fmul v21.4s, v0.4s, v9.4s[1]
|
||||
fmul v21.4s, v0.4s, v9.s[1]
|
||||
#endif
|
||||
OP_ir v21.4s, v1.4s, v8.4s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.s[1]
|
||||
|
||||
fmul v22.4s, v2.4s, v8.4s[1]
|
||||
OP_ii v22.4s, v3.4s, v9.4s[1]
|
||||
fmul v22.4s, v2.4s, v8.s[1]
|
||||
OP_ii v22.4s, v3.4s, v9.s[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v23.16b, v23.16b, v23.16b
|
||||
fmls v23.4s, v2.4s, v9.4s[1]
|
||||
fmls v23.4s, v2.4s, v9.s[1]
|
||||
#else
|
||||
fmul v23.4s, v2.4s, v9.4s[1]
|
||||
fmul v23.4s, v2.4s, v9.s[1]
|
||||
#endif
|
||||
OP_ir v23.4s, v3.4s, v8.4s[1]
|
||||
OP_ir v23.4s, v3.4s, v8.s[1]
|
||||
|
||||
fmul v24.4s, v0.4s, v8.4s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.4s[2]
|
||||
fmul v24.4s, v0.4s, v8.s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.s[2]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fmls v25.4s, v0.4s, v9.4s[2]
|
||||
fmls v25.4s, v0.4s, v9.s[2]
|
||||
#else
|
||||
fmul v25.4s, v0.4s, v9.4s[2]
|
||||
fmul v25.4s, v0.4s, v9.s[2]
|
||||
#endif
|
||||
OP_ir v25.4s, v1.4s, v8.4s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.s[2]
|
||||
|
||||
fmul v26.4s, v2.4s, v8.4s[2]
|
||||
OP_ii v26.4s, v3.4s, v9.4s[2]
|
||||
fmul v26.4s, v2.4s, v8.s[2]
|
||||
OP_ii v26.4s, v3.4s, v9.s[2]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v27.16b, v27.16b, v27.16b
|
||||
fmls v27.4s, v2.4s, v9.4s[2]
|
||||
fmls v27.4s, v2.4s, v9.s[2]
|
||||
#else
|
||||
fmul v27.4s, v2.4s, v9.4s[2]
|
||||
fmul v27.4s, v2.4s, v9.s[2]
|
||||
#endif
|
||||
OP_ir v27.4s, v3.4s, v8.4s[2]
|
||||
OP_ir v27.4s, v3.4s, v8.s[2]
|
||||
|
||||
fmul v28.4s, v0.4s, v8.4s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.4s[3]
|
||||
fmul v28.4s, v0.4s, v8.s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.s[3]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v29.16b, v29.16b, v29.16b
|
||||
fmls v29.4s, v0.4s, v9.4s[3]
|
||||
fmls v29.4s, v0.4s, v9.s[3]
|
||||
#else
|
||||
fmul v29.4s, v0.4s, v9.4s[3]
|
||||
fmul v29.4s, v0.4s, v9.s[3]
|
||||
#endif
|
||||
OP_ir v29.4s, v1.4s, v8.4s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.s[3]
|
||||
|
||||
fmul v30.4s, v2.4s, v8.4s[3]
|
||||
OP_ii v30.4s, v3.4s, v9.4s[3]
|
||||
fmul v30.4s, v2.4s, v8.s[3]
|
||||
OP_ii v30.4s, v3.4s, v9.s[3]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v31.16b, v31.16b, v31.16b
|
||||
fmls v31.4s, v2.4s, v9.4s[3]
|
||||
fmls v31.4s, v2.4s, v9.s[3]
|
||||
#else
|
||||
fmul v31.4s, v2.4s, v9.4s[3]
|
||||
fmul v31.4s, v2.4s, v9.s[3]
|
||||
#endif
|
||||
OP_ir v31.4s, v3.4s, v8.4s[3]
|
||||
OP_ir v31.4s, v3.4s, v8.s[3]
|
||||
|
||||
ld2 {v12.4s, v13.4s}, [pB]
|
||||
add pB, pB, #32
|
||||
|
|
@ -277,45 +277,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL8x4_M1
|
||||
OP_rr v16.4s, v0.4s, v8.4s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.4s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.4s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.4s[0]
|
||||
OP_rr v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
OP_rr v18.4s, v2.4s, v8.4s[0]
|
||||
OP_ii v18.4s, v3.4s, v9.4s[0]
|
||||
OP_ri v19.4s, v2.4s, v9.4s[0]
|
||||
OP_ir v19.4s, v3.4s, v8.4s[0]
|
||||
OP_rr v18.4s, v2.4s, v8.s[0]
|
||||
OP_ii v18.4s, v3.4s, v9.s[0]
|
||||
OP_ri v19.4s, v2.4s, v9.s[0]
|
||||
OP_ir v19.4s, v3.4s, v8.s[0]
|
||||
|
||||
OP_rr v20.4s, v0.4s, v8.4s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.4s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.4s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.4s[1]
|
||||
OP_rr v20.4s, v0.4s, v8.s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.s[1]
|
||||
|
||||
OP_rr v22.4s, v2.4s, v8.4s[1]
|
||||
OP_ii v22.4s, v3.4s, v9.4s[1]
|
||||
OP_ri v23.4s, v2.4s, v9.4s[1]
|
||||
OP_ir v23.4s, v3.4s, v8.4s[1]
|
||||
OP_rr v22.4s, v2.4s, v8.s[1]
|
||||
OP_ii v22.4s, v3.4s, v9.s[1]
|
||||
OP_ri v23.4s, v2.4s, v9.s[1]
|
||||
OP_ir v23.4s, v3.4s, v8.s[1]
|
||||
|
||||
OP_rr v24.4s, v0.4s, v8.4s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.4s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.4s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.4s[2]
|
||||
OP_rr v24.4s, v0.4s, v8.s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.s[2]
|
||||
|
||||
OP_rr v26.4s, v2.4s, v8.4s[2]
|
||||
OP_ii v26.4s, v3.4s, v9.4s[2]
|
||||
OP_ri v27.4s, v2.4s, v9.4s[2]
|
||||
OP_ir v27.4s, v3.4s, v8.4s[2]
|
||||
OP_rr v26.4s, v2.4s, v8.s[2]
|
||||
OP_ii v26.4s, v3.4s, v9.s[2]
|
||||
OP_ri v27.4s, v2.4s, v9.s[2]
|
||||
OP_ir v27.4s, v3.4s, v8.s[2]
|
||||
|
||||
OP_rr v28.4s, v0.4s, v8.4s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.4s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.4s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.4s[3]
|
||||
OP_rr v28.4s, v0.4s, v8.s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.s[3]
|
||||
|
||||
OP_rr v30.4s, v2.4s, v8.4s[3]
|
||||
OP_ii v30.4s, v3.4s, v9.4s[3]
|
||||
OP_ri v31.4s, v2.4s, v9.4s[3]
|
||||
OP_ir v31.4s, v3.4s, v8.4s[3]
|
||||
OP_rr v30.4s, v2.4s, v8.s[3]
|
||||
OP_ii v30.4s, v3.4s, v9.s[3]
|
||||
OP_ri v31.4s, v2.4s, v9.s[3]
|
||||
OP_ir v31.4s, v3.4s, v8.s[3]
|
||||
|
||||
ld2 {v12.4s, v13.4s}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
|
|
@ -326,45 +326,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL8x4_M2
|
||||
OP_rr v16.4s, v4.4s, v12.4s[0]
|
||||
OP_ii v16.4s, v5.4s, v13.4s[0]
|
||||
OP_ri v17.4s, v4.4s, v13.4s[0]
|
||||
OP_ir v17.4s, v5.4s, v12.4s[0]
|
||||
OP_rr v16.4s, v4.4s, v12.s[0]
|
||||
OP_ii v16.4s, v5.4s, v13.s[0]
|
||||
OP_ri v17.4s, v4.4s, v13.s[0]
|
||||
OP_ir v17.4s, v5.4s, v12.s[0]
|
||||
|
||||
OP_rr v18.4s, v6.4s, v12.4s[0]
|
||||
OP_ii v18.4s, v7.4s, v13.4s[0]
|
||||
OP_ri v19.4s, v6.4s, v13.4s[0]
|
||||
OP_ir v19.4s, v7.4s, v12.4s[0]
|
||||
OP_rr v18.4s, v6.4s, v12.s[0]
|
||||
OP_ii v18.4s, v7.4s, v13.s[0]
|
||||
OP_ri v19.4s, v6.4s, v13.s[0]
|
||||
OP_ir v19.4s, v7.4s, v12.s[0]
|
||||
|
||||
OP_rr v20.4s, v4.4s, v12.4s[1]
|
||||
OP_ii v20.4s, v5.4s, v13.4s[1]
|
||||
OP_ri v21.4s, v4.4s, v13.4s[1]
|
||||
OP_ir v21.4s, v5.4s, v12.4s[1]
|
||||
OP_rr v20.4s, v4.4s, v12.s[1]
|
||||
OP_ii v20.4s, v5.4s, v13.s[1]
|
||||
OP_ri v21.4s, v4.4s, v13.s[1]
|
||||
OP_ir v21.4s, v5.4s, v12.s[1]
|
||||
|
||||
OP_rr v22.4s, v6.4s, v12.4s[1]
|
||||
OP_ii v22.4s, v7.4s, v13.4s[1]
|
||||
OP_ri v23.4s, v6.4s, v13.4s[1]
|
||||
OP_ir v23.4s, v7.4s, v12.4s[1]
|
||||
OP_rr v22.4s, v6.4s, v12.s[1]
|
||||
OP_ii v22.4s, v7.4s, v13.s[1]
|
||||
OP_ri v23.4s, v6.4s, v13.s[1]
|
||||
OP_ir v23.4s, v7.4s, v12.s[1]
|
||||
|
||||
OP_rr v24.4s, v4.4s, v12.4s[2]
|
||||
OP_ii v24.4s, v5.4s, v13.4s[2]
|
||||
OP_ri v25.4s, v4.4s, v13.4s[2]
|
||||
OP_ir v25.4s, v5.4s, v12.4s[2]
|
||||
OP_rr v24.4s, v4.4s, v12.s[2]
|
||||
OP_ii v24.4s, v5.4s, v13.s[2]
|
||||
OP_ri v25.4s, v4.4s, v13.s[2]
|
||||
OP_ir v25.4s, v5.4s, v12.s[2]
|
||||
|
||||
OP_rr v26.4s, v6.4s, v12.4s[2]
|
||||
OP_ii v26.4s, v7.4s, v13.4s[2]
|
||||
OP_ri v27.4s, v6.4s, v13.4s[2]
|
||||
OP_ir v27.4s, v7.4s, v12.4s[2]
|
||||
OP_rr v26.4s, v6.4s, v12.s[2]
|
||||
OP_ii v26.4s, v7.4s, v13.s[2]
|
||||
OP_ri v27.4s, v6.4s, v13.s[2]
|
||||
OP_ir v27.4s, v7.4s, v12.s[2]
|
||||
|
||||
OP_rr v28.4s, v4.4s, v12.4s[3]
|
||||
OP_ii v28.4s, v5.4s, v13.4s[3]
|
||||
OP_ri v29.4s, v4.4s, v13.4s[3]
|
||||
OP_ir v29.4s, v5.4s, v12.4s[3]
|
||||
OP_rr v28.4s, v4.4s, v12.s[3]
|
||||
OP_ii v28.4s, v5.4s, v13.s[3]
|
||||
OP_ri v29.4s, v4.4s, v13.s[3]
|
||||
OP_ir v29.4s, v5.4s, v12.s[3]
|
||||
|
||||
OP_rr v30.4s, v6.4s, v12.4s[3]
|
||||
OP_ii v30.4s, v7.4s, v13.4s[3]
|
||||
OP_ri v31.4s, v6.4s, v13.4s[3]
|
||||
OP_ir v31.4s, v7.4s, v12.4s[3]
|
||||
OP_rr v30.4s, v6.4s, v12.s[3]
|
||||
OP_ii v30.4s, v7.4s, v13.s[3]
|
||||
OP_ri v31.4s, v6.4s, v13.s[3]
|
||||
OP_ir v31.4s, v7.4s, v12.s[3]
|
||||
|
||||
ld2 {v8.4s, v9.4s}, [pB]
|
||||
add pB, pB, #32
|
||||
|
|
@ -375,45 +375,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL8x4_E
|
||||
OP_rr v16.4s, v4.4s, v12.4s[0]
|
||||
OP_ii v16.4s, v5.4s, v13.4s[0]
|
||||
OP_ri v17.4s, v4.4s, v13.4s[0]
|
||||
OP_ir v17.4s, v5.4s, v12.4s[0]
|
||||
OP_rr v16.4s, v4.4s, v12.s[0]
|
||||
OP_ii v16.4s, v5.4s, v13.s[0]
|
||||
OP_ri v17.4s, v4.4s, v13.s[0]
|
||||
OP_ir v17.4s, v5.4s, v12.s[0]
|
||||
|
||||
OP_rr v18.4s, v6.4s, v12.4s[0]
|
||||
OP_ii v18.4s, v7.4s, v13.4s[0]
|
||||
OP_ri v19.4s, v6.4s, v13.4s[0]
|
||||
OP_ir v19.4s, v7.4s, v12.4s[0]
|
||||
OP_rr v18.4s, v6.4s, v12.s[0]
|
||||
OP_ii v18.4s, v7.4s, v13.s[0]
|
||||
OP_ri v19.4s, v6.4s, v13.s[0]
|
||||
OP_ir v19.4s, v7.4s, v12.s[0]
|
||||
|
||||
OP_rr v20.4s, v4.4s, v12.4s[1]
|
||||
OP_ii v20.4s, v5.4s, v13.4s[1]
|
||||
OP_ri v21.4s, v4.4s, v13.4s[1]
|
||||
OP_ir v21.4s, v5.4s, v12.4s[1]
|
||||
OP_rr v20.4s, v4.4s, v12.s[1]
|
||||
OP_ii v20.4s, v5.4s, v13.s[1]
|
||||
OP_ri v21.4s, v4.4s, v13.s[1]
|
||||
OP_ir v21.4s, v5.4s, v12.s[1]
|
||||
|
||||
OP_rr v22.4s, v6.4s, v12.4s[1]
|
||||
OP_ii v22.4s, v7.4s, v13.4s[1]
|
||||
OP_ri v23.4s, v6.4s, v13.4s[1]
|
||||
OP_ir v23.4s, v7.4s, v12.4s[1]
|
||||
OP_rr v22.4s, v6.4s, v12.s[1]
|
||||
OP_ii v22.4s, v7.4s, v13.s[1]
|
||||
OP_ri v23.4s, v6.4s, v13.s[1]
|
||||
OP_ir v23.4s, v7.4s, v12.s[1]
|
||||
|
||||
OP_rr v24.4s, v4.4s, v12.4s[2]
|
||||
OP_ii v24.4s, v5.4s, v13.4s[2]
|
||||
OP_ri v25.4s, v4.4s, v13.4s[2]
|
||||
OP_ir v25.4s, v5.4s, v12.4s[2]
|
||||
OP_rr v24.4s, v4.4s, v12.s[2]
|
||||
OP_ii v24.4s, v5.4s, v13.s[2]
|
||||
OP_ri v25.4s, v4.4s, v13.s[2]
|
||||
OP_ir v25.4s, v5.4s, v12.s[2]
|
||||
|
||||
OP_rr v26.4s, v6.4s, v12.4s[2]
|
||||
OP_ii v26.4s, v7.4s, v13.4s[2]
|
||||
OP_ri v27.4s, v6.4s, v13.4s[2]
|
||||
OP_ir v27.4s, v7.4s, v12.4s[2]
|
||||
OP_rr v26.4s, v6.4s, v12.s[2]
|
||||
OP_ii v26.4s, v7.4s, v13.s[2]
|
||||
OP_ri v27.4s, v6.4s, v13.s[2]
|
||||
OP_ir v27.4s, v7.4s, v12.s[2]
|
||||
|
||||
OP_rr v28.4s, v4.4s, v12.4s[3]
|
||||
OP_ii v28.4s, v5.4s, v13.4s[3]
|
||||
OP_ri v29.4s, v4.4s, v13.4s[3]
|
||||
OP_ir v29.4s, v5.4s, v12.4s[3]
|
||||
OP_rr v28.4s, v4.4s, v12.s[3]
|
||||
OP_ii v28.4s, v5.4s, v13.s[3]
|
||||
OP_ri v29.4s, v4.4s, v13.s[3]
|
||||
OP_ir v29.4s, v5.4s, v12.s[3]
|
||||
|
||||
OP_rr v30.4s, v6.4s, v12.4s[3]
|
||||
OP_ii v30.4s, v7.4s, v13.4s[3]
|
||||
OP_ri v31.4s, v6.4s, v13.4s[3]
|
||||
OP_ir v31.4s, v7.4s, v12.4s[3]
|
||||
OP_rr v30.4s, v6.4s, v12.s[3]
|
||||
OP_ii v30.4s, v7.4s, v13.s[3]
|
||||
OP_ri v31.4s, v6.4s, v13.s[3]
|
||||
OP_ir v31.4s, v7.4s, v12.s[3]
|
||||
|
||||
.endm
|
||||
|
||||
|
|
@ -425,45 +425,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v2.4s, v3.4s}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.4s, v0.4s, v8.4s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.4s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.4s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.4s[0]
|
||||
OP_rr v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
OP_rr v18.4s, v2.4s, v8.4s[0]
|
||||
OP_ii v18.4s, v3.4s, v9.4s[0]
|
||||
OP_ri v19.4s, v2.4s, v9.4s[0]
|
||||
OP_ir v19.4s, v3.4s, v8.4s[0]
|
||||
OP_rr v18.4s, v2.4s, v8.s[0]
|
||||
OP_ii v18.4s, v3.4s, v9.s[0]
|
||||
OP_ri v19.4s, v2.4s, v9.s[0]
|
||||
OP_ir v19.4s, v3.4s, v8.s[0]
|
||||
|
||||
OP_rr v20.4s, v0.4s, v8.4s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.4s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.4s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.4s[1]
|
||||
OP_rr v20.4s, v0.4s, v8.s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.s[1]
|
||||
|
||||
OP_rr v22.4s, v2.4s, v8.4s[1]
|
||||
OP_ii v22.4s, v3.4s, v9.4s[1]
|
||||
OP_ri v23.4s, v2.4s, v9.4s[1]
|
||||
OP_ir v23.4s, v3.4s, v8.4s[1]
|
||||
OP_rr v22.4s, v2.4s, v8.s[1]
|
||||
OP_ii v22.4s, v3.4s, v9.s[1]
|
||||
OP_ri v23.4s, v2.4s, v9.s[1]
|
||||
OP_ir v23.4s, v3.4s, v8.s[1]
|
||||
|
||||
OP_rr v24.4s, v0.4s, v8.4s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.4s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.4s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.4s[2]
|
||||
OP_rr v24.4s, v0.4s, v8.s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.s[2]
|
||||
|
||||
OP_rr v26.4s, v2.4s, v8.4s[2]
|
||||
OP_ii v26.4s, v3.4s, v9.4s[2]
|
||||
OP_ri v27.4s, v2.4s, v9.4s[2]
|
||||
OP_ir v27.4s, v3.4s, v8.4s[2]
|
||||
OP_rr v26.4s, v2.4s, v8.s[2]
|
||||
OP_ii v26.4s, v3.4s, v9.s[2]
|
||||
OP_ri v27.4s, v2.4s, v9.s[2]
|
||||
OP_ir v27.4s, v3.4s, v8.s[2]
|
||||
|
||||
OP_rr v28.4s, v0.4s, v8.4s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.4s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.4s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.4s[3]
|
||||
OP_rr v28.4s, v0.4s, v8.s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.s[3]
|
||||
|
||||
OP_rr v30.4s, v2.4s, v8.4s[3]
|
||||
OP_ii v30.4s, v3.4s, v9.4s[3]
|
||||
OP_ri v31.4s, v2.4s, v9.4s[3]
|
||||
OP_ir v31.4s, v3.4s, v8.4s[3]
|
||||
OP_rr v30.4s, v2.4s, v8.s[3]
|
||||
OP_ii v30.4s, v3.4s, v9.s[3]
|
||||
OP_ri v31.4s, v2.4s, v9.s[3]
|
||||
OP_ir v31.4s, v3.4s, v8.s[3]
|
||||
|
||||
.endm
|
||||
|
||||
|
|
@ -562,49 +562,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.4s, v1.4s}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v16.4s, v0.4s, v8.4s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.4s[0]
|
||||
fmul v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.s[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v17.16b, v17.16b, v17.16b
|
||||
fmls v17.4s, v0.4s, v9.4s[0]
|
||||
fmls v17.4s, v0.4s, v9.s[0]
|
||||
#else
|
||||
fmul v17.4s, v0.4s, v9.4s[0]
|
||||
fmul v17.4s, v0.4s, v9.s[0]
|
||||
#endif
|
||||
OP_ir v17.4s, v1.4s, v8.4s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
fmul v20.4s, v0.4s, v8.4s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.4s[1]
|
||||
fmul v20.4s, v0.4s, v8.s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.s[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v21.16b, v21.16b, v21.16b
|
||||
fmls v21.4s, v0.4s, v9.4s[1]
|
||||
fmls v21.4s, v0.4s, v9.s[1]
|
||||
#else
|
||||
fmul v21.4s, v0.4s, v9.4s[1]
|
||||
fmul v21.4s, v0.4s, v9.s[1]
|
||||
#endif
|
||||
OP_ir v21.4s, v1.4s, v8.4s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.s[1]
|
||||
|
||||
fmul v24.4s, v0.4s, v8.4s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.4s[2]
|
||||
fmul v24.4s, v0.4s, v8.s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.s[2]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fmls v25.4s, v0.4s, v9.4s[2]
|
||||
fmls v25.4s, v0.4s, v9.s[2]
|
||||
#else
|
||||
fmul v25.4s, v0.4s, v9.4s[2]
|
||||
fmul v25.4s, v0.4s, v9.s[2]
|
||||
#endif
|
||||
OP_ir v25.4s, v1.4s, v8.4s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.s[2]
|
||||
|
||||
fmul v28.4s, v0.4s, v8.4s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.4s[3]
|
||||
fmul v28.4s, v0.4s, v8.s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.s[3]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v29.16b, v29.16b, v29.16b
|
||||
fmls v29.4s, v0.4s, v9.4s[3]
|
||||
fmls v29.4s, v0.4s, v9.s[3]
|
||||
#else
|
||||
fmul v29.4s, v0.4s, v9.4s[3]
|
||||
fmul v29.4s, v0.4s, v9.s[3]
|
||||
#endif
|
||||
OP_ir v29.4s, v1.4s, v8.4s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.s[3]
|
||||
|
||||
ld2 {v12.4s, v13.4s}, [pB]
|
||||
add pB, pB, #32
|
||||
|
|
@ -613,85 +613,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL4x4_M1
|
||||
OP_rr v16.4s, v0.4s, v8.4s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.4s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.4s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.4s[0]
|
||||
OP_rr v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
ld2 {v12.4s, v13.4s}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v20.4s, v0.4s, v8.4s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.4s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.4s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.4s[1]
|
||||
OP_rr v20.4s, v0.4s, v8.s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.s[1]
|
||||
|
||||
ld2 {v4.4s, v5.4s}, [pA] // For next round
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v24.4s, v0.4s, v8.4s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.4s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.4s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.4s[2]
|
||||
OP_rr v24.4s, v0.4s, v8.s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.s[2]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
|
||||
OP_rr v28.4s, v0.4s, v8.4s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.4s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.4s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.4s[3]
|
||||
OP_rr v28.4s, v0.4s, v8.s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.s[3]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_M2
|
||||
OP_rr v16.4s, v4.4s, v12.4s[0]
|
||||
OP_ii v16.4s, v5.4s, v13.4s[0]
|
||||
OP_ri v17.4s, v4.4s, v13.4s[0]
|
||||
OP_ir v17.4s, v5.4s, v12.4s[0]
|
||||
OP_rr v16.4s, v4.4s, v12.s[0]
|
||||
OP_ii v16.4s, v5.4s, v13.s[0]
|
||||
OP_ri v17.4s, v4.4s, v13.s[0]
|
||||
OP_ir v17.4s, v5.4s, v12.s[0]
|
||||
|
||||
ld2 {v8.4s, v9.4s}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v20.4s, v4.4s, v12.4s[1]
|
||||
OP_ii v20.4s, v5.4s, v13.4s[1]
|
||||
OP_ri v21.4s, v4.4s, v13.4s[1]
|
||||
OP_ir v21.4s, v5.4s, v12.4s[1]
|
||||
OP_rr v20.4s, v4.4s, v12.s[1]
|
||||
OP_ii v20.4s, v5.4s, v13.s[1]
|
||||
OP_ri v21.4s, v4.4s, v13.s[1]
|
||||
OP_ir v21.4s, v5.4s, v12.s[1]
|
||||
|
||||
ld2 {v0.4s, v1.4s}, [pA] // For next round
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v24.4s, v4.4s, v12.4s[2]
|
||||
OP_ii v24.4s, v5.4s, v13.4s[2]
|
||||
OP_ri v25.4s, v4.4s, v13.4s[2]
|
||||
OP_ir v25.4s, v5.4s, v12.4s[2]
|
||||
OP_rr v24.4s, v4.4s, v12.s[2]
|
||||
OP_ii v24.4s, v5.4s, v13.s[2]
|
||||
OP_ri v25.4s, v4.4s, v13.s[2]
|
||||
OP_ir v25.4s, v5.4s, v12.s[2]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
|
||||
OP_rr v28.4s, v4.4s, v12.4s[3]
|
||||
OP_ii v28.4s, v5.4s, v13.4s[3]
|
||||
OP_ri v29.4s, v4.4s, v13.4s[3]
|
||||
OP_ir v29.4s, v5.4s, v12.4s[3]
|
||||
OP_rr v28.4s, v4.4s, v12.s[3]
|
||||
OP_ii v28.4s, v5.4s, v13.s[3]
|
||||
OP_ri v29.4s, v4.4s, v13.s[3]
|
||||
OP_ir v29.4s, v5.4s, v12.s[3]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_E
|
||||
OP_rr v16.4s, v4.4s, v12.4s[0]
|
||||
OP_ii v16.4s, v5.4s, v13.4s[0]
|
||||
OP_ri v17.4s, v4.4s, v13.4s[0]
|
||||
OP_ir v17.4s, v5.4s, v12.4s[0]
|
||||
OP_rr v16.4s, v4.4s, v12.s[0]
|
||||
OP_ii v16.4s, v5.4s, v13.s[0]
|
||||
OP_ri v17.4s, v4.4s, v13.s[0]
|
||||
OP_ir v17.4s, v5.4s, v12.s[0]
|
||||
|
||||
OP_rr v20.4s, v4.4s, v12.4s[1]
|
||||
OP_ii v20.4s, v5.4s, v13.4s[1]
|
||||
OP_ri v21.4s, v4.4s, v13.4s[1]
|
||||
OP_ir v21.4s, v5.4s, v12.4s[1]
|
||||
OP_rr v20.4s, v4.4s, v12.s[1]
|
||||
OP_ii v20.4s, v5.4s, v13.s[1]
|
||||
OP_ri v21.4s, v4.4s, v13.s[1]
|
||||
OP_ir v21.4s, v5.4s, v12.s[1]
|
||||
|
||||
OP_rr v24.4s, v4.4s, v12.4s[2]
|
||||
OP_ii v24.4s, v5.4s, v13.4s[2]
|
||||
OP_ri v25.4s, v4.4s, v13.4s[2]
|
||||
OP_ir v25.4s, v5.4s, v12.4s[2]
|
||||
OP_rr v24.4s, v4.4s, v12.s[2]
|
||||
OP_ii v24.4s, v5.4s, v13.s[2]
|
||||
OP_ri v25.4s, v4.4s, v13.s[2]
|
||||
OP_ir v25.4s, v5.4s, v12.s[2]
|
||||
|
||||
OP_rr v28.4s, v4.4s, v12.4s[3]
|
||||
OP_ii v28.4s, v5.4s, v13.4s[3]
|
||||
OP_ri v29.4s, v4.4s, v13.4s[3]
|
||||
OP_ir v29.4s, v5.4s, v12.4s[3]
|
||||
OP_rr v28.4s, v4.4s, v12.s[3]
|
||||
OP_ii v28.4s, v5.4s, v13.s[3]
|
||||
OP_ri v29.4s, v4.4s, v13.s[3]
|
||||
OP_ir v29.4s, v5.4s, v12.s[3]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_SUB
|
||||
|
|
@ -700,25 +700,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.4s, v1.4s}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.4s, v0.4s, v8.4s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.4s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.4s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.4s[0]
|
||||
OP_rr v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
OP_rr v20.4s, v0.4s, v8.4s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.4s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.4s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.4s[1]
|
||||
OP_rr v20.4s, v0.4s, v8.s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.s[1]
|
||||
|
||||
OP_rr v24.4s, v0.4s, v8.4s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.4s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.4s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.4s[2]
|
||||
OP_rr v24.4s, v0.4s, v8.s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.s[2]
|
||||
OP_ri v25.4s, v0.4s, v9.s[2]
|
||||
OP_ir v25.4s, v1.4s, v8.s[2]
|
||||
|
||||
OP_rr v28.4s, v0.4s, v8.4s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.4s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.4s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.4s[3]
|
||||
OP_rr v28.4s, v0.4s, v8.s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.s[3]
|
||||
OP_ri v29.4s, v0.4s, v9.s[3]
|
||||
OP_ir v29.4s, v1.4s, v8.s[3]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
|
|
@ -780,25 +780,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.2s, v1.2s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
OP_rr v16.2s, v0.2s, v8.4s[0]
|
||||
OP_ii v16.2s, v1.2s, v9.4s[0]
|
||||
OP_ri v17.2s, v0.2s, v9.4s[0]
|
||||
OP_ir v17.2s, v1.2s, v8.4s[0]
|
||||
OP_rr v16.2s, v0.2s, v8.s[0]
|
||||
OP_ii v16.2s, v1.2s, v9.s[0]
|
||||
OP_ri v17.2s, v0.2s, v9.s[0]
|
||||
OP_ir v17.2s, v1.2s, v8.s[0]
|
||||
|
||||
OP_rr v20.2s, v0.2s, v8.4s[1]
|
||||
OP_ii v20.2s, v1.2s, v9.4s[1]
|
||||
OP_ri v21.2s, v0.2s, v9.4s[1]
|
||||
OP_ir v21.2s, v1.2s, v8.4s[1]
|
||||
OP_rr v20.2s, v0.2s, v8.s[1]
|
||||
OP_ii v20.2s, v1.2s, v9.s[1]
|
||||
OP_ri v21.2s, v0.2s, v9.s[1]
|
||||
OP_ir v21.2s, v1.2s, v8.s[1]
|
||||
|
||||
OP_rr v24.2s, v0.2s, v8.4s[2]
|
||||
OP_ii v24.2s, v1.2s, v9.4s[2]
|
||||
OP_ri v25.2s, v0.2s, v9.4s[2]
|
||||
OP_ir v25.2s, v1.2s, v8.4s[2]
|
||||
OP_rr v24.2s, v0.2s, v8.s[2]
|
||||
OP_ii v24.2s, v1.2s, v9.s[2]
|
||||
OP_ri v25.2s, v0.2s, v9.s[2]
|
||||
OP_ir v25.2s, v1.2s, v8.s[2]
|
||||
|
||||
OP_rr v28.2s, v0.2s, v8.4s[3]
|
||||
OP_ii v28.2s, v1.2s, v9.4s[3]
|
||||
OP_ri v29.2s, v0.2s, v9.4s[3]
|
||||
OP_ir v29.2s, v1.2s, v8.4s[3]
|
||||
OP_rr v28.2s, v0.2s, v8.s[3]
|
||||
OP_ii v28.2s, v1.2s, v9.s[3]
|
||||
OP_ri v29.2s, v0.2s, v9.s[3]
|
||||
OP_ir v29.2s, v1.2s, v8.s[3]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
|
|
@ -860,25 +860,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.s, v1.s}[0], [pA]
|
||||
add pA, pA, #8
|
||||
|
||||
OP_rr s16, s0, v8.4s[0]
|
||||
OP_ii s16, s1, v9.4s[0]
|
||||
OP_ri s17, s0, v9.4s[0]
|
||||
OP_ir s17, s1, v8.4s[0]
|
||||
OP_rr s16, s0, v8.s[0]
|
||||
OP_ii s16, s1, v9.s[0]
|
||||
OP_ri s17, s0, v9.s[0]
|
||||
OP_ir s17, s1, v8.s[0]
|
||||
|
||||
OP_rr s20, s0, v8.4s[1]
|
||||
OP_ii s20, s1, v9.4s[1]
|
||||
OP_ri s21, s0, v9.4s[1]
|
||||
OP_ir s21, s1, v8.4s[1]
|
||||
OP_rr s20, s0, v8.s[1]
|
||||
OP_ii s20, s1, v9.s[1]
|
||||
OP_ri s21, s0, v9.s[1]
|
||||
OP_ir s21, s1, v8.s[1]
|
||||
|
||||
OP_rr s24, s0, v8.4s[2]
|
||||
OP_ii s24, s1, v9.4s[2]
|
||||
OP_ri s25, s0, v9.4s[2]
|
||||
OP_ir s25, s1, v8.4s[2]
|
||||
OP_rr s24, s0, v8.s[2]
|
||||
OP_ii s24, s1, v9.s[2]
|
||||
OP_ri s25, s0, v9.s[2]
|
||||
OP_ir s25, s1, v8.s[2]
|
||||
|
||||
OP_rr s28, s0, v8.4s[3]
|
||||
OP_ii s28, s1, v9.4s[3]
|
||||
OP_ri s29, s0, v9.4s[3]
|
||||
OP_ir s29, s1, v8.4s[3]
|
||||
OP_rr s28, s0, v8.s[3]
|
||||
OP_ii s28, s1, v9.s[3]
|
||||
OP_ri s29, s0, v9.s[3]
|
||||
OP_ir s29, s1, v8.s[3]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x4
|
||||
|
|
@ -942,25 +942,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v2.4s, v3.4s}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.4s, v0.4s, v8.2s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.2s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.2s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.2s[0]
|
||||
OP_rr v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
OP_rr v18.4s, v2.4s, v8.2s[0]
|
||||
OP_ii v18.4s, v3.4s, v9.2s[0]
|
||||
OP_ri v19.4s, v2.4s, v9.2s[0]
|
||||
OP_ir v19.4s, v3.4s, v8.2s[0]
|
||||
OP_rr v18.4s, v2.4s, v8.s[0]
|
||||
OP_ii v18.4s, v3.4s, v9.s[0]
|
||||
OP_ri v19.4s, v2.4s, v9.s[0]
|
||||
OP_ir v19.4s, v3.4s, v8.s[0]
|
||||
|
||||
OP_rr v20.4s, v0.4s, v8.2s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.2s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.2s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.2s[1]
|
||||
OP_rr v20.4s, v0.4s, v8.s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.s[1]
|
||||
|
||||
OP_rr v22.4s, v2.4s, v8.2s[1]
|
||||
OP_ii v22.4s, v3.4s, v9.2s[1]
|
||||
OP_ri v23.4s, v2.4s, v9.2s[1]
|
||||
OP_ir v23.4s, v3.4s, v8.2s[1]
|
||||
OP_rr v22.4s, v2.4s, v8.s[1]
|
||||
OP_ii v22.4s, v3.4s, v9.s[1]
|
||||
OP_ri v23.4s, v2.4s, v9.s[1]
|
||||
OP_ir v23.4s, v3.4s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x2
|
||||
|
|
@ -1018,15 +1018,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.4s, v1.4s}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.4s, v0.4s, v8.2s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.2s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.2s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.2s[0]
|
||||
OP_rr v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.s[0]
|
||||
OP_ri v17.4s, v0.4s, v9.s[0]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
OP_rr v20.4s, v0.4s, v8.2s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.2s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.2s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.2s[1]
|
||||
OP_rr v20.4s, v0.4s, v8.s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.s[1]
|
||||
OP_ri v21.4s, v0.4s, v9.s[1]
|
||||
OP_ir v21.4s, v1.4s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
|
|
@ -1066,15 +1066,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.2s, v1.2s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
OP_rr v16.2s, v0.2s, v8.2s[0]
|
||||
OP_ii v16.2s, v1.2s, v9.2s[0]
|
||||
OP_ri v17.2s, v0.2s, v9.2s[0]
|
||||
OP_ir v17.2s, v1.2s, v8.2s[0]
|
||||
OP_rr v16.2s, v0.2s, v8.s[0]
|
||||
OP_ii v16.2s, v1.2s, v9.s[0]
|
||||
OP_ri v17.2s, v0.2s, v9.s[0]
|
||||
OP_ir v17.2s, v1.2s, v8.s[0]
|
||||
|
||||
OP_rr v20.2s, v0.2s, v8.2s[1]
|
||||
OP_ii v20.2s, v1.2s, v9.2s[1]
|
||||
OP_ri v21.2s, v0.2s, v9.2s[1]
|
||||
OP_ir v21.2s, v1.2s, v8.2s[1]
|
||||
OP_rr v20.2s, v0.2s, v8.s[1]
|
||||
OP_ii v20.2s, v1.2s, v9.s[1]
|
||||
OP_ri v21.2s, v0.2s, v9.s[1]
|
||||
OP_ir v21.2s, v1.2s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
|
|
@ -1114,15 +1114,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.s, v1.s}[0], [pA]
|
||||
add pA, pA, #8
|
||||
|
||||
OP_rr s16, s0, v8.2s[0]
|
||||
OP_ii s16, s1, v9.2s[0]
|
||||
OP_ri s17, s0, v9.2s[0]
|
||||
OP_ir s17, s1, v8.2s[0]
|
||||
OP_rr s16, s0, v8.s[0]
|
||||
OP_ii s16, s1, v9.s[0]
|
||||
OP_ri s17, s0, v9.s[0]
|
||||
OP_ir s17, s1, v8.s[0]
|
||||
|
||||
OP_rr s20, s0, v8.2s[1]
|
||||
OP_ii s20, s1, v9.2s[1]
|
||||
OP_ri s21, s0, v9.2s[1]
|
||||
OP_ir s21, s1, v8.2s[1]
|
||||
OP_rr s20, s0, v8.s[1]
|
||||
OP_ii s20, s1, v9.s[1]
|
||||
OP_ri s21, s0, v9.s[1]
|
||||
OP_ir s21, s1, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
|
|
@ -1164,15 +1164,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v2.4s, v3.4s}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.4s, v0.4s, v8.4s[0]
|
||||
OP_ii v16.4s, v1.4s, v8.4s[1]
|
||||
OP_ri v17.4s, v0.4s, v8.4s[1]
|
||||
OP_ir v17.4s, v1.4s, v8.4s[0]
|
||||
OP_rr v16.4s, v0.4s, v8.s[0]
|
||||
OP_ii v16.4s, v1.4s, v8.s[1]
|
||||
OP_ri v17.4s, v0.4s, v8.s[1]
|
||||
OP_ir v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
OP_rr v18.4s, v2.4s, v8.4s[0]
|
||||
OP_ii v18.4s, v3.4s, v8.4s[1]
|
||||
OP_ri v19.4s, v2.4s, v8.4s[1]
|
||||
OP_ir v19.4s, v3.4s, v8.4s[0]
|
||||
OP_rr v18.4s, v2.4s, v8.s[0]
|
||||
OP_ii v18.4s, v3.4s, v8.s[1]
|
||||
OP_ri v19.4s, v2.4s, v8.s[1]
|
||||
OP_ir v19.4s, v3.4s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x1
|
||||
|
|
|
|||
|
|
@ -161,150 +161,150 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ldp q0, q1, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v16.2d, v0.2d, v8.2d[0]
|
||||
fmul v29.2d, v1.2d, v11.2d[0]
|
||||
fmul v16.2d, v0.2d, v8.d[0]
|
||||
fmul v29.2d, v1.2d, v11.d[0]
|
||||
|
||||
ldp q2, q3, [ppA]
|
||||
add ppA, ppA, #32
|
||||
|
||||
fmul v20.2d, v0.2d, v9.2d[0]
|
||||
fmul v25.2d, v1.2d, v10.2d[0]
|
||||
fmul v20.2d, v0.2d, v9.d[0]
|
||||
fmul v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
fmul v18.2d, v2.2d, v8.2d[0]
|
||||
fmul v31.2d, v3.2d, v11.2d[0]
|
||||
fmul v18.2d, v2.2d, v8.d[0]
|
||||
fmul v31.2d, v3.2d, v11.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [ppA, #A_PRE_SIZE]
|
||||
|
||||
fmul v22.2d, v2.2d, v9.2d[0]
|
||||
fmul v27.2d, v3.2d, v10.2d[0]
|
||||
fmul v22.2d, v2.2d, v9.d[0]
|
||||
fmul v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
ldp d12, d13, [pB]
|
||||
add pB, pB, #16
|
||||
|
||||
fmul v24.2d, v0.2d, v10.2d[0]
|
||||
fmul v21.2d, v1.2d, v9.2d[0]
|
||||
fmul v24.2d, v0.2d, v10.d[0]
|
||||
fmul v21.2d, v1.2d, v9.d[0]
|
||||
|
||||
ldp q4, q5, [pA] // for next round
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v26.2d, v2.2d, v10.2d[0]
|
||||
fmul v23.2d, v3.2d, v9.2d[0]
|
||||
fmul v26.2d, v2.2d, v10.d[0]
|
||||
fmul v23.2d, v3.2d, v9.d[0]
|
||||
|
||||
ldp q6, q7, [ppA] // for next round
|
||||
add ppA, ppA, #32
|
||||
|
||||
fmul v28.2d, v0.2d, v11.2d[0]
|
||||
fmul v17.2d, v1.2d, v8.2d[0]
|
||||
fmul v28.2d, v0.2d, v11.d[0]
|
||||
fmul v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
ldp d14, d15, [pB]
|
||||
add pB, pB, #16
|
||||
|
||||
fmul v30.2d, v2.2d, v11.2d[0]
|
||||
fmul v19.2d, v3.2d, v8.2d[0]
|
||||
fmul v30.2d, v2.2d, v11.d[0]
|
||||
fmul v19.2d, v3.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_M2
|
||||
fmla v16.2d, v4.2d, v12.2d[0]
|
||||
fmla v29.2d, v5.2d, v15.2d[0]
|
||||
fmla v16.2d, v4.2d, v12.d[0]
|
||||
fmla v29.2d, v5.2d, v15.d[0]
|
||||
|
||||
ldp d8, d9, [pB]
|
||||
add pB, pB, #16
|
||||
|
||||
fmla v18.2d, v6.2d, v12.2d[0]
|
||||
fmla v31.2d, v7.2d, v15.2d[0]
|
||||
fmla v18.2d, v6.2d, v12.d[0]
|
||||
fmla v31.2d, v7.2d, v15.d[0]
|
||||
|
||||
ldp d10, d11, [pB]
|
||||
add pB, pB, #16
|
||||
|
||||
fmla v20.2d, v4.2d, v13.2d[0]
|
||||
fmla v25.2d, v5.2d, v14.2d[0]
|
||||
fmla v20.2d, v4.2d, v13.d[0]
|
||||
fmla v25.2d, v5.2d, v14.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
fmla v22.2d, v6.2d, v13.2d[0]
|
||||
fmla v27.2d, v7.2d, v14.2d[0]
|
||||
fmla v24.2d, v4.2d, v14.2d[0]
|
||||
fmla v21.2d, v5.2d, v13.2d[0]
|
||||
fmla v22.2d, v6.2d, v13.d[0]
|
||||
fmla v27.2d, v7.2d, v14.d[0]
|
||||
fmla v24.2d, v4.2d, v14.d[0]
|
||||
fmla v21.2d, v5.2d, v13.d[0]
|
||||
|
||||
ldp q0, q1, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v26.2d, v6.2d, v14.2d[0]
|
||||
fmla v23.2d, v7.2d, v13.2d[0]
|
||||
fmla v28.2d, v4.2d, v15.2d[0]
|
||||
fmla v17.2d, v5.2d, v12.2d[0]
|
||||
fmla v26.2d, v6.2d, v14.d[0]
|
||||
fmla v23.2d, v7.2d, v13.d[0]
|
||||
fmla v28.2d, v4.2d, v15.d[0]
|
||||
fmla v17.2d, v5.2d, v12.d[0]
|
||||
|
||||
ldp q2, q3, [ppA]
|
||||
add ppA, ppA, #32
|
||||
|
||||
fmla v30.2d, v6.2d, v15.2d[0]
|
||||
fmla v19.2d, v7.2d, v12.2d[0]
|
||||
fmla v30.2d, v6.2d, v15.d[0]
|
||||
fmla v19.2d, v7.2d, v12.d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_M1
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v29.2d, v1.2d, v11.2d[0]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v29.2d, v1.2d, v11.d[0]
|
||||
|
||||
ldp d12, d13, [pB]
|
||||
add pB, pB, #16
|
||||
|
||||
fmla v18.2d, v2.2d, v8.2d[0]
|
||||
fmla v31.2d, v3.2d, v11.2d[0]
|
||||
fmla v18.2d, v2.2d, v8.d[0]
|
||||
fmla v31.2d, v3.2d, v11.d[0]
|
||||
|
||||
ldp d14, d15, [pB]
|
||||
add pB, pB, #16
|
||||
|
||||
fmla v20.2d, v0.2d, v9.2d[0]
|
||||
fmla v25.2d, v1.2d, v10.2d[0]
|
||||
fmla v20.2d, v0.2d, v9.d[0]
|
||||
fmla v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
fmla v22.2d, v2.2d, v9.2d[0]
|
||||
fmla v27.2d, v3.2d, v10.2d[0]
|
||||
fmla v22.2d, v2.2d, v9.d[0]
|
||||
fmla v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [ppA, #A_PRE_SIZE]
|
||||
|
||||
fmla v24.2d, v0.2d, v10.2d[0]
|
||||
fmla v21.2d, v1.2d, v9.2d[0]
|
||||
fmla v24.2d, v0.2d, v10.d[0]
|
||||
fmla v21.2d, v1.2d, v9.d[0]
|
||||
|
||||
ldp q4, q5, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v26.2d, v2.2d, v10.2d[0]
|
||||
fmla v23.2d, v3.2d, v9.2d[0]
|
||||
fmla v26.2d, v2.2d, v10.d[0]
|
||||
fmla v23.2d, v3.2d, v9.d[0]
|
||||
|
||||
fmla v28.2d, v0.2d, v11.2d[0]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v28.2d, v0.2d, v11.d[0]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
ldp q6, q7, [ppA]
|
||||
add ppA, ppA, #32
|
||||
|
||||
fmla v30.2d, v2.2d, v11.2d[0]
|
||||
fmla v19.2d, v3.2d, v8.2d[0]
|
||||
fmla v30.2d, v2.2d, v11.d[0]
|
||||
fmla v19.2d, v3.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_E
|
||||
fmla v16.2d, v4.2d, v12.2d[0]
|
||||
fmla v25.2d, v5.2d, v14.2d[0]
|
||||
fmla v18.2d, v6.2d, v12.2d[0]
|
||||
fmla v27.2d, v7.2d, v14.2d[0]
|
||||
fmla v16.2d, v4.2d, v12.d[0]
|
||||
fmla v25.2d, v5.2d, v14.d[0]
|
||||
fmla v18.2d, v6.2d, v12.d[0]
|
||||
fmla v27.2d, v7.2d, v14.d[0]
|
||||
|
||||
fmla v20.2d, v4.2d, v13.2d[0]
|
||||
fmla v29.2d, v5.2d, v15.2d[0]
|
||||
fmla v22.2d, v6.2d, v13.2d[0]
|
||||
fmla v31.2d, v7.2d, v15.2d[0]
|
||||
fmla v20.2d, v4.2d, v13.d[0]
|
||||
fmla v29.2d, v5.2d, v15.d[0]
|
||||
fmla v22.2d, v6.2d, v13.d[0]
|
||||
fmla v31.2d, v7.2d, v15.d[0]
|
||||
|
||||
fmla v24.2d, v4.2d, v14.2d[0]
|
||||
fmla v17.2d, v5.2d, v12.2d[0]
|
||||
fmla v26.2d, v6.2d, v14.2d[0]
|
||||
fmla v19.2d, v7.2d, v12.2d[0]
|
||||
fmla v24.2d, v4.2d, v14.d[0]
|
||||
fmla v17.2d, v5.2d, v12.d[0]
|
||||
fmla v26.2d, v6.2d, v14.d[0]
|
||||
fmla v19.2d, v7.2d, v12.d[0]
|
||||
|
||||
fmla v28.2d, v4.2d, v15.2d[0]
|
||||
fmla v21.2d, v5.2d, v13.2d[0]
|
||||
fmla v30.2d, v6.2d, v15.2d[0]
|
||||
fmla v23.2d, v7.2d, v13.2d[0]
|
||||
fmla v28.2d, v4.2d, v15.d[0]
|
||||
fmla v21.2d, v5.2d, v13.d[0]
|
||||
fmla v30.2d, v6.2d, v15.d[0]
|
||||
fmla v23.2d, v7.2d, v13.d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_SUB
|
||||
|
|
@ -315,28 +315,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ldp q0, q1, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v29.2d, v1.2d, v11.2d[0]
|
||||
fmla v20.2d, v0.2d, v9.2d[0]
|
||||
fmla v25.2d, v1.2d, v10.2d[0]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v29.2d, v1.2d, v11.d[0]
|
||||
fmla v20.2d, v0.2d, v9.d[0]
|
||||
fmla v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
ldp q2, q3, [ppA]
|
||||
add ppA, ppA, #32
|
||||
|
||||
fmla v24.2d, v0.2d, v10.2d[0]
|
||||
fmla v21.2d, v1.2d, v9.2d[0]
|
||||
fmla v28.2d, v0.2d, v11.2d[0]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v24.2d, v0.2d, v10.d[0]
|
||||
fmla v21.2d, v1.2d, v9.d[0]
|
||||
fmla v28.2d, v0.2d, v11.d[0]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
fmla v18.2d, v2.2d, v8.2d[0]
|
||||
fmla v31.2d, v3.2d, v11.2d[0]
|
||||
fmla v22.2d, v2.2d, v9.2d[0]
|
||||
fmla v27.2d, v3.2d, v10.2d[0]
|
||||
fmla v18.2d, v2.2d, v8.d[0]
|
||||
fmla v31.2d, v3.2d, v11.d[0]
|
||||
fmla v22.2d, v2.2d, v9.d[0]
|
||||
fmla v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
fmla v26.2d, v2.2d, v10.2d[0]
|
||||
fmla v23.2d, v3.2d, v9.2d[0]
|
||||
fmla v30.2d, v2.2d, v11.2d[0]
|
||||
fmla v19.2d, v3.2d, v8.2d[0]
|
||||
fmla v26.2d, v2.2d, v10.d[0]
|
||||
fmla v23.2d, v3.2d, v9.d[0]
|
||||
fmla v30.2d, v2.2d, v11.d[0]
|
||||
fmla v19.2d, v3.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x4
|
||||
|
|
@ -422,17 +422,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v29.2d, v1.2d, v9.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v29.2d, v1.2d, v9.d[1]
|
||||
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v25.2d, v1.2d, v9.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v25.2d, v1.2d, v9.d[0]
|
||||
|
||||
fmla v24.2d, v0.2d, v9.2d[0]
|
||||
fmla v21.2d, v1.2d, v8.2d[1]
|
||||
fmla v24.2d, v0.2d, v9.d[0]
|
||||
fmla v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
fmla v28.2d, v0.2d, v9.2d[1]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v28.2d, v0.2d, v9.d[1]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
|
|
@ -482,10 +482,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v24.2d, v0.2d, v9.2d[0]
|
||||
fmla v28.2d, v0.2d, v9.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v24.2d, v0.2d, v9.d[0]
|
||||
fmla v28.2d, v0.2d, v9.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
|
|
@ -572,10 +572,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v21.2d, v1.2d, v8.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v21.2d, v1.2d, v8.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
|
|
@ -610,8 +610,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
|
|
@ -643,7 +643,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ldr d0 , [pA]
|
||||
add pA, pA, #8
|
||||
|
||||
fmla v16.2d, v8.2d, v0.2d[0]
|
||||
fmla v16.2d, v8.2d, v0.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
|
|
@ -674,8 +674,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA , pA, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x1
|
||||
|
|
@ -705,7 +705,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d}, [pA]
|
||||
add pA , pA, #16
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x1
|
||||
|
|
|
|||
|
|
@ -154,25 +154,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v10.2d, v11.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
fmul v16.2d, v0.2d, v8.2d[0]
|
||||
fmul v17.2d, v1.2d, v8.2d[0]
|
||||
fmul v18.2d, v0.2d, v8.2d[1]
|
||||
fmul v19.2d, v1.2d, v8.2d[1]
|
||||
fmul v16.2d, v0.2d, v8.d[0]
|
||||
fmul v17.2d, v1.2d, v8.d[0]
|
||||
fmul v18.2d, v0.2d, v8.d[1]
|
||||
fmul v19.2d, v1.2d, v8.d[1]
|
||||
|
||||
fmul v20.2d, v0.2d, v9.2d[0]
|
||||
fmul v21.2d, v1.2d, v9.2d[0]
|
||||
fmul v22.2d, v0.2d, v9.2d[1]
|
||||
fmul v23.2d, v1.2d, v9.2d[1]
|
||||
fmul v20.2d, v0.2d, v9.d[0]
|
||||
fmul v21.2d, v1.2d, v9.d[0]
|
||||
fmul v22.2d, v0.2d, v9.d[1]
|
||||
fmul v23.2d, v1.2d, v9.d[1]
|
||||
|
||||
fmul v24.2d, v0.2d, v10.2d[0]
|
||||
fmul v25.2d, v1.2d, v10.2d[0]
|
||||
fmul v26.2d, v0.2d, v10.2d[1]
|
||||
fmul v27.2d, v1.2d, v10.2d[1]
|
||||
fmul v24.2d, v0.2d, v10.d[0]
|
||||
fmul v25.2d, v1.2d, v10.d[0]
|
||||
fmul v26.2d, v0.2d, v10.d[1]
|
||||
fmul v27.2d, v1.2d, v10.d[1]
|
||||
|
||||
fmul v28.2d, v0.2d, v11.2d[0]
|
||||
fmul v29.2d, v1.2d, v11.2d[0]
|
||||
fmul v30.2d, v0.2d, v11.2d[1]
|
||||
fmul v31.2d, v1.2d, v11.2d[1]
|
||||
fmul v28.2d, v0.2d, v11.d[0]
|
||||
fmul v29.2d, v1.2d, v11.d[0]
|
||||
fmul v30.2d, v0.2d, v11.d[1]
|
||||
fmul v31.2d, v1.2d, v11.d[1]
|
||||
|
||||
ld1 {v12.2d, v13.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
|
@ -183,25 +183,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL4x8_M1
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v18.2d, v0.2d, v8.2d[1]
|
||||
fmla v19.2d, v1.2d, v8.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
fmla v18.2d, v0.2d, v8.d[1]
|
||||
fmla v19.2d, v1.2d, v8.d[1]
|
||||
|
||||
fmla v20.2d, v0.2d, v9.2d[0]
|
||||
fmla v21.2d, v1.2d, v9.2d[0]
|
||||
fmla v22.2d, v0.2d, v9.2d[1]
|
||||
fmla v23.2d, v1.2d, v9.2d[1]
|
||||
fmla v20.2d, v0.2d, v9.d[0]
|
||||
fmla v21.2d, v1.2d, v9.d[0]
|
||||
fmla v22.2d, v0.2d, v9.d[1]
|
||||
fmla v23.2d, v1.2d, v9.d[1]
|
||||
|
||||
fmla v24.2d, v0.2d, v10.2d[0]
|
||||
fmla v25.2d, v1.2d, v10.2d[0]
|
||||
fmla v26.2d, v0.2d, v10.2d[1]
|
||||
fmla v27.2d, v1.2d, v10.2d[1]
|
||||
fmla v24.2d, v0.2d, v10.d[0]
|
||||
fmla v25.2d, v1.2d, v10.d[0]
|
||||
fmla v26.2d, v0.2d, v10.d[1]
|
||||
fmla v27.2d, v1.2d, v10.d[1]
|
||||
|
||||
fmla v28.2d, v0.2d, v11.2d[0]
|
||||
fmla v29.2d, v1.2d, v11.2d[0]
|
||||
fmla v30.2d, v0.2d, v11.2d[1]
|
||||
fmla v31.2d, v1.2d, v11.2d[1]
|
||||
fmla v28.2d, v0.2d, v11.d[0]
|
||||
fmla v29.2d, v1.2d, v11.d[0]
|
||||
fmla v30.2d, v0.2d, v11.d[1]
|
||||
fmla v31.2d, v1.2d, v11.d[1]
|
||||
|
||||
ld1 {v12.2d, v13.2d}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
|
|
@ -214,25 +214,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL4x8_M2
|
||||
fmla v16.2d, v4.2d, v12.2d[0]
|
||||
fmla v17.2d, v5.2d, v12.2d[0]
|
||||
fmla v18.2d, v4.2d, v12.2d[1]
|
||||
fmla v19.2d, v5.2d, v12.2d[1]
|
||||
fmla v16.2d, v4.2d, v12.d[0]
|
||||
fmla v17.2d, v5.2d, v12.d[0]
|
||||
fmla v18.2d, v4.2d, v12.d[1]
|
||||
fmla v19.2d, v5.2d, v12.d[1]
|
||||
|
||||
fmla v20.2d, v4.2d, v13.2d[0]
|
||||
fmla v21.2d, v5.2d, v13.2d[0]
|
||||
fmla v22.2d, v4.2d, v13.2d[1]
|
||||
fmla v23.2d, v5.2d, v13.2d[1]
|
||||
fmla v20.2d, v4.2d, v13.d[0]
|
||||
fmla v21.2d, v5.2d, v13.d[0]
|
||||
fmla v22.2d, v4.2d, v13.d[1]
|
||||
fmla v23.2d, v5.2d, v13.d[1]
|
||||
|
||||
fmla v24.2d, v4.2d, v14.2d[0]
|
||||
fmla v25.2d, v5.2d, v14.2d[0]
|
||||
fmla v26.2d, v4.2d, v14.2d[1]
|
||||
fmla v27.2d, v5.2d, v14.2d[1]
|
||||
fmla v24.2d, v4.2d, v14.d[0]
|
||||
fmla v25.2d, v5.2d, v14.d[0]
|
||||
fmla v26.2d, v4.2d, v14.d[1]
|
||||
fmla v27.2d, v5.2d, v14.d[1]
|
||||
|
||||
fmla v28.2d, v4.2d, v15.2d[0]
|
||||
fmla v29.2d, v5.2d, v15.2d[0]
|
||||
fmla v30.2d, v4.2d, v15.2d[1]
|
||||
fmla v31.2d, v5.2d, v15.2d[1]
|
||||
fmla v28.2d, v4.2d, v15.d[0]
|
||||
fmla v29.2d, v5.2d, v15.d[0]
|
||||
fmla v30.2d, v4.2d, v15.d[1]
|
||||
fmla v31.2d, v5.2d, v15.d[1]
|
||||
|
||||
ld1 {v8.2d, v9.2d}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
|
|
@ -245,25 +245,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL4x8_E
|
||||
fmla v16.2d, v4.2d, v12.2d[0]
|
||||
fmla v17.2d, v5.2d, v12.2d[0]
|
||||
fmla v18.2d, v4.2d, v12.2d[1]
|
||||
fmla v19.2d, v5.2d, v12.2d[1]
|
||||
fmla v16.2d, v4.2d, v12.d[0]
|
||||
fmla v17.2d, v5.2d, v12.d[0]
|
||||
fmla v18.2d, v4.2d, v12.d[1]
|
||||
fmla v19.2d, v5.2d, v12.d[1]
|
||||
|
||||
fmla v20.2d, v4.2d, v13.2d[0]
|
||||
fmla v21.2d, v5.2d, v13.2d[0]
|
||||
fmla v22.2d, v4.2d, v13.2d[1]
|
||||
fmla v23.2d, v5.2d, v13.2d[1]
|
||||
fmla v20.2d, v4.2d, v13.d[0]
|
||||
fmla v21.2d, v5.2d, v13.d[0]
|
||||
fmla v22.2d, v4.2d, v13.d[1]
|
||||
fmla v23.2d, v5.2d, v13.d[1]
|
||||
|
||||
fmla v24.2d, v4.2d, v14.2d[0]
|
||||
fmla v25.2d, v5.2d, v14.2d[0]
|
||||
fmla v26.2d, v4.2d, v14.2d[1]
|
||||
fmla v27.2d, v5.2d, v14.2d[1]
|
||||
fmla v24.2d, v4.2d, v14.d[0]
|
||||
fmla v25.2d, v5.2d, v14.d[0]
|
||||
fmla v26.2d, v4.2d, v14.d[1]
|
||||
fmla v27.2d, v5.2d, v14.d[1]
|
||||
|
||||
fmla v28.2d, v4.2d, v15.2d[0]
|
||||
fmla v29.2d, v5.2d, v15.2d[0]
|
||||
fmla v30.2d, v4.2d, v15.2d[1]
|
||||
fmla v31.2d, v5.2d, v15.2d[1]
|
||||
fmla v28.2d, v4.2d, v15.d[0]
|
||||
fmla v29.2d, v5.2d, v15.d[0]
|
||||
fmla v30.2d, v4.2d, v15.d[1]
|
||||
fmla v31.2d, v5.2d, v15.d[1]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x8_SUB
|
||||
|
|
@ -274,25 +274,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v10.2d, v11.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v18.2d, v0.2d, v8.2d[1]
|
||||
fmla v19.2d, v1.2d, v8.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
fmla v18.2d, v0.2d, v8.d[1]
|
||||
fmla v19.2d, v1.2d, v8.d[1]
|
||||
|
||||
fmla v20.2d, v0.2d, v9.2d[0]
|
||||
fmla v21.2d, v1.2d, v9.2d[0]
|
||||
fmla v22.2d, v0.2d, v9.2d[1]
|
||||
fmla v23.2d, v1.2d, v9.2d[1]
|
||||
fmla v20.2d, v0.2d, v9.d[0]
|
||||
fmla v21.2d, v1.2d, v9.d[0]
|
||||
fmla v22.2d, v0.2d, v9.d[1]
|
||||
fmla v23.2d, v1.2d, v9.d[1]
|
||||
|
||||
fmla v24.2d, v0.2d, v10.2d[0]
|
||||
fmla v25.2d, v1.2d, v10.2d[0]
|
||||
fmla v26.2d, v0.2d, v10.2d[1]
|
||||
fmla v27.2d, v1.2d, v10.2d[1]
|
||||
fmla v24.2d, v0.2d, v10.d[0]
|
||||
fmla v25.2d, v1.2d, v10.d[0]
|
||||
fmla v26.2d, v0.2d, v10.d[1]
|
||||
fmla v27.2d, v1.2d, v10.d[1]
|
||||
|
||||
fmla v28.2d, v0.2d, v11.2d[0]
|
||||
fmla v29.2d, v1.2d, v11.2d[0]
|
||||
fmla v30.2d, v0.2d, v11.2d[1]
|
||||
fmla v31.2d, v1.2d, v11.2d[1]
|
||||
fmla v28.2d, v0.2d, v11.d[0]
|
||||
fmla v29.2d, v1.2d, v11.d[0]
|
||||
fmla v30.2d, v0.2d, v11.d[1]
|
||||
fmla v31.2d, v1.2d, v11.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x8
|
||||
|
|
@ -374,17 +374,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v10.2d, v11.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v18.2d, v0.2d, v8.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v18.2d, v0.2d, v8.d[1]
|
||||
|
||||
fmla v20.2d, v0.2d, v9.2d[0]
|
||||
fmla v22.2d, v0.2d, v9.2d[1]
|
||||
fmla v20.2d, v0.2d, v9.d[0]
|
||||
fmla v22.2d, v0.2d, v9.d[1]
|
||||
|
||||
fmla v24.2d, v0.2d, v10.2d[0]
|
||||
fmla v26.2d, v0.2d, v10.2d[1]
|
||||
fmla v24.2d, v0.2d, v10.d[0]
|
||||
fmla v26.2d, v0.2d, v10.d[1]
|
||||
|
||||
fmla v28.2d, v0.2d, v11.2d[0]
|
||||
fmla v30.2d, v0.2d, v11.2d[1]
|
||||
fmla v28.2d, v0.2d, v11.d[0]
|
||||
fmla v30.2d, v0.2d, v11.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x8
|
||||
|
|
@ -520,17 +520,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v16.2d, v0.2d, v8.2d[0]
|
||||
fmul v29.2d, v1.2d, v9.2d[1]
|
||||
fmul v16.2d, v0.2d, v8.d[0]
|
||||
fmul v29.2d, v1.2d, v9.d[1]
|
||||
|
||||
fmul v20.2d, v0.2d, v8.2d[1]
|
||||
fmul v25.2d, v1.2d, v9.2d[0]
|
||||
fmul v20.2d, v0.2d, v8.d[1]
|
||||
fmul v25.2d, v1.2d, v9.d[0]
|
||||
|
||||
fmul v24.2d, v0.2d, v9.2d[0]
|
||||
fmul v21.2d, v1.2d, v8.2d[1]
|
||||
fmul v24.2d, v0.2d, v9.d[0]
|
||||
fmul v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
fmul v28.2d, v0.2d, v9.2d[1]
|
||||
fmul v17.2d, v1.2d, v8.2d[0]
|
||||
fmul v28.2d, v0.2d, v9.d[1]
|
||||
fmul v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
ld1 {v12.2d, v13.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
|
@ -539,61 +539,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL4x4_M1
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v29.2d, v1.2d, v9.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v29.2d, v1.2d, v9.d[1]
|
||||
|
||||
ld1 {v12.2d, v13.2d}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v25.2d, v1.2d, v9.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v25.2d, v1.2d, v9.d[0]
|
||||
|
||||
ld1 {v4.2d, v5.2d}, [pA] // For next round
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v24.2d, v0.2d, v9.2d[0]
|
||||
fmla v21.2d, v1.2d, v8.2d[1]
|
||||
fmla v24.2d, v0.2d, v9.d[0]
|
||||
fmla v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
|
||||
fmla v28.2d, v0.2d, v9.2d[1]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v28.2d, v0.2d, v9.d[1]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_M2
|
||||
fmla v16.2d, v4.2d, v12.2d[0]
|
||||
fmla v29.2d, v5.2d, v13.2d[1]
|
||||
fmla v16.2d, v4.2d, v12.d[0]
|
||||
fmla v29.2d, v5.2d, v13.d[1]
|
||||
|
||||
ld1 {v8.2d, v9.2d}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
|
||||
fmla v20.2d, v4.2d, v12.2d[1]
|
||||
fmla v25.2d, v5.2d, v13.2d[0]
|
||||
fmla v20.2d, v4.2d, v12.d[1]
|
||||
fmla v25.2d, v5.2d, v13.d[0]
|
||||
|
||||
ld1 {v0.2d, v1.2d}, [pA] // For next round
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v24.2d, v4.2d, v13.2d[0]
|
||||
fmla v21.2d, v5.2d, v12.2d[1]
|
||||
fmla v24.2d, v4.2d, v13.d[0]
|
||||
fmla v21.2d, v5.2d, v12.d[1]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
|
||||
fmla v28.2d, v4.2d, v13.2d[1]
|
||||
fmla v17.2d, v5.2d, v12.2d[0]
|
||||
fmla v28.2d, v4.2d, v13.d[1]
|
||||
fmla v17.2d, v5.2d, v12.d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_E
|
||||
fmla v16.2d, v4.2d, v12.2d[0]
|
||||
fmla v29.2d, v5.2d, v13.2d[1]
|
||||
fmla v16.2d, v4.2d, v12.d[0]
|
||||
fmla v29.2d, v5.2d, v13.d[1]
|
||||
|
||||
fmla v20.2d, v4.2d, v12.2d[1]
|
||||
fmla v25.2d, v5.2d, v13.2d[0]
|
||||
fmla v20.2d, v4.2d, v12.d[1]
|
||||
fmla v25.2d, v5.2d, v13.d[0]
|
||||
|
||||
fmla v24.2d, v4.2d, v13.2d[0]
|
||||
fmla v21.2d, v5.2d, v12.2d[1]
|
||||
fmla v24.2d, v4.2d, v13.d[0]
|
||||
fmla v21.2d, v5.2d, v12.d[1]
|
||||
|
||||
fmla v28.2d, v4.2d, v13.2d[1]
|
||||
fmla v17.2d, v5.2d, v12.2d[0]
|
||||
fmla v28.2d, v4.2d, v13.d[1]
|
||||
fmla v17.2d, v5.2d, v12.d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_SUB
|
||||
|
|
@ -602,17 +602,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v29.2d, v1.2d, v9.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v29.2d, v1.2d, v9.d[1]
|
||||
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v25.2d, v1.2d, v9.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v25.2d, v1.2d, v9.d[0]
|
||||
|
||||
fmla v24.2d, v0.2d, v9.2d[0]
|
||||
fmla v21.2d, v1.2d, v8.2d[1]
|
||||
fmla v24.2d, v0.2d, v9.d[0]
|
||||
fmla v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
fmla v28.2d, v0.2d, v9.2d[1]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v28.2d, v0.2d, v9.d[1]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
|
|
@ -660,10 +660,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v24.2d, v0.2d, v9.2d[0]
|
||||
fmla v28.2d, v0.2d, v9.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v24.2d, v0.2d, v9.d[0]
|
||||
fmla v28.2d, v0.2d, v9.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
|
|
@ -746,10 +746,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v21.2d, v1.2d, v8.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v21.2d, v1.2d, v8.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
|
|
@ -782,8 +782,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
|
|
@ -813,7 +813,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ldr d0 , [pA]
|
||||
add pA, pA, #8
|
||||
|
||||
fmla v16.2d, v8.2d, v0.2d[0]
|
||||
fmla v16.2d, v8.2d, v0.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
|
|
@ -842,8 +842,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA , pA, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x1
|
||||
|
|
@ -871,7 +871,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d}, [pA]
|
||||
add pA , pA, #16
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x1
|
||||
|
|
|
|||
|
|
@ -52,12 +52,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define alpha0 d10
|
||||
#define alphaV0 v10.d[0]
|
||||
#define alpha1 d11
|
||||
#define alphaV1 v11.d[0]
|
||||
#define alpha2 d14
|
||||
#define alphaV2 v14.d[0]
|
||||
#define alpha3 d15
|
||||
#define alphaV3 v15.d[0]
|
||||
|
||||
#define A_PRE_SIZE 2560
|
||||
#define B_PRE_SIZE 448
|
||||
#define C_PRE_SIZE 128
|
||||
|
||||
// 00 origM
|
||||
// 01 origN
|
||||
|
|
@ -74,8 +72,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
// 12 pCRow0
|
||||
// 13 pCRow1
|
||||
// 14 pCRow2
|
||||
// 15 pA
|
||||
// 16
|
||||
// 15 pCRow3
|
||||
// 16 pA
|
||||
// 17
|
||||
// 18 must save
|
||||
// 19 must save
|
||||
|
|
@ -100,14 +98,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
//v05 pA1_2, pA1_3
|
||||
//v06 pA1_4, pA1_5
|
||||
//v07 pA1_6, pA1_7
|
||||
//v08 must save pB0_0, pB0_1
|
||||
//v09 must save pB0_2, pB0_3
|
||||
//v10 must save ALPHA0
|
||||
//v11 must save ALPHA1
|
||||
//v12 must save pB1_0, pB1_1
|
||||
//v13 must save pB1_2, pB1_3
|
||||
//v14 must save ALPHA2
|
||||
//v15 must save ALPHA3
|
||||
//v08 must save pB0_0
|
||||
//v09 must save pB0_1
|
||||
//v10 must save pB0_2 --> ALPHA0
|
||||
//v11 must save pB0_3
|
||||
//v12 must save pB1_0
|
||||
//v13 must save pB1_1
|
||||
//v14 must save pB1_2
|
||||
//v15 must save pB1_3
|
||||
//v16 must save C00, C01
|
||||
//v17 must save C02, C03
|
||||
//v18 C04, C05
|
||||
|
|
@ -149,244 +147,257 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL8x4_I
|
||||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ld1 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ldp d8, d9, [pB]
|
||||
add pB, pB, #16
|
||||
ldp d10, d11, [pB]
|
||||
add pB, pB, #16
|
||||
ldp q0, q1, [pA], #32
|
||||
|
||||
fmul v16.2d, v0.2d, v8.2d[0]
|
||||
fmul v17.2d, v1.2d, v8.2d[0]
|
||||
ldp d8, d9, [pB], #16
|
||||
|
||||
fmul v18.2d, v2.2d, v8.2d[0]
|
||||
fmul v19.2d, v3.2d, v8.2d[0]
|
||||
fmul v16.2d, v0.2d, v8.d[0]
|
||||
fmul v20.2d, v0.2d, v9.d[0]
|
||||
|
||||
fmul v20.2d, v0.2d, v9.2d[0]
|
||||
fmul v21.2d, v1.2d, v9.2d[0]
|
||||
ldp d10, d11, [pB], #16
|
||||
|
||||
fmul v22.2d, v2.2d, v9.2d[0]
|
||||
fmul v23.2d, v3.2d, v9.2d[0]
|
||||
fmul v17.2d, v1.2d, v8.d[0]
|
||||
fmul v21.2d, v1.2d, v9.d[0]
|
||||
|
||||
fmul v24.2d, v0.2d, v10.2d[0]
|
||||
fmul v25.2d, v1.2d, v10.2d[0]
|
||||
ldp q2, q3, [pA], #32
|
||||
|
||||
fmul v26.2d, v2.2d, v10.2d[0]
|
||||
fmul v27.2d, v3.2d, v10.2d[0]
|
||||
fmul v24.2d, v0.2d, v10.d[0]
|
||||
fmul v28.2d, v0.2d, v11.d[0]
|
||||
|
||||
fmul v28.2d, v0.2d, v11.2d[0]
|
||||
fmul v29.2d, v1.2d, v11.2d[0]
|
||||
ldp q4, q5, [pA], #32
|
||||
|
||||
fmul v30.2d, v2.2d, v11.2d[0]
|
||||
fmul v31.2d, v3.2d, v11.2d[0]
|
||||
fmul v25.2d, v1.2d, v10.d[0]
|
||||
fmul v29.2d, v1.2d, v11.d[0]
|
||||
|
||||
ld1 {v4.2d, v5.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ld1 {v6.2d, v7.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ldp d12, d13, [pB]
|
||||
add pB, pB, #16
|
||||
ldp d14, d15, [pB]
|
||||
add pB, pB, #16
|
||||
ldp d12, d13, [pB], #16
|
||||
|
||||
fmul v18.2d, v2.2d, v8.d[0]
|
||||
fmul v22.2d, v2.2d, v9.d[0]
|
||||
|
||||
ldp d14, d15, [pB], #16
|
||||
|
||||
fmul v26.2d, v2.2d, v10.d[0]
|
||||
fmul v30.2d, v2.2d, v11.d[0]
|
||||
|
||||
ldp q6, q7, [pA], #32
|
||||
|
||||
fmul v19.2d, v3.2d, v8.d[0]
|
||||
fmul v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
fmul v31.2d, v3.2d, v11.d[0]
|
||||
fmul v23.2d, v3.2d, v9.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_M1
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v21.2d, v1.2d, v9.2d[0]
|
||||
fmla v26.2d, v2.2d, v10.2d[0]
|
||||
fmla v31.2d, v3.2d, v11.2d[0]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v20.2d, v0.2d, v9.d[0]
|
||||
|
||||
ld1 {v4.2d}, [pA], #16
|
||||
ldp q4, q5, [pA], #32
|
||||
|
||||
fmla v20.2d, v0.2d, v9.2d[0]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v24.2d, v0.2d, v10.d[0]
|
||||
fmla v28.2d, v0.2d, v11.d[0]
|
||||
|
||||
ld1 {v5.2d}, [pA], #16
|
||||
ldp d12, d13, [pB], #16
|
||||
|
||||
fmla v30.2d, v2.2d, v11.2d[0]
|
||||
fmla v27.2d, v3.2d, v10.2d[0]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
fmla v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
ldp d12, d13, [pB]
|
||||
add pB, pB, #16
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
|
||||
fmla v28.2d, v0.2d, v11.2d[0]
|
||||
fmla v25.2d, v1.2d, v10.2d[0]
|
||||
fmla v21.2d, v1.2d, v9.d[0]
|
||||
fmla v29.2d, v1.2d, v11.d[0]
|
||||
|
||||
ldp d14, d15, [pB]
|
||||
add pB, pB, #16
|
||||
ldp d14, d15, [pB], #16
|
||||
|
||||
fmla v18.2d, v2.2d, v8.2d[0]
|
||||
fmla v23.2d, v3.2d, v9.2d[0]
|
||||
fmla v18.2d, v2.2d, v8.d[0]
|
||||
fmla v22.2d, v2.2d, v9.d[0]
|
||||
|
||||
ld1 {v6.2d}, [pA], #16
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
fmla v24.2d, v0.2d, v10.2d[0]
|
||||
fmla v29.2d, v1.2d, v11.2d[0]
|
||||
fmla v26.2d, v2.2d, v10.d[0]
|
||||
fmla v30.2d, v2.2d, v11.d[0]
|
||||
fmla v19.2d, v3.2d, v8.d[0]
|
||||
fmla v23.2d, v3.2d, v9.d[0]
|
||||
|
||||
ld1 {v7.2d}, [pA], #16
|
||||
ldp q6, q7, [pA], #32
|
||||
|
||||
fmla v22.2d, v2.2d, v9.2d[0]
|
||||
fmla v19.2d, v3.2d, v8.2d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #224]
|
||||
prfm PLDL1KEEP, [pA, #224+64]
|
||||
fmla v27.2d, v3.2d, v10.d[0]
|
||||
fmla v31.2d, v3.2d, v11.d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_M2
|
||||
fmla v16.2d, v4.2d, v12.2d[0]
|
||||
fmla v21.2d, v5.2d, v13.2d[0]
|
||||
fmla v26.2d, v6.2d, v14.2d[0]
|
||||
fmla v31.2d, v7.2d, v15.2d[0]
|
||||
fmla v16.2d, v4.2d, v12.d[0]
|
||||
fmla v20.2d, v4.2d, v13.d[0]
|
||||
fmla v24.2d, v4.2d, v14.d[0]
|
||||
fmla v28.2d, v4.2d, v15.d[0]
|
||||
|
||||
ld1 {v0.2d}, [pA], #16
|
||||
ldp q0, q1, [pA], #32
|
||||
|
||||
fmla v20.2d, v4.2d, v13.2d[0]
|
||||
fmla v17.2d, v5.2d, v12.2d[0]
|
||||
fmla v17.2d, v5.2d, v12.d[0]
|
||||
fmla v25.2d, v5.2d, v14.d[0]
|
||||
|
||||
ld1 {v1.2d}, [pA], #16
|
||||
ldp d8, d9, [pB], #16
|
||||
|
||||
fmla v30.2d, v6.2d, v15.2d[0]
|
||||
fmla v27.2d, v7.2d, v14.2d[0]
|
||||
fmla v21.2d, v5.2d, v13.d[0]
|
||||
fmla v29.2d, v5.2d, v15.d[0]
|
||||
|
||||
ldp d8, d9, [pB]
|
||||
add pB, pB, #16
|
||||
ldp d10, d11, [pB], #16
|
||||
|
||||
fmla v28.2d, v4.2d, v15.2d[0]
|
||||
fmla v25.2d, v5.2d, v14.2d[0]
|
||||
fmla v18.2d, v6.2d, v12.d[0]
|
||||
fmla v22.2d, v6.2d, v13.d[0]
|
||||
|
||||
ldp d10, d11, [pB]
|
||||
add pB, pB, #16
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
fmla v22.2d, v6.2d, v13.2d[0]
|
||||
fmla v19.2d, v7.2d, v12.2d[0]
|
||||
fmla v26.2d, v6.2d, v14.d[0]
|
||||
fmla v30.2d, v6.2d, v15.d[0]
|
||||
|
||||
ld1 {v2.2d}, [pA], #16
|
||||
fmla v19.2d, v7.2d, v12.d[0]
|
||||
fmla v23.2d, v7.2d, v13.d[0]
|
||||
|
||||
fmla v24.2d, v4.2d, v14.2d[0]
|
||||
fmla v29.2d, v5.2d, v15.2d[0]
|
||||
ldp q2, q3, [pA], #32
|
||||
|
||||
ld1 {v3.2d}, [pA], #16
|
||||
|
||||
fmla v18.2d, v6.2d, v12.2d[0]
|
||||
fmla v23.2d, v7.2d, v13.2d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #640]
|
||||
fmla v27.2d, v7.2d, v14.d[0]
|
||||
fmla v31.2d, v7.2d, v15.d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_E
|
||||
fmla v16.2d, v4.2d, v12.2d[0]
|
||||
fmla v17.2d, v5.2d, v12.2d[0]
|
||||
fmla v18.2d, v6.2d, v12.2d[0]
|
||||
fmla v19.2d, v7.2d, v12.2d[0]
|
||||
fmla v20.2d, v4.2d, v13.2d[0]
|
||||
fmla v21.2d, v5.2d, v13.2d[0]
|
||||
fmla v22.2d, v6.2d, v13.2d[0]
|
||||
fmla v23.2d, v7.2d, v13.2d[0]
|
||||
fmla v24.2d, v4.2d, v14.2d[0]
|
||||
fmla v25.2d, v5.2d, v14.2d[0]
|
||||
fmla v26.2d, v6.2d, v14.2d[0]
|
||||
fmla v27.2d, v7.2d, v14.2d[0]
|
||||
fmla v28.2d, v4.2d, v15.2d[0]
|
||||
fmla v29.2d, v5.2d, v15.2d[0]
|
||||
fmla v30.2d, v6.2d, v15.2d[0]
|
||||
fmla v31.2d, v7.2d, v15.2d[0]
|
||||
fmla v16.2d, v4.2d, v12.d[0]
|
||||
fmla v20.2d, v4.2d, v13.d[0]
|
||||
fmla v24.2d, v4.2d, v14.d[0]
|
||||
fmla v28.2d, v4.2d, v15.d[0]
|
||||
|
||||
fmla v17.2d, v5.2d, v12.d[0]
|
||||
fmla v25.2d, v5.2d, v14.d[0]
|
||||
fmla v21.2d, v5.2d, v13.d[0]
|
||||
fmla v29.2d, v5.2d, v15.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
fmla v18.2d, v6.2d, v12.d[0]
|
||||
fmla v22.2d, v6.2d, v13.d[0]
|
||||
fmla v26.2d, v6.2d, v14.d[0]
|
||||
fmla v30.2d, v6.2d, v15.d[0]
|
||||
|
||||
fmla v19.2d, v7.2d, v12.d[0]
|
||||
fmla v23.2d, v7.2d, v13.d[0]
|
||||
fmla v27.2d, v7.2d, v14.d[0]
|
||||
fmla v31.2d, v7.2d, v15.d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_SUB
|
||||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ld1 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
ldp d8, d9, [pB]
|
||||
add pB, pB, #16
|
||||
ldp d10, d11, [pB]
|
||||
add pB, pB, #16
|
||||
ldp q0, q1, [pA], #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v18.2d, v2.2d, v8.2d[0]
|
||||
fmla v19.2d, v3.2d, v8.2d[0]
|
||||
ldp d8, d9, [pB], #16
|
||||
|
||||
fmla v20.2d, v0.2d, v9.2d[0]
|
||||
fmla v21.2d, v1.2d, v9.2d[0]
|
||||
fmla v22.2d, v2.2d, v9.2d[0]
|
||||
fmla v23.2d, v3.2d, v9.2d[0]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v20.2d, v0.2d, v9.d[0]
|
||||
|
||||
fmla v24.2d, v0.2d, v10.2d[0]
|
||||
fmla v25.2d, v1.2d, v10.2d[0]
|
||||
fmla v26.2d, v2.2d, v10.2d[0]
|
||||
fmla v27.2d, v3.2d, v10.2d[0]
|
||||
ldp d10, d11, [pB], #16
|
||||
|
||||
fmla v28.2d, v0.2d, v11.2d[0]
|
||||
fmla v29.2d, v1.2d, v11.2d[0]
|
||||
fmla v30.2d, v2.2d, v11.2d[0]
|
||||
fmla v31.2d, v3.2d, v11.2d[0]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
fmla v21.2d, v1.2d, v9.d[0]
|
||||
|
||||
ldp q2, q3, [pA], #32
|
||||
|
||||
fmla v24.2d, v0.2d, v10.d[0]
|
||||
fmla v28.2d, v0.2d, v11.d[0]
|
||||
|
||||
fmla v25.2d, v1.2d, v10.d[0]
|
||||
fmla v29.2d, v1.2d, v11.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
fmla v18.2d, v2.2d, v8.d[0]
|
||||
fmla v22.2d, v2.2d, v9.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
|
||||
fmla v26.2d, v2.2d, v10.d[0]
|
||||
fmla v30.2d, v2.2d, v11.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
fmla v19.2d, v3.2d, v8.d[0]
|
||||
fmla v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
fmla v31.2d, v3.2d, v11.d[0]
|
||||
fmla v23.2d, v3.2d, v9.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x4
|
||||
fmov alpha0, alpha
|
||||
|
||||
ld1 {v0.2d, v1.2d}, [pCRow0]
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
ldp q0, q1, [pCRow0]
|
||||
fmla v0.2d, v16.2d, alphaV0
|
||||
fmla v1.2d, v17.2d, alphaV0
|
||||
st1 {v0.2d, v1.2d}, [pCRow0]
|
||||
stp q0, q1, [pCRow0]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
ld1 {v2.2d, v3.2d}, [pCRow0]
|
||||
ldp q2, q3, [pCRow0]
|
||||
fmla v2.2d, v18.2d, alphaV0
|
||||
fmla v3.2d, v19.2d, alphaV0
|
||||
st1 {v2.2d, v3.2d}, [pCRow0]
|
||||
stp q2, q3, [pCRow0]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
||||
ld1 {v4.2d, v5.2d}, [pCRow1]
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
ldp q4, q5, [pCRow1]
|
||||
fmla v4.2d, v20.2d, alphaV0
|
||||
fmla v5.2d, v21.2d, alphaV0
|
||||
st1 {v4.2d, v5.2d}, [pCRow1]
|
||||
stp q4, q5, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, #32
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
ld1 {v6.2d, v7.2d}, [pCRow1]
|
||||
ldp q6, q7, [pCRow1]
|
||||
fmla v6.2d, v22.2d, alphaV0
|
||||
fmla v7.2d, v23.2d, alphaV0
|
||||
st1 {v6.2d, v7.2d}, [pCRow1]
|
||||
stp q6, q7, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, #32
|
||||
|
||||
ld1 {v0.2d, v1.2d}, [pCRow2]
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
ldp q0, q1, [pCRow2]
|
||||
fmla v0.2d, v24.2d, alphaV0
|
||||
fmla v1.2d, v25.2d, alphaV0
|
||||
st1 {v0.2d, v1.2d}, [pCRow2]
|
||||
stp q0, q1, [pCRow2]
|
||||
|
||||
add pCRow2, pCRow2, #32
|
||||
ld1 {v2.2d, v3.2d}, [pCRow2]
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
ldp q2, q3, [pCRow2]
|
||||
fmla v2.2d, v26.2d, alphaV0
|
||||
fmla v3.2d, v27.2d, alphaV0
|
||||
st1 {v2.2d, v3.2d}, [pCRow2]
|
||||
stp q2, q3, [pCRow2]
|
||||
|
||||
add pCRow2, pCRow2, #32
|
||||
|
||||
ld1 {v4.2d, v5.2d}, [pCRow3]
|
||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||
|
||||
ldp q4, q5, [pCRow3]
|
||||
fmla v4.2d, v28.2d, alphaV0
|
||||
fmla v5.2d, v29.2d, alphaV0
|
||||
st1 {v4.2d, v5.2d}, [pCRow3]
|
||||
stp q4, q5, [pCRow3]
|
||||
|
||||
add pCRow3, pCRow3, #32
|
||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||
|
||||
ld1 {v6.2d, v7.2d}, [pCRow3]
|
||||
ldp q6, q7, [pCRow3]
|
||||
fmla v6.2d, v30.2d, alphaV0
|
||||
fmla v7.2d, v31.2d, alphaV0
|
||||
st1 {v6.2d, v7.2d}, [pCRow3]
|
||||
stp q6, q7, [pCRow3]
|
||||
|
||||
add pCRow3, pCRow3, #32
|
||||
|
||||
prfm PLDL2KEEP, [pCRow0, #128]
|
||||
prfm PLDL2KEEP, [pCRow1, #128]
|
||||
prfm PLDL2KEEP, [pCRow2, #128]
|
||||
prfm PLDL2KEEP, [pCRow3, #128]
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
|
@ -408,44 +419,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v29.2d, v1.2d, v9.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v29.2d, v1.2d, v9.d[1]
|
||||
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v25.2d, v1.2d, v9.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v25.2d, v1.2d, v9.d[0]
|
||||
|
||||
fmla v24.2d, v0.2d, v9.2d[0]
|
||||
fmla v21.2d, v1.2d, v8.2d[1]
|
||||
fmla v24.2d, v0.2d, v9.d[0]
|
||||
fmla v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
fmla v28.2d, v0.2d, v9.2d[1]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v28.2d, v0.2d, v9.d[1]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
fmov alpha0, alpha
|
||||
ld1 {v8.2d, v9.2d}, [pCRow0]
|
||||
fmla v8.2d, v16.2d, alphaV0
|
||||
fmla v9.2d, v17.2d, alphaV1
|
||||
fmla v9.2d, v17.2d, alphaV0
|
||||
st1 {v8.2d, v9.2d}, [pCRow0]
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
|
||||
ld1 {v12.2d, v13.2d}, [pCRow1]
|
||||
fmla v12.2d, v20.2d, alphaV2
|
||||
fmla v13.2d, v21.2d, alphaV3
|
||||
fmla v12.2d, v20.2d, alphaV0
|
||||
fmla v13.2d, v21.2d, alphaV0
|
||||
st1 {v12.2d, v13.2d}, [pCRow1]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
|
||||
ld1 {v8.2d, v9.2d}, [pCRow2]
|
||||
fmla v8.2d, v24.2d, alphaV0
|
||||
fmla v9.2d, v25.2d, alphaV1
|
||||
fmla v9.2d, v25.2d, alphaV0
|
||||
st1 {v8.2d, v9.2d}, [pCRow2]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
|
||||
ld1 {v12.2d, v13.2d}, [pCRow1]
|
||||
fmla v12.2d, v28.2d, alphaV2
|
||||
fmla v13.2d, v29.2d, alphaV3
|
||||
fmla v12.2d, v28.2d, alphaV0
|
||||
fmla v13.2d, v29.2d, alphaV0
|
||||
st1 {v12.2d, v13.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
|
@ -467,13 +479,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v24.2d, v0.2d, v9.2d[0]
|
||||
fmla v28.2d, v0.2d, v9.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v24.2d, v0.2d, v9.d[0]
|
||||
fmla v28.2d, v0.2d, v9.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
fmov alpha0, alpha
|
||||
ld1 {v8.2d}, [pCRow0]
|
||||
fmla v8.2d, v16.2d, alphaV0
|
||||
st1 {v8.2d}, [pCRow0]
|
||||
|
|
@ -481,19 +494,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
add pCRow1, pCRow0, LDC
|
||||
|
||||
ld1 {v12.2d}, [pCRow1]
|
||||
fmla v12.2d, v20.2d, alphaV1
|
||||
fmla v12.2d, v20.2d, alphaV0
|
||||
st1 {v12.2d}, [pCRow1]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
|
||||
ld1 {v8.2d}, [pCRow2]
|
||||
fmla v8.2d, v24.2d, alphaV2
|
||||
fmla v8.2d, v24.2d, alphaV0
|
||||
st1 {v8.2d}, [pCRow2]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
|
||||
ld1 {v12.2d}, [pCRow1]
|
||||
fmla v12.2d, v28.2d, alphaV3
|
||||
fmla v12.2d, v28.2d, alphaV0
|
||||
st1 {v12.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #16
|
||||
|
|
@ -518,6 +531,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE1x4
|
||||
fmov alpha0, alpha
|
||||
add pCRow1, pCRow0, LDC
|
||||
|
||||
ld1 {v8.d}[0], [pCRow0]
|
||||
|
|
@ -531,7 +545,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
ld1 {v12.d}[0], [pCRow2]
|
||||
ld1 {v12.d}[1], [pCRow1]
|
||||
fmla v12.2d, v20.2d, alphaV1
|
||||
fmla v12.2d, v20.2d, alphaV0
|
||||
st1 {v12.d}[0], [pCRow2]
|
||||
st1 {v12.d}[1], [pCRow1]
|
||||
|
||||
|
|
@ -559,32 +573,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v18.2d, v2.2d, v8.2d[0]
|
||||
fmla v19.2d, v3.2d, v8.2d[0]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
fmla v18.2d, v2.2d, v8.d[0]
|
||||
fmla v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v21.2d, v1.2d, v8.2d[1]
|
||||
fmla v22.2d, v2.2d, v8.2d[1]
|
||||
fmla v23.2d, v3.2d, v8.2d[1]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v21.2d, v1.2d, v8.d[1]
|
||||
fmla v22.2d, v2.2d, v8.d[1]
|
||||
fmla v23.2d, v3.2d, v8.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x2
|
||||
fmov alpha0, alpha
|
||||
add pCRow1, pCRow0, LDC
|
||||
|
||||
ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
|
||||
fmla v0.2d, v16.2d, alphaV0
|
||||
fmla v1.2d, v17.2d, alphaV1
|
||||
fmla v2.2d, v18.2d, alphaV2
|
||||
fmla v3.2d, v19.2d, alphaV3
|
||||
fmla v1.2d, v17.2d, alphaV0
|
||||
fmla v2.2d, v18.2d, alphaV0
|
||||
fmla v3.2d, v19.2d, alphaV0
|
||||
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
|
||||
|
||||
ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
|
||||
fmla v4.2d, v20.2d, alphaV0
|
||||
fmla v5.2d, v21.2d, alphaV1
|
||||
fmla v6.2d, v22.2d, alphaV2
|
||||
fmla v7.2d, v23.2d, alphaV3
|
||||
fmla v5.2d, v21.2d, alphaV0
|
||||
fmla v6.2d, v22.2d, alphaV0
|
||||
fmla v7.2d, v23.2d, alphaV0
|
||||
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #64
|
||||
|
|
@ -605,23 +620,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v21.2d, v1.2d, v8.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v21.2d, v1.2d, v8.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
fmov alpha0, alpha
|
||||
ld1 {v8.2d, v9.2d}, [pCRow0]
|
||||
fmla v8.2d, v16.2d, alphaV0
|
||||
fmla v9.2d, v17.2d, alphaV1
|
||||
fmla v9.2d, v17.2d, alphaV0
|
||||
st1 {v8.2d, v9.2d}, [pCRow0]
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
|
||||
ld1 {v12.2d, v13.2d}, [pCRow1]
|
||||
fmla v12.2d, v20.2d, alphaV2
|
||||
fmla v13.2d, v21.2d, alphaV3
|
||||
fmla v12.2d, v20.2d, alphaV0
|
||||
fmla v13.2d, v21.2d, alphaV0
|
||||
st1 {v12.2d, v13.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
|
@ -641,11 +657,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
fmov alpha0, alpha
|
||||
ld1 {v8.2d}, [pCRow0]
|
||||
fmla v8.2d, v16.2d, alphaV0
|
||||
st1 {v8.2d}, [pCRow0]
|
||||
|
|
@ -653,7 +670,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
add pCRow1 , pCRow0, LDC
|
||||
|
||||
ld1 {v12.2d}, [pCRow1]
|
||||
fmla v12.2d, v20.2d, alphaV1
|
||||
fmla v12.2d, v20.2d, alphaV0
|
||||
st1 {v12.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #16
|
||||
|
|
@ -672,10 +689,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ldr d0 , [pA]
|
||||
add pA, pA, #8
|
||||
|
||||
fmla v16.2d, v8.2d, v0.2d[0]
|
||||
fmla v16.2d, v8.2d, v0.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
fmov alpha0, alpha
|
||||
add pCRow1 , pCRow0, LDC
|
||||
|
||||
ld1 {v8.d}[0], [pCRow0]
|
||||
|
|
@ -706,18 +724,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v18.2d, v2.2d, v8.2d[0]
|
||||
fmla v19.2d, v3.2d, v8.2d[0]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
fmla v18.2d, v2.2d, v8.d[0]
|
||||
fmla v19.2d, v3.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x1
|
||||
fmov alpha0, alpha
|
||||
ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
|
||||
fmla v0.2d, v16.2d, alphaV0
|
||||
fmla v1.2d, v17.2d, alphaV1
|
||||
fmla v2.2d, v18.2d, alphaV2
|
||||
fmla v3.2d, v19.2d, alphaV3
|
||||
fmla v1.2d, v17.2d, alphaV0
|
||||
fmla v2.2d, v18.2d, alphaV0
|
||||
fmla v3.2d, v19.2d, alphaV0
|
||||
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
|
||||
|
||||
add pCRow0, pCRow0, #64
|
||||
|
|
@ -738,14 +757,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA , pA, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x1
|
||||
fmov alpha0, alpha
|
||||
ld1 {v8.2d, v9.2d}, [pCRow0]
|
||||
fmla v8.2d, v16.2d, alphaV0
|
||||
fmla v9.2d, v17.2d, alphaV1
|
||||
fmla v9.2d, v17.2d, alphaV0
|
||||
st1 {v8.2d, v9.2d}, [pCRow0]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
|
|
@ -765,10 +785,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d}, [pA]
|
||||
add pA , pA, #16
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x1
|
||||
fmov alpha0, alpha
|
||||
ld1 {v8.2d}, [pCRow0]
|
||||
fmla v8.2d, v16.2d, alphaV0
|
||||
st1 {v8.2d}, [pCRow0]
|
||||
|
|
@ -793,6 +814,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE1x1
|
||||
fmov alpha0, alpha
|
||||
ldr d8, [pCRow0]
|
||||
fmadd d8, d16, alpha0, d8
|
||||
str d8, [pCRow0]
|
||||
|
|
@ -820,6 +842,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
stp x26, x27, [sp, #(9 * 16)]
|
||||
str x28, [sp, #(10 * 16)]
|
||||
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
prfm PLDL1KEEP, [origPA]
|
||||
|
||||
fmov alpha, d0
|
||||
|
||||
lsl LDC, LDC, #3 // ldc = ldc * 8
|
||||
|
|
@ -838,6 +863,7 @@ dgemm_kernel_L4_BEGIN:
|
|||
add pCRow1, pCRow0, LDC
|
||||
add pCRow2, pCRow1, LDC
|
||||
add pCRow3, pCRow2, LDC
|
||||
|
||||
add pC, pCRow3, LDC
|
||||
|
||||
mov pA, origPA // pA = start of A array
|
||||
|
|
@ -849,6 +875,7 @@ dgemm_kernel_L4_M8_BEGIN:
|
|||
cmp counterI, #0
|
||||
ble dgemm_kernel_L4_M4_BEGIN
|
||||
|
||||
.align 5
|
||||
dgemm_kernel_L4_M8_20:
|
||||
|
||||
mov pB, origPB
|
||||
|
|
@ -868,8 +895,8 @@ dgemm_kernel_L4_M8_20:
|
|||
|
||||
subs counterL, counterL, #2 // subtract 2
|
||||
ble dgemm_kernel_L4_M8_22a
|
||||
.align 5
|
||||
|
||||
.align 5
|
||||
dgemm_kernel_L4_M8_22:
|
||||
|
||||
KERNEL8x4_M1
|
||||
|
|
@ -884,7 +911,7 @@ dgemm_kernel_L4_M8_22:
|
|||
subs counterL, counterL, #1
|
||||
bgt dgemm_kernel_L4_M8_22
|
||||
|
||||
|
||||
.align 5
|
||||
dgemm_kernel_L4_M8_22a:
|
||||
|
||||
KERNEL8x4_M1
|
||||
|
|
@ -898,6 +925,7 @@ dgemm_kernel_L4_M8_22a:
|
|||
|
||||
b dgemm_kernel_L4_M8_44
|
||||
|
||||
.align 5
|
||||
dgemm_kernel_L4_M8_32:
|
||||
|
||||
tst counterL, #1
|
||||
|
|
@ -923,6 +951,7 @@ dgemm_kernel_L4_M8_44:
|
|||
ands counterL , origK, #7
|
||||
ble dgemm_kernel_L4_M8_100
|
||||
|
||||
.align 5
|
||||
dgemm_kernel_L4_M8_46:
|
||||
|
||||
KERNEL8x4_SUB
|
||||
|
|
@ -931,6 +960,9 @@ dgemm_kernel_L4_M8_46:
|
|||
bne dgemm_kernel_L4_M8_46
|
||||
|
||||
dgemm_kernel_L4_M8_100:
|
||||
prfm PLDL1KEEP, [pA]
|
||||
prfm PLDL1KEEP, [pA, #64]
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
|
||||
SAVE8x4
|
||||
|
||||
|
|
|
|||
|
|
@ -147,17 +147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v16.2d, v0.2d, v8.2d[0]
|
||||
fmul v29.2d, v1.2d, v9.2d[1]
|
||||
fmul v16.2d, v0.2d, v8.d[0]
|
||||
fmul v29.2d, v1.2d, v9.d[1]
|
||||
|
||||
fmul v20.2d, v0.2d, v8.2d[1]
|
||||
fmul v25.2d, v1.2d, v9.2d[0]
|
||||
fmul v20.2d, v0.2d, v8.d[1]
|
||||
fmul v25.2d, v1.2d, v9.d[0]
|
||||
|
||||
fmul v24.2d, v0.2d, v9.2d[0]
|
||||
fmul v21.2d, v1.2d, v8.2d[1]
|
||||
fmul v24.2d, v0.2d, v9.d[0]
|
||||
fmul v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
fmul v28.2d, v0.2d, v9.2d[1]
|
||||
fmul v17.2d, v1.2d, v8.2d[0]
|
||||
fmul v28.2d, v0.2d, v9.d[1]
|
||||
fmul v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
ld1 {v12.2d, v13.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
|
@ -166,61 +166,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL4x4_M1
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v29.2d, v1.2d, v9.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v29.2d, v1.2d, v9.d[1]
|
||||
|
||||
ld1 {v12.2d, v13.2d}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v25.2d, v1.2d, v9.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v25.2d, v1.2d, v9.d[0]
|
||||
|
||||
ld1 {v4.2d, v5.2d}, [pA] // For next round
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v24.2d, v0.2d, v9.2d[0]
|
||||
fmla v21.2d, v1.2d, v8.2d[1]
|
||||
fmla v24.2d, v0.2d, v9.d[0]
|
||||
fmla v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
|
||||
fmla v28.2d, v0.2d, v9.2d[1]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v28.2d, v0.2d, v9.d[1]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_M2
|
||||
fmla v16.2d, v4.2d, v12.2d[0]
|
||||
fmla v29.2d, v5.2d, v13.2d[1]
|
||||
fmla v16.2d, v4.2d, v12.d[0]
|
||||
fmla v29.2d, v5.2d, v13.d[1]
|
||||
|
||||
ld1 {v8.2d, v9.2d}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
|
||||
fmla v20.2d, v4.2d, v12.2d[1]
|
||||
fmla v25.2d, v5.2d, v13.2d[0]
|
||||
fmla v20.2d, v4.2d, v12.d[1]
|
||||
fmla v25.2d, v5.2d, v13.d[0]
|
||||
|
||||
ld1 {v0.2d, v1.2d}, [pA] // For next round
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v24.2d, v4.2d, v13.2d[0]
|
||||
fmla v21.2d, v5.2d, v12.2d[1]
|
||||
fmla v24.2d, v4.2d, v13.d[0]
|
||||
fmla v21.2d, v5.2d, v12.d[1]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
|
||||
fmla v28.2d, v4.2d, v13.2d[1]
|
||||
fmla v17.2d, v5.2d, v12.2d[0]
|
||||
fmla v28.2d, v4.2d, v13.d[1]
|
||||
fmla v17.2d, v5.2d, v12.d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_E
|
||||
fmla v16.2d, v4.2d, v12.2d[0]
|
||||
fmla v29.2d, v5.2d, v13.2d[1]
|
||||
fmla v16.2d, v4.2d, v12.d[0]
|
||||
fmla v29.2d, v5.2d, v13.d[1]
|
||||
|
||||
fmla v20.2d, v4.2d, v12.2d[1]
|
||||
fmla v25.2d, v5.2d, v13.2d[0]
|
||||
fmla v20.2d, v4.2d, v12.d[1]
|
||||
fmla v25.2d, v5.2d, v13.d[0]
|
||||
|
||||
fmla v24.2d, v4.2d, v13.2d[0]
|
||||
fmla v21.2d, v5.2d, v12.2d[1]
|
||||
fmla v24.2d, v4.2d, v13.d[0]
|
||||
fmla v21.2d, v5.2d, v12.d[1]
|
||||
|
||||
fmla v28.2d, v4.2d, v13.2d[1]
|
||||
fmla v17.2d, v5.2d, v12.2d[0]
|
||||
fmla v28.2d, v4.2d, v13.d[1]
|
||||
fmla v17.2d, v5.2d, v12.d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_SUB
|
||||
|
|
@ -229,17 +229,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v29.2d, v1.2d, v9.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v29.2d, v1.2d, v9.d[1]
|
||||
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v25.2d, v1.2d, v9.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v25.2d, v1.2d, v9.d[0]
|
||||
|
||||
fmla v24.2d, v0.2d, v9.2d[0]
|
||||
fmla v21.2d, v1.2d, v8.2d[1]
|
||||
fmla v24.2d, v0.2d, v9.d[0]
|
||||
fmla v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
fmla v28.2d, v0.2d, v9.2d[1]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v28.2d, v0.2d, v9.d[1]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
|
|
@ -283,10 +283,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v24.2d, v0.2d, v9.2d[0]
|
||||
fmla v28.2d, v0.2d, v9.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v24.2d, v0.2d, v9.d[0]
|
||||
fmla v28.2d, v0.2d, v9.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
|
|
@ -361,10 +361,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v21.2d, v1.2d, v8.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v21.2d, v1.2d, v8.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
|
|
@ -395,8 +395,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
|
|
@ -424,7 +424,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ldr d0 , [pA]
|
||||
add pA, pA, #8
|
||||
|
||||
fmla v16.2d, v8.2d, v0.2d[0]
|
||||
fmla v16.2d, v8.2d, v0.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
|
|
@ -451,8 +451,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA , pA, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x1
|
||||
|
|
@ -479,7 +479,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d}, [pA]
|
||||
add pA , pA, #16
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x1
|
||||
|
|
|
|||
|
|
@ -157,25 +157,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v10.2d, v11.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
fmul v16.2d, v0.2d, v8.2d[0]
|
||||
fmul v17.2d, v1.2d, v8.2d[0]
|
||||
fmul v18.2d, v0.2d, v8.2d[1]
|
||||
fmul v19.2d, v1.2d, v8.2d[1]
|
||||
fmul v16.2d, v0.2d, v8.d[0]
|
||||
fmul v17.2d, v1.2d, v8.d[0]
|
||||
fmul v18.2d, v0.2d, v8.d[1]
|
||||
fmul v19.2d, v1.2d, v8.d[1]
|
||||
|
||||
fmul v20.2d, v0.2d, v9.2d[0]
|
||||
fmul v21.2d, v1.2d, v9.2d[0]
|
||||
fmul v22.2d, v0.2d, v9.2d[1]
|
||||
fmul v23.2d, v1.2d, v9.2d[1]
|
||||
fmul v20.2d, v0.2d, v9.d[0]
|
||||
fmul v21.2d, v1.2d, v9.d[0]
|
||||
fmul v22.2d, v0.2d, v9.d[1]
|
||||
fmul v23.2d, v1.2d, v9.d[1]
|
||||
|
||||
fmul v24.2d, v0.2d, v10.2d[0]
|
||||
fmul v25.2d, v1.2d, v10.2d[0]
|
||||
fmul v26.2d, v0.2d, v10.2d[1]
|
||||
fmul v27.2d, v1.2d, v10.2d[1]
|
||||
fmul v24.2d, v0.2d, v10.d[0]
|
||||
fmul v25.2d, v1.2d, v10.d[0]
|
||||
fmul v26.2d, v0.2d, v10.d[1]
|
||||
fmul v27.2d, v1.2d, v10.d[1]
|
||||
|
||||
fmul v28.2d, v0.2d, v11.2d[0]
|
||||
fmul v29.2d, v1.2d, v11.2d[0]
|
||||
fmul v30.2d, v0.2d, v11.2d[1]
|
||||
fmul v31.2d, v1.2d, v11.2d[1]
|
||||
fmul v28.2d, v0.2d, v11.d[0]
|
||||
fmul v29.2d, v1.2d, v11.d[0]
|
||||
fmul v30.2d, v0.2d, v11.d[1]
|
||||
fmul v31.2d, v1.2d, v11.d[1]
|
||||
|
||||
ld1 {v12.2d, v13.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
|
@ -186,25 +186,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL4x8_M1
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v18.2d, v0.2d, v8.2d[1]
|
||||
fmla v19.2d, v1.2d, v8.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
fmla v18.2d, v0.2d, v8.d[1]
|
||||
fmla v19.2d, v1.2d, v8.d[1]
|
||||
|
||||
fmla v20.2d, v0.2d, v9.2d[0]
|
||||
fmla v21.2d, v1.2d, v9.2d[0]
|
||||
fmla v22.2d, v0.2d, v9.2d[1]
|
||||
fmla v23.2d, v1.2d, v9.2d[1]
|
||||
fmla v20.2d, v0.2d, v9.d[0]
|
||||
fmla v21.2d, v1.2d, v9.d[0]
|
||||
fmla v22.2d, v0.2d, v9.d[1]
|
||||
fmla v23.2d, v1.2d, v9.d[1]
|
||||
|
||||
fmla v24.2d, v0.2d, v10.2d[0]
|
||||
fmla v25.2d, v1.2d, v10.2d[0]
|
||||
fmla v26.2d, v0.2d, v10.2d[1]
|
||||
fmla v27.2d, v1.2d, v10.2d[1]
|
||||
fmla v24.2d, v0.2d, v10.d[0]
|
||||
fmla v25.2d, v1.2d, v10.d[0]
|
||||
fmla v26.2d, v0.2d, v10.d[1]
|
||||
fmla v27.2d, v1.2d, v10.d[1]
|
||||
|
||||
fmla v28.2d, v0.2d, v11.2d[0]
|
||||
fmla v29.2d, v1.2d, v11.2d[0]
|
||||
fmla v30.2d, v0.2d, v11.2d[1]
|
||||
fmla v31.2d, v1.2d, v11.2d[1]
|
||||
fmla v28.2d, v0.2d, v11.d[0]
|
||||
fmla v29.2d, v1.2d, v11.d[0]
|
||||
fmla v30.2d, v0.2d, v11.d[1]
|
||||
fmla v31.2d, v1.2d, v11.d[1]
|
||||
|
||||
ld1 {v12.2d, v13.2d}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
|
|
@ -217,25 +217,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL4x8_M2
|
||||
fmla v16.2d, v4.2d, v12.2d[0]
|
||||
fmla v17.2d, v5.2d, v12.2d[0]
|
||||
fmla v18.2d, v4.2d, v12.2d[1]
|
||||
fmla v19.2d, v5.2d, v12.2d[1]
|
||||
fmla v16.2d, v4.2d, v12.d[0]
|
||||
fmla v17.2d, v5.2d, v12.d[0]
|
||||
fmla v18.2d, v4.2d, v12.d[1]
|
||||
fmla v19.2d, v5.2d, v12.d[1]
|
||||
|
||||
fmla v20.2d, v4.2d, v13.2d[0]
|
||||
fmla v21.2d, v5.2d, v13.2d[0]
|
||||
fmla v22.2d, v4.2d, v13.2d[1]
|
||||
fmla v23.2d, v5.2d, v13.2d[1]
|
||||
fmla v20.2d, v4.2d, v13.d[0]
|
||||
fmla v21.2d, v5.2d, v13.d[0]
|
||||
fmla v22.2d, v4.2d, v13.d[1]
|
||||
fmla v23.2d, v5.2d, v13.d[1]
|
||||
|
||||
fmla v24.2d, v4.2d, v14.2d[0]
|
||||
fmla v25.2d, v5.2d, v14.2d[0]
|
||||
fmla v26.2d, v4.2d, v14.2d[1]
|
||||
fmla v27.2d, v5.2d, v14.2d[1]
|
||||
fmla v24.2d, v4.2d, v14.d[0]
|
||||
fmla v25.2d, v5.2d, v14.d[0]
|
||||
fmla v26.2d, v4.2d, v14.d[1]
|
||||
fmla v27.2d, v5.2d, v14.d[1]
|
||||
|
||||
fmla v28.2d, v4.2d, v15.2d[0]
|
||||
fmla v29.2d, v5.2d, v15.2d[0]
|
||||
fmla v30.2d, v4.2d, v15.2d[1]
|
||||
fmla v31.2d, v5.2d, v15.2d[1]
|
||||
fmla v28.2d, v4.2d, v15.d[0]
|
||||
fmla v29.2d, v5.2d, v15.d[0]
|
||||
fmla v30.2d, v4.2d, v15.d[1]
|
||||
fmla v31.2d, v5.2d, v15.d[1]
|
||||
|
||||
ld1 {v8.2d, v9.2d}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
|
|
@ -248,25 +248,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL4x8_E
|
||||
fmla v16.2d, v4.2d, v12.2d[0]
|
||||
fmla v17.2d, v5.2d, v12.2d[0]
|
||||
fmla v18.2d, v4.2d, v12.2d[1]
|
||||
fmla v19.2d, v5.2d, v12.2d[1]
|
||||
fmla v16.2d, v4.2d, v12.d[0]
|
||||
fmla v17.2d, v5.2d, v12.d[0]
|
||||
fmla v18.2d, v4.2d, v12.d[1]
|
||||
fmla v19.2d, v5.2d, v12.d[1]
|
||||
|
||||
fmla v20.2d, v4.2d, v13.2d[0]
|
||||
fmla v21.2d, v5.2d, v13.2d[0]
|
||||
fmla v22.2d, v4.2d, v13.2d[1]
|
||||
fmla v23.2d, v5.2d, v13.2d[1]
|
||||
fmla v20.2d, v4.2d, v13.d[0]
|
||||
fmla v21.2d, v5.2d, v13.d[0]
|
||||
fmla v22.2d, v4.2d, v13.d[1]
|
||||
fmla v23.2d, v5.2d, v13.d[1]
|
||||
|
||||
fmla v24.2d, v4.2d, v14.2d[0]
|
||||
fmla v25.2d, v5.2d, v14.2d[0]
|
||||
fmla v26.2d, v4.2d, v14.2d[1]
|
||||
fmla v27.2d, v5.2d, v14.2d[1]
|
||||
fmla v24.2d, v4.2d, v14.d[0]
|
||||
fmla v25.2d, v5.2d, v14.d[0]
|
||||
fmla v26.2d, v4.2d, v14.d[1]
|
||||
fmla v27.2d, v5.2d, v14.d[1]
|
||||
|
||||
fmla v28.2d, v4.2d, v15.2d[0]
|
||||
fmla v29.2d, v5.2d, v15.2d[0]
|
||||
fmla v30.2d, v4.2d, v15.2d[1]
|
||||
fmla v31.2d, v5.2d, v15.2d[1]
|
||||
fmla v28.2d, v4.2d, v15.d[0]
|
||||
fmla v29.2d, v5.2d, v15.d[0]
|
||||
fmla v30.2d, v4.2d, v15.d[1]
|
||||
fmla v31.2d, v5.2d, v15.d[1]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x8_SUB
|
||||
|
|
@ -277,25 +277,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v10.2d, v11.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v18.2d, v0.2d, v8.2d[1]
|
||||
fmla v19.2d, v1.2d, v8.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
fmla v18.2d, v0.2d, v8.d[1]
|
||||
fmla v19.2d, v1.2d, v8.d[1]
|
||||
|
||||
fmla v20.2d, v0.2d, v9.2d[0]
|
||||
fmla v21.2d, v1.2d, v9.2d[0]
|
||||
fmla v22.2d, v0.2d, v9.2d[1]
|
||||
fmla v23.2d, v1.2d, v9.2d[1]
|
||||
fmla v20.2d, v0.2d, v9.d[0]
|
||||
fmla v21.2d, v1.2d, v9.d[0]
|
||||
fmla v22.2d, v0.2d, v9.d[1]
|
||||
fmla v23.2d, v1.2d, v9.d[1]
|
||||
|
||||
fmla v24.2d, v0.2d, v10.2d[0]
|
||||
fmla v25.2d, v1.2d, v10.2d[0]
|
||||
fmla v26.2d, v0.2d, v10.2d[1]
|
||||
fmla v27.2d, v1.2d, v10.2d[1]
|
||||
fmla v24.2d, v0.2d, v10.d[0]
|
||||
fmla v25.2d, v1.2d, v10.d[0]
|
||||
fmla v26.2d, v0.2d, v10.d[1]
|
||||
fmla v27.2d, v1.2d, v10.d[1]
|
||||
|
||||
fmla v28.2d, v0.2d, v11.2d[0]
|
||||
fmla v29.2d, v1.2d, v11.2d[0]
|
||||
fmla v30.2d, v0.2d, v11.2d[1]
|
||||
fmla v31.2d, v1.2d, v11.2d[1]
|
||||
fmla v28.2d, v0.2d, v11.d[0]
|
||||
fmla v29.2d, v1.2d, v11.d[0]
|
||||
fmla v30.2d, v0.2d, v11.d[1]
|
||||
fmla v31.2d, v1.2d, v11.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x8
|
||||
|
|
@ -369,17 +369,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v10.2d, v11.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v18.2d, v0.2d, v8.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v18.2d, v0.2d, v8.d[1]
|
||||
|
||||
fmla v20.2d, v0.2d, v9.2d[0]
|
||||
fmla v22.2d, v0.2d, v9.2d[1]
|
||||
fmla v20.2d, v0.2d, v9.d[0]
|
||||
fmla v22.2d, v0.2d, v9.d[1]
|
||||
|
||||
fmla v24.2d, v0.2d, v10.2d[0]
|
||||
fmla v26.2d, v0.2d, v10.2d[1]
|
||||
fmla v24.2d, v0.2d, v10.d[0]
|
||||
fmla v26.2d, v0.2d, v10.d[1]
|
||||
|
||||
fmla v28.2d, v0.2d, v11.2d[0]
|
||||
fmla v30.2d, v0.2d, v11.2d[1]
|
||||
fmla v28.2d, v0.2d, v11.d[0]
|
||||
fmla v30.2d, v0.2d, v11.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x8
|
||||
|
|
@ -499,17 +499,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v16.2d, v0.2d, v8.2d[0]
|
||||
fmul v29.2d, v1.2d, v9.2d[1]
|
||||
fmul v16.2d, v0.2d, v8.d[0]
|
||||
fmul v29.2d, v1.2d, v9.d[1]
|
||||
|
||||
fmul v20.2d, v0.2d, v8.2d[1]
|
||||
fmul v25.2d, v1.2d, v9.2d[0]
|
||||
fmul v20.2d, v0.2d, v8.d[1]
|
||||
fmul v25.2d, v1.2d, v9.d[0]
|
||||
|
||||
fmul v24.2d, v0.2d, v9.2d[0]
|
||||
fmul v21.2d, v1.2d, v8.2d[1]
|
||||
fmul v24.2d, v0.2d, v9.d[0]
|
||||
fmul v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
fmul v28.2d, v0.2d, v9.2d[1]
|
||||
fmul v17.2d, v1.2d, v8.2d[0]
|
||||
fmul v28.2d, v0.2d, v9.d[1]
|
||||
fmul v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
ld1 {v12.2d, v13.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
|
@ -518,61 +518,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL4x4_M1
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v29.2d, v1.2d, v9.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v29.2d, v1.2d, v9.d[1]
|
||||
|
||||
ld1 {v12.2d, v13.2d}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v25.2d, v1.2d, v9.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v25.2d, v1.2d, v9.d[0]
|
||||
|
||||
ld1 {v4.2d, v5.2d}, [pA] // For next round
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v24.2d, v0.2d, v9.2d[0]
|
||||
fmla v21.2d, v1.2d, v8.2d[1]
|
||||
fmla v24.2d, v0.2d, v9.d[0]
|
||||
fmla v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
|
||||
fmla v28.2d, v0.2d, v9.2d[1]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v28.2d, v0.2d, v9.d[1]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_M2
|
||||
fmla v16.2d, v4.2d, v12.2d[0]
|
||||
fmla v29.2d, v5.2d, v13.2d[1]
|
||||
fmla v16.2d, v4.2d, v12.d[0]
|
||||
fmla v29.2d, v5.2d, v13.d[1]
|
||||
|
||||
ld1 {v8.2d, v9.2d}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
|
||||
fmla v20.2d, v4.2d, v12.2d[1]
|
||||
fmla v25.2d, v5.2d, v13.2d[0]
|
||||
fmla v20.2d, v4.2d, v12.d[1]
|
||||
fmla v25.2d, v5.2d, v13.d[0]
|
||||
|
||||
ld1 {v0.2d, v1.2d}, [pA] // For next round
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v24.2d, v4.2d, v13.2d[0]
|
||||
fmla v21.2d, v5.2d, v12.2d[1]
|
||||
fmla v24.2d, v4.2d, v13.d[0]
|
||||
fmla v21.2d, v5.2d, v12.d[1]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
|
||||
fmla v28.2d, v4.2d, v13.2d[1]
|
||||
fmla v17.2d, v5.2d, v12.2d[0]
|
||||
fmla v28.2d, v4.2d, v13.d[1]
|
||||
fmla v17.2d, v5.2d, v12.d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_E
|
||||
fmla v16.2d, v4.2d, v12.2d[0]
|
||||
fmla v29.2d, v5.2d, v13.2d[1]
|
||||
fmla v16.2d, v4.2d, v12.d[0]
|
||||
fmla v29.2d, v5.2d, v13.d[1]
|
||||
|
||||
fmla v20.2d, v4.2d, v12.2d[1]
|
||||
fmla v25.2d, v5.2d, v13.2d[0]
|
||||
fmla v20.2d, v4.2d, v12.d[1]
|
||||
fmla v25.2d, v5.2d, v13.d[0]
|
||||
|
||||
fmla v24.2d, v4.2d, v13.2d[0]
|
||||
fmla v21.2d, v5.2d, v12.2d[1]
|
||||
fmla v24.2d, v4.2d, v13.d[0]
|
||||
fmla v21.2d, v5.2d, v12.d[1]
|
||||
|
||||
fmla v28.2d, v4.2d, v13.2d[1]
|
||||
fmla v17.2d, v5.2d, v12.2d[0]
|
||||
fmla v28.2d, v4.2d, v13.d[1]
|
||||
fmla v17.2d, v5.2d, v12.d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_SUB
|
||||
|
|
@ -581,17 +581,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v29.2d, v1.2d, v9.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v29.2d, v1.2d, v9.d[1]
|
||||
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v25.2d, v1.2d, v9.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v25.2d, v1.2d, v9.d[0]
|
||||
|
||||
fmla v24.2d, v0.2d, v9.2d[0]
|
||||
fmla v21.2d, v1.2d, v8.2d[1]
|
||||
fmla v24.2d, v0.2d, v9.d[0]
|
||||
fmla v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
fmla v28.2d, v0.2d, v9.2d[1]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v28.2d, v0.2d, v9.d[1]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
|
|
@ -635,10 +635,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v24.2d, v0.2d, v9.2d[0]
|
||||
fmla v28.2d, v0.2d, v9.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v24.2d, v0.2d, v9.d[0]
|
||||
fmla v28.2d, v0.2d, v9.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
|
|
@ -713,10 +713,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v21.2d, v1.2d, v8.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v21.2d, v1.2d, v8.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
|
|
@ -747,8 +747,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
|
|
@ -776,7 +776,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ldr d0 , [pA]
|
||||
add pA, pA, #8
|
||||
|
||||
fmla v16.2d, v8.2d, v0.2d[0]
|
||||
fmla v16.2d, v8.2d, v0.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
|
|
@ -803,8 +803,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA , pA, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x1
|
||||
|
|
@ -831,7 +831,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d}, [pA]
|
||||
add pA , pA, #16
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x1
|
||||
|
|
|
|||
|
|
@ -157,25 +157,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v16.2d, v0.2d, v8.2d[0]
|
||||
fmul v17.2d, v1.2d, v8.2d[0]
|
||||
fmul v18.2d, v2.2d, v8.2d[0]
|
||||
fmul v19.2d, v3.2d, v8.2d[0]
|
||||
fmul v16.2d, v0.2d, v8.d[0]
|
||||
fmul v17.2d, v1.2d, v8.d[0]
|
||||
fmul v18.2d, v2.2d, v8.d[0]
|
||||
fmul v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
fmul v20.2d, v0.2d, v8.2d[1]
|
||||
fmul v21.2d, v1.2d, v8.2d[1]
|
||||
fmul v22.2d, v2.2d, v8.2d[1]
|
||||
fmul v23.2d, v3.2d, v8.2d[1]
|
||||
fmul v20.2d, v0.2d, v8.d[1]
|
||||
fmul v21.2d, v1.2d, v8.d[1]
|
||||
fmul v22.2d, v2.2d, v8.d[1]
|
||||
fmul v23.2d, v3.2d, v8.d[1]
|
||||
|
||||
fmul v24.2d, v0.2d, v9.2d[0]
|
||||
fmul v25.2d, v1.2d, v9.2d[0]
|
||||
fmul v26.2d, v2.2d, v9.2d[0]
|
||||
fmul v27.2d, v3.2d, v9.2d[0]
|
||||
fmul v24.2d, v0.2d, v9.d[0]
|
||||
fmul v25.2d, v1.2d, v9.d[0]
|
||||
fmul v26.2d, v2.2d, v9.d[0]
|
||||
fmul v27.2d, v3.2d, v9.d[0]
|
||||
|
||||
fmul v28.2d, v0.2d, v9.2d[1]
|
||||
fmul v29.2d, v1.2d, v9.2d[1]
|
||||
fmul v30.2d, v2.2d, v9.2d[1]
|
||||
fmul v31.2d, v3.2d, v9.2d[1]
|
||||
fmul v28.2d, v0.2d, v9.d[1]
|
||||
fmul v29.2d, v1.2d, v9.d[1]
|
||||
fmul v30.2d, v2.2d, v9.d[1]
|
||||
fmul v31.2d, v3.2d, v9.d[1]
|
||||
|
||||
ld1 {v4.2d, v5.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
|
@ -186,25 +186,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL8x4_M1
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v18.2d, v2.2d, v8.2d[0]
|
||||
fmla v19.2d, v3.2d, v8.2d[0]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
fmla v18.2d, v2.2d, v8.d[0]
|
||||
fmla v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v21.2d, v1.2d, v8.2d[1]
|
||||
fmla v22.2d, v2.2d, v8.2d[1]
|
||||
fmla v23.2d, v3.2d, v8.2d[1]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v21.2d, v1.2d, v8.d[1]
|
||||
fmla v22.2d, v2.2d, v8.d[1]
|
||||
fmla v23.2d, v3.2d, v8.d[1]
|
||||
|
||||
fmla v24.2d, v0.2d, v9.2d[0]
|
||||
fmla v25.2d, v1.2d, v9.2d[0]
|
||||
fmla v26.2d, v2.2d, v9.2d[0]
|
||||
fmla v27.2d, v3.2d, v9.2d[0]
|
||||
fmla v24.2d, v0.2d, v9.d[0]
|
||||
fmla v25.2d, v1.2d, v9.d[0]
|
||||
fmla v26.2d, v2.2d, v9.d[0]
|
||||
fmla v27.2d, v3.2d, v9.d[0]
|
||||
|
||||
fmla v28.2d, v0.2d, v9.2d[1]
|
||||
fmla v29.2d, v1.2d, v9.2d[1]
|
||||
fmla v30.2d, v2.2d, v9.2d[1]
|
||||
fmla v31.2d, v3.2d, v9.2d[1]
|
||||
fmla v28.2d, v0.2d, v9.d[1]
|
||||
fmla v29.2d, v1.2d, v9.d[1]
|
||||
fmla v30.2d, v2.2d, v9.d[1]
|
||||
fmla v31.2d, v3.2d, v9.d[1]
|
||||
|
||||
ld1 {v4.2d, v5.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
|
@ -217,25 +217,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL8x4_M2
|
||||
fmla v16.2d, v4.2d, v12.2d[0]
|
||||
fmla v17.2d, v5.2d, v12.2d[0]
|
||||
fmla v18.2d, v6.2d, v12.2d[0]
|
||||
fmla v19.2d, v7.2d, v12.2d[0]
|
||||
fmla v16.2d, v4.2d, v12.d[0]
|
||||
fmla v17.2d, v5.2d, v12.d[0]
|
||||
fmla v18.2d, v6.2d, v12.d[0]
|
||||
fmla v19.2d, v7.2d, v12.d[0]
|
||||
|
||||
fmla v20.2d, v4.2d, v12.2d[1]
|
||||
fmla v21.2d, v5.2d, v12.2d[1]
|
||||
fmla v22.2d, v6.2d, v12.2d[1]
|
||||
fmla v23.2d, v7.2d, v12.2d[1]
|
||||
fmla v20.2d, v4.2d, v12.d[1]
|
||||
fmla v21.2d, v5.2d, v12.d[1]
|
||||
fmla v22.2d, v6.2d, v12.d[1]
|
||||
fmla v23.2d, v7.2d, v12.d[1]
|
||||
|
||||
fmla v24.2d, v4.2d, v13.2d[0]
|
||||
fmla v25.2d, v5.2d, v13.2d[0]
|
||||
fmla v26.2d, v6.2d, v13.2d[0]
|
||||
fmla v27.2d, v7.2d, v13.2d[0]
|
||||
fmla v24.2d, v4.2d, v13.d[0]
|
||||
fmla v25.2d, v5.2d, v13.d[0]
|
||||
fmla v26.2d, v6.2d, v13.d[0]
|
||||
fmla v27.2d, v7.2d, v13.d[0]
|
||||
|
||||
fmla v28.2d, v4.2d, v13.2d[1]
|
||||
fmla v29.2d, v5.2d, v13.2d[1]
|
||||
fmla v30.2d, v6.2d, v13.2d[1]
|
||||
fmla v31.2d, v7.2d, v13.2d[1]
|
||||
fmla v28.2d, v4.2d, v13.d[1]
|
||||
fmla v29.2d, v5.2d, v13.d[1]
|
||||
fmla v30.2d, v6.2d, v13.d[1]
|
||||
fmla v31.2d, v7.2d, v13.d[1]
|
||||
|
||||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
|
@ -248,25 +248,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL8x4_E
|
||||
fmla v16.2d, v4.2d, v12.2d[0]
|
||||
fmla v17.2d, v5.2d, v12.2d[0]
|
||||
fmla v18.2d, v6.2d, v12.2d[0]
|
||||
fmla v19.2d, v7.2d, v12.2d[0]
|
||||
fmla v16.2d, v4.2d, v12.d[0]
|
||||
fmla v17.2d, v5.2d, v12.d[0]
|
||||
fmla v18.2d, v6.2d, v12.d[0]
|
||||
fmla v19.2d, v7.2d, v12.d[0]
|
||||
|
||||
fmla v20.2d, v4.2d, v12.2d[1]
|
||||
fmla v21.2d, v5.2d, v12.2d[1]
|
||||
fmla v22.2d, v6.2d, v12.2d[1]
|
||||
fmla v23.2d, v7.2d, v12.2d[1]
|
||||
fmla v20.2d, v4.2d, v12.d[1]
|
||||
fmla v21.2d, v5.2d, v12.d[1]
|
||||
fmla v22.2d, v6.2d, v12.d[1]
|
||||
fmla v23.2d, v7.2d, v12.d[1]
|
||||
|
||||
fmla v24.2d, v4.2d, v13.2d[0]
|
||||
fmla v25.2d, v5.2d, v13.2d[0]
|
||||
fmla v26.2d, v6.2d, v13.2d[0]
|
||||
fmla v27.2d, v7.2d, v13.2d[0]
|
||||
fmla v24.2d, v4.2d, v13.d[0]
|
||||
fmla v25.2d, v5.2d, v13.d[0]
|
||||
fmla v26.2d, v6.2d, v13.d[0]
|
||||
fmla v27.2d, v7.2d, v13.d[0]
|
||||
|
||||
fmla v28.2d, v4.2d, v13.2d[1]
|
||||
fmla v29.2d, v5.2d, v13.2d[1]
|
||||
fmla v30.2d, v6.2d, v13.2d[1]
|
||||
fmla v31.2d, v7.2d, v13.2d[1]
|
||||
fmla v28.2d, v4.2d, v13.d[1]
|
||||
fmla v29.2d, v5.2d, v13.d[1]
|
||||
fmla v30.2d, v6.2d, v13.d[1]
|
||||
fmla v31.2d, v7.2d, v13.d[1]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_SUB
|
||||
|
|
@ -277,25 +277,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v18.2d, v2.2d, v8.2d[0]
|
||||
fmla v19.2d, v3.2d, v8.2d[0]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
fmla v18.2d, v2.2d, v8.d[0]
|
||||
fmla v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v21.2d, v1.2d, v8.2d[1]
|
||||
fmla v22.2d, v2.2d, v8.2d[1]
|
||||
fmla v23.2d, v3.2d, v8.2d[1]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v21.2d, v1.2d, v8.d[1]
|
||||
fmla v22.2d, v2.2d, v8.d[1]
|
||||
fmla v23.2d, v3.2d, v8.d[1]
|
||||
|
||||
fmla v24.2d, v0.2d, v9.2d[0]
|
||||
fmla v25.2d, v1.2d, v9.2d[0]
|
||||
fmla v26.2d, v2.2d, v9.2d[0]
|
||||
fmla v27.2d, v3.2d, v9.2d[0]
|
||||
fmla v24.2d, v0.2d, v9.d[0]
|
||||
fmla v25.2d, v1.2d, v9.d[0]
|
||||
fmla v26.2d, v2.2d, v9.d[0]
|
||||
fmla v27.2d, v3.2d, v9.d[0]
|
||||
|
||||
fmla v28.2d, v0.2d, v9.2d[1]
|
||||
fmla v29.2d, v1.2d, v9.2d[1]
|
||||
fmla v30.2d, v2.2d, v9.2d[1]
|
||||
fmla v31.2d, v3.2d, v9.2d[1]
|
||||
fmla v28.2d, v0.2d, v9.d[1]
|
||||
fmla v29.2d, v1.2d, v9.d[1]
|
||||
fmla v30.2d, v2.2d, v9.d[1]
|
||||
fmla v31.2d, v3.2d, v9.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x4
|
||||
|
|
@ -351,17 +351,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v29.2d, v1.2d, v9.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v29.2d, v1.2d, v9.d[1]
|
||||
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v25.2d, v1.2d, v9.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v25.2d, v1.2d, v9.d[0]
|
||||
|
||||
fmla v24.2d, v0.2d, v9.2d[0]
|
||||
fmla v21.2d, v1.2d, v8.2d[1]
|
||||
fmla v24.2d, v0.2d, v9.d[0]
|
||||
fmla v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
fmla v28.2d, v0.2d, v9.2d[1]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v28.2d, v0.2d, v9.d[1]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
|
|
@ -406,10 +406,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v24.2d, v0.2d, v9.2d[0]
|
||||
fmla v28.2d, v0.2d, v9.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v24.2d, v0.2d, v9.d[0]
|
||||
fmla v28.2d, v0.2d, v9.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
|
|
@ -490,15 +490,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v18.2d, v2.2d, v8.2d[0]
|
||||
fmla v19.2d, v3.2d, v8.2d[0]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
fmla v18.2d, v2.2d, v8.d[0]
|
||||
fmla v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v21.2d, v1.2d, v8.2d[1]
|
||||
fmla v22.2d, v2.2d, v8.2d[1]
|
||||
fmla v23.2d, v3.2d, v8.2d[1]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v21.2d, v1.2d, v8.d[1]
|
||||
fmla v22.2d, v2.2d, v8.d[1]
|
||||
fmla v23.2d, v3.2d, v8.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x2
|
||||
|
|
@ -534,10 +534,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v21.2d, v1.2d, v8.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
fmla v21.2d, v1.2d, v8.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
|
|
@ -568,8 +568,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v20.2d, v0.2d, v8.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
|
|
@ -597,7 +597,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ldr d0 , [pA]
|
||||
add pA, pA, #8
|
||||
|
||||
fmla v16.2d, v8.2d, v0.2d[0]
|
||||
fmla v16.2d, v8.2d, v0.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
|
|
@ -629,10 +629,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v18.2d, v2.2d, v8.2d[0]
|
||||
fmla v19.2d, v3.2d, v8.2d[0]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
fmla v18.2d, v2.2d, v8.d[0]
|
||||
fmla v19.2d, v3.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x1
|
||||
|
|
@ -660,8 +660,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
add pA , pA, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
fmla v17.2d, v1.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x1
|
||||
|
|
@ -686,7 +686,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2d}, [pA]
|
||||
add pA , pA, #16
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v16.2d, v0.2d, v8.d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x1
|
||||
|
|
|
|||
|
|
@ -158,25 +158,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v3.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmul v16.4s, v0.4s, v8.2s[0]
|
||||
fmul v17.4s, v1.4s, v8.2s[0]
|
||||
fmul v18.4s, v2.4s, v8.2s[0]
|
||||
fmul v19.4s, v3.4s, v8.2s[0]
|
||||
fmul v16.4s, v0.4s, v8.s[0]
|
||||
fmul v17.4s, v1.4s, v8.s[0]
|
||||
fmul v18.4s, v2.4s, v8.s[0]
|
||||
fmul v19.4s, v3.4s, v8.s[0]
|
||||
|
||||
fmul v20.4s, v0.4s, v8.2s[1]
|
||||
fmul v21.4s, v1.4s, v8.2s[1]
|
||||
fmul v22.4s, v2.4s, v8.2s[1]
|
||||
fmul v23.4s, v3.4s, v8.2s[1]
|
||||
fmul v20.4s, v0.4s, v8.s[1]
|
||||
fmul v21.4s, v1.4s, v8.s[1]
|
||||
fmul v22.4s, v2.4s, v8.s[1]
|
||||
fmul v23.4s, v3.4s, v8.s[1]
|
||||
|
||||
fmul v24.4s, v0.4s, v9.2s[0]
|
||||
fmul v25.4s, v1.4s, v9.2s[0]
|
||||
fmul v26.4s, v2.4s, v9.2s[0]
|
||||
fmul v27.4s, v3.4s, v9.2s[0]
|
||||
fmul v24.4s, v0.4s, v9.s[0]
|
||||
fmul v25.4s, v1.4s, v9.s[0]
|
||||
fmul v26.4s, v2.4s, v9.s[0]
|
||||
fmul v27.4s, v3.4s, v9.s[0]
|
||||
|
||||
fmul v28.4s, v0.4s, v9.2s[1]
|
||||
fmul v29.4s, v1.4s, v9.2s[1]
|
||||
fmul v30.4s, v2.4s, v9.2s[1]
|
||||
fmul v31.4s, v3.4s, v9.2s[1]
|
||||
fmul v28.4s, v0.4s, v9.s[1]
|
||||
fmul v29.4s, v1.4s, v9.s[1]
|
||||
fmul v30.4s, v2.4s, v9.s[1]
|
||||
fmul v31.4s, v3.4s, v9.s[1]
|
||||
|
||||
ld1 {v12.2s, v13.2s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -191,25 +191,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL16x4_M1
|
||||
fmla v16.4s, v0.4s, v8.2s[0]
|
||||
fmla v17.4s, v1.4s, v8.2s[0]
|
||||
fmla v18.4s, v2.4s, v8.2s[0]
|
||||
fmla v19.4s, v3.4s, v8.2s[0]
|
||||
fmla v16.4s, v0.4s, v8.s[0]
|
||||
fmla v17.4s, v1.4s, v8.s[0]
|
||||
fmla v18.4s, v2.4s, v8.s[0]
|
||||
fmla v19.4s, v3.4s, v8.s[0]
|
||||
|
||||
fmla v20.4s, v0.4s, v8.2s[1]
|
||||
fmla v21.4s, v1.4s, v8.2s[1]
|
||||
fmla v22.4s, v2.4s, v8.2s[1]
|
||||
fmla v23.4s, v3.4s, v8.2s[1]
|
||||
fmla v20.4s, v0.4s, v8.s[1]
|
||||
fmla v21.4s, v1.4s, v8.s[1]
|
||||
fmla v22.4s, v2.4s, v8.s[1]
|
||||
fmla v23.4s, v3.4s, v8.s[1]
|
||||
|
||||
fmla v24.4s, v0.4s, v9.2s[0]
|
||||
fmla v25.4s, v1.4s, v9.2s[0]
|
||||
fmla v26.4s, v2.4s, v9.2s[0]
|
||||
fmla v27.4s, v3.4s, v9.2s[0]
|
||||
fmla v24.4s, v0.4s, v9.s[0]
|
||||
fmla v25.4s, v1.4s, v9.s[0]
|
||||
fmla v26.4s, v2.4s, v9.s[0]
|
||||
fmla v27.4s, v3.4s, v9.s[0]
|
||||
|
||||
fmla v28.4s, v0.4s, v9.2s[1]
|
||||
fmla v29.4s, v1.4s, v9.2s[1]
|
||||
fmla v30.4s, v2.4s, v9.2s[1]
|
||||
fmla v31.4s, v3.4s, v9.2s[1]
|
||||
fmla v28.4s, v0.4s, v9.s[1]
|
||||
fmla v29.4s, v1.4s, v9.s[1]
|
||||
fmla v30.4s, v2.4s, v9.s[1]
|
||||
fmla v31.4s, v3.4s, v9.s[1]
|
||||
|
||||
ld1 {v12.2s, v13.2s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -224,25 +224,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL16x4_M2
|
||||
fmla v16.4s, v4.4s, v12.2s[0]
|
||||
fmla v17.4s, v5.4s, v12.2s[0]
|
||||
fmla v18.4s, v6.4s, v12.2s[0]
|
||||
fmla v19.4s, v7.4s, v12.2s[0]
|
||||
fmla v16.4s, v4.4s, v12.s[0]
|
||||
fmla v17.4s, v5.4s, v12.s[0]
|
||||
fmla v18.4s, v6.4s, v12.s[0]
|
||||
fmla v19.4s, v7.4s, v12.s[0]
|
||||
|
||||
fmla v20.4s, v4.4s, v12.2s[1]
|
||||
fmla v21.4s, v5.4s, v12.2s[1]
|
||||
fmla v22.4s, v6.4s, v12.2s[1]
|
||||
fmla v23.4s, v7.4s, v12.2s[1]
|
||||
fmla v20.4s, v4.4s, v12.s[1]
|
||||
fmla v21.4s, v5.4s, v12.s[1]
|
||||
fmla v22.4s, v6.4s, v12.s[1]
|
||||
fmla v23.4s, v7.4s, v12.s[1]
|
||||
|
||||
fmla v24.4s, v4.4s, v13.2s[0]
|
||||
fmla v25.4s, v5.4s, v13.2s[0]
|
||||
fmla v26.4s, v6.4s, v13.2s[0]
|
||||
fmla v27.4s, v7.4s, v13.2s[0]
|
||||
fmla v24.4s, v4.4s, v13.s[0]
|
||||
fmla v25.4s, v5.4s, v13.s[0]
|
||||
fmla v26.4s, v6.4s, v13.s[0]
|
||||
fmla v27.4s, v7.4s, v13.s[0]
|
||||
|
||||
fmla v28.4s, v4.4s, v13.2s[1]
|
||||
fmla v29.4s, v5.4s, v13.2s[1]
|
||||
fmla v30.4s, v6.4s, v13.2s[1]
|
||||
fmla v31.4s, v7.4s, v13.2s[1]
|
||||
fmla v28.4s, v4.4s, v13.s[1]
|
||||
fmla v29.4s, v5.4s, v13.s[1]
|
||||
fmla v30.4s, v6.4s, v13.s[1]
|
||||
fmla v31.4s, v7.4s, v13.s[1]
|
||||
|
||||
ld1 {v8.2s, v9.2s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -257,25 +257,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
// KERNEL16x4_E: sixteen FMLA accumulations into v16-v31, multiplying v4-v7
// (four .4s vectors) by lanes 0 and 1 of v12 and v13. Unlike the
// KERNEL16x4_M1/_M2 macros in this listing it issues no loads.
// NOTE(review): every instruction appears twice, first with the `vN.2s[i]`
// lane spelling and then with `vN.s[i]`. The `@ -a,b +c,d @@` markers around
// this text suggest a rendered diff (old form, then new form) rather than
// literal macro text -- confirm before assembling.
.macro KERNEL16x4_E
|
||||
// v16-v19 += v4-v7 * v12 lane 0
fmla v16.4s, v4.4s, v12.2s[0]
|
||||
fmla v17.4s, v5.4s, v12.2s[0]
|
||||
fmla v18.4s, v6.4s, v12.2s[0]
|
||||
fmla v19.4s, v7.4s, v12.2s[0]
|
||||
fmla v16.4s, v4.4s, v12.s[0]
|
||||
fmla v17.4s, v5.4s, v12.s[0]
|
||||
fmla v18.4s, v6.4s, v12.s[0]
|
||||
fmla v19.4s, v7.4s, v12.s[0]
|
||||
|
||||
// v20-v23 += v4-v7 * v12 lane 1
fmla v20.4s, v4.4s, v12.2s[1]
|
||||
fmla v21.4s, v5.4s, v12.2s[1]
|
||||
fmla v22.4s, v6.4s, v12.2s[1]
|
||||
fmla v23.4s, v7.4s, v12.2s[1]
|
||||
fmla v20.4s, v4.4s, v12.s[1]
|
||||
fmla v21.4s, v5.4s, v12.s[1]
|
||||
fmla v22.4s, v6.4s, v12.s[1]
|
||||
fmla v23.4s, v7.4s, v12.s[1]
|
||||
|
||||
// v24-v27 += v4-v7 * v13 lane 0
fmla v24.4s, v4.4s, v13.2s[0]
|
||||
fmla v25.4s, v5.4s, v13.2s[0]
|
||||
fmla v26.4s, v6.4s, v13.2s[0]
|
||||
fmla v27.4s, v7.4s, v13.2s[0]
|
||||
fmla v24.4s, v4.4s, v13.s[0]
|
||||
fmla v25.4s, v5.4s, v13.s[0]
|
||||
fmla v26.4s, v6.4s, v13.s[0]
|
||||
fmla v27.4s, v7.4s, v13.s[0]
|
||||
|
||||
// v28-v31 += v4-v7 * v13 lane 1
fmla v28.4s, v4.4s, v13.2s[1]
|
||||
fmla v29.4s, v5.4s, v13.2s[1]
|
||||
fmla v30.4s, v6.4s, v13.2s[1]
|
||||
fmla v31.4s, v7.4s, v13.2s[1]
|
||||
fmla v28.4s, v4.4s, v13.s[1]
|
||||
fmla v29.4s, v5.4s, v13.s[1]
|
||||
fmla v30.4s, v6.4s, v13.s[1]
|
||||
fmla v31.4s, v7.4s, v13.s[1]
|
||||
.endm
|
||||
|
||||
.macro KERNEL16x4_SUB
|
||||
|
|
@ -290,25 +290,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v3.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.4s, v0.4s, v8.2s[0]
|
||||
fmla v17.4s, v1.4s, v8.2s[0]
|
||||
fmla v18.4s, v2.4s, v8.2s[0]
|
||||
fmla v19.4s, v3.4s, v8.2s[0]
|
||||
fmla v16.4s, v0.4s, v8.s[0]
|
||||
fmla v17.4s, v1.4s, v8.s[0]
|
||||
fmla v18.4s, v2.4s, v8.s[0]
|
||||
fmla v19.4s, v3.4s, v8.s[0]
|
||||
|
||||
fmla v20.4s, v0.4s, v8.2s[1]
|
||||
fmla v21.4s, v1.4s, v8.2s[1]
|
||||
fmla v22.4s, v2.4s, v8.2s[1]
|
||||
fmla v23.4s, v3.4s, v8.2s[1]
|
||||
fmla v20.4s, v0.4s, v8.s[1]
|
||||
fmla v21.4s, v1.4s, v8.s[1]
|
||||
fmla v22.4s, v2.4s, v8.s[1]
|
||||
fmla v23.4s, v3.4s, v8.s[1]
|
||||
|
||||
fmla v24.4s, v0.4s, v9.2s[0]
|
||||
fmla v25.4s, v1.4s, v9.2s[0]
|
||||
fmla v26.4s, v2.4s, v9.2s[0]
|
||||
fmla v27.4s, v3.4s, v9.2s[0]
|
||||
fmla v24.4s, v0.4s, v9.s[0]
|
||||
fmla v25.4s, v1.4s, v9.s[0]
|
||||
fmla v26.4s, v2.4s, v9.s[0]
|
||||
fmla v27.4s, v3.4s, v9.s[0]
|
||||
|
||||
fmla v28.4s, v0.4s, v9.2s[1]
|
||||
fmla v29.4s, v1.4s, v9.2s[1]
|
||||
fmla v30.4s, v2.4s, v9.2s[1]
|
||||
fmla v31.4s, v3.4s, v9.2s[1]
|
||||
fmla v28.4s, v0.4s, v9.s[1]
|
||||
fmla v29.4s, v1.4s, v9.s[1]
|
||||
fmla v30.4s, v2.4s, v9.s[1]
|
||||
fmla v31.4s, v3.4s, v9.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE16x4
|
||||
|
|
@ -370,14 +370,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v1.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmul v16.4s, v0.4s, v8.2s[0]
|
||||
fmul v17.4s, v1.4s, v8.2s[0]
|
||||
fmul v20.4s, v0.4s, v8.2s[1]
|
||||
fmul v21.4s, v1.4s, v8.2s[1]
|
||||
fmul v24.4s, v0.4s, v9.2s[0]
|
||||
fmul v25.4s, v1.4s, v9.2s[0]
|
||||
fmul v28.4s, v0.4s, v9.2s[1]
|
||||
fmul v29.4s, v1.4s, v9.2s[1]
|
||||
fmul v16.4s, v0.4s, v8.s[0]
|
||||
fmul v17.4s, v1.4s, v8.s[0]
|
||||
fmul v20.4s, v0.4s, v8.s[1]
|
||||
fmul v21.4s, v1.4s, v8.s[1]
|
||||
fmul v24.4s, v0.4s, v9.s[0]
|
||||
fmul v25.4s, v1.4s, v9.s[0]
|
||||
fmul v28.4s, v0.4s, v9.s[1]
|
||||
fmul v29.4s, v1.4s, v9.s[1]
|
||||
|
||||
ld1 {v12.2s, v13.2s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -388,14 +388,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL8x4_M1
|
||||
fmla v16.4s, v0.4s, v8.2s[0]
|
||||
fmla v17.4s, v1.4s, v8.2s[0]
|
||||
fmla v20.4s, v0.4s, v8.2s[1]
|
||||
fmla v21.4s, v1.4s, v8.2s[1]
|
||||
fmla v24.4s, v0.4s, v9.2s[0]
|
||||
fmla v25.4s, v1.4s, v9.2s[0]
|
||||
fmla v28.4s, v0.4s, v9.2s[1]
|
||||
fmla v29.4s, v1.4s, v9.2s[1]
|
||||
fmla v16.4s, v0.4s, v8.s[0]
|
||||
fmla v17.4s, v1.4s, v8.s[0]
|
||||
fmla v20.4s, v0.4s, v8.s[1]
|
||||
fmla v21.4s, v1.4s, v8.s[1]
|
||||
fmla v24.4s, v0.4s, v9.s[0]
|
||||
fmla v25.4s, v1.4s, v9.s[0]
|
||||
fmla v28.4s, v0.4s, v9.s[1]
|
||||
fmla v29.4s, v1.4s, v9.s[1]
|
||||
|
||||
ld1 {v12.2s, v13.2s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -406,14 +406,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL8x4_M2
|
||||
fmla v16.4s, v4.4s, v12.2s[0]
|
||||
fmla v17.4s, v5.4s, v12.2s[0]
|
||||
fmla v20.4s, v4.4s, v12.2s[1]
|
||||
fmla v21.4s, v5.4s, v12.2s[1]
|
||||
fmla v24.4s, v4.4s, v13.2s[0]
|
||||
fmla v25.4s, v5.4s, v13.2s[0]
|
||||
fmla v28.4s, v4.4s, v13.2s[1]
|
||||
fmla v29.4s, v5.4s, v13.2s[1]
|
||||
fmla v16.4s, v4.4s, v12.s[0]
|
||||
fmla v17.4s, v5.4s, v12.s[0]
|
||||
fmla v20.4s, v4.4s, v12.s[1]
|
||||
fmla v21.4s, v5.4s, v12.s[1]
|
||||
fmla v24.4s, v4.4s, v13.s[0]
|
||||
fmla v25.4s, v5.4s, v13.s[0]
|
||||
fmla v28.4s, v4.4s, v13.s[1]
|
||||
fmla v29.4s, v5.4s, v13.s[1]
|
||||
|
||||
ld1 {v8.2s, v9.2s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -424,14 +424,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
// KERNEL8x4_E: eight FMLA accumulations into v16/v17, v20/v21, v24/v25 and
// v28/v29, multiplying v4 and v5 (.4s) by lanes 0 and 1 of v12 and v13.
// It performs no loads and no pointer updates.
// NOTE(review): the first eight lines use the `vN.2s[i]` lane spelling and
// the next eight repeat them as `vN.s[i]`; this looks like the old/new sides
// of a rendered diff rather than code meant to run twice -- confirm.
.macro KERNEL8x4_E
|
||||
fmla v16.4s, v4.4s, v12.2s[0]
|
||||
fmla v17.4s, v5.4s, v12.2s[0]
|
||||
fmla v20.4s, v4.4s, v12.2s[1]
|
||||
fmla v21.4s, v5.4s, v12.2s[1]
|
||||
fmla v24.4s, v4.4s, v13.2s[0]
|
||||
fmla v25.4s, v5.4s, v13.2s[0]
|
||||
fmla v28.4s, v4.4s, v13.2s[1]
|
||||
fmla v29.4s, v5.4s, v13.2s[1]
|
||||
// Same eight operations written with the `.s[i]` element form.
fmla v16.4s, v4.4s, v12.s[0]
|
||||
fmla v17.4s, v5.4s, v12.s[0]
|
||||
fmla v20.4s, v4.4s, v12.s[1]
|
||||
fmla v21.4s, v5.4s, v12.s[1]
|
||||
fmla v24.4s, v4.4s, v13.s[0]
|
||||
fmla v25.4s, v5.4s, v13.s[0]
|
||||
fmla v28.4s, v4.4s, v13.s[1]
|
||||
fmla v29.4s, v5.4s, v13.s[1]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_SUB
|
||||
|
|
@ -442,14 +442,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v1.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.4s, v0.4s, v8.2s[0]
|
||||
fmla v17.4s, v1.4s, v8.2s[0]
|
||||
fmla v20.4s, v0.4s, v8.2s[1]
|
||||
fmla v21.4s, v1.4s, v8.2s[1]
|
||||
fmla v24.4s, v0.4s, v9.2s[0]
|
||||
fmla v25.4s, v1.4s, v9.2s[0]
|
||||
fmla v28.4s, v0.4s, v9.2s[1]
|
||||
fmla v29.4s, v1.4s, v9.2s[1]
|
||||
fmla v16.4s, v0.4s, v8.s[0]
|
||||
fmla v17.4s, v1.4s, v8.s[0]
|
||||
fmla v20.4s, v0.4s, v8.s[1]
|
||||
fmla v21.4s, v1.4s, v8.s[1]
|
||||
fmla v24.4s, v0.4s, v9.s[0]
|
||||
fmla v25.4s, v1.4s, v9.s[0]
|
||||
fmla v28.4s, v0.4s, v9.s[1]
|
||||
fmla v29.4s, v1.4s, v9.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x4
|
||||
|
|
@ -501,17 +501,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s, v1.2s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmul v16.2s, v0.2s, v8.2s[0]
|
||||
fmul v29.2s, v1.2s, v9.2s[1]
|
||||
fmul v16.2s, v0.2s, v8.s[0]
|
||||
fmul v29.2s, v1.2s, v9.s[1]
|
||||
|
||||
fmul v20.2s, v0.2s, v8.2s[1]
|
||||
fmul v25.2s, v1.2s, v9.2s[0]
|
||||
fmul v20.2s, v0.2s, v8.s[1]
|
||||
fmul v25.2s, v1.2s, v9.s[0]
|
||||
|
||||
fmul v24.2s, v0.2s, v9.2s[0]
|
||||
fmul v21.2s, v1.2s, v8.2s[1]
|
||||
fmul v24.2s, v0.2s, v9.s[0]
|
||||
fmul v21.2s, v1.2s, v8.s[1]
|
||||
|
||||
fmul v28.2s, v0.2s, v9.2s[1]
|
||||
fmul v17.2s, v1.2s, v8.2s[0]
|
||||
fmul v28.2s, v0.2s, v9.s[1]
|
||||
fmul v17.2s, v1.2s, v8.s[0]
|
||||
|
||||
ld1 {v12.2s, v13.2s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -520,61 +520,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
// KERNEL4x4_M1: accumulates into v16, v17, v20, v21, v24, v25, v28, v29 from
// v0/v1 (.2s) times lanes of v8/v9, while loading v12/v13 from pB and v4/v5
// from pA "for next round" (each pointer advanced by 16 bytes) and
// prefetching [pB, #512].
// NOTE(review): each FMLA pair appears twice (`vN.2s[i]` then `vN.s[i]`);
// this looks like the old/new sides of a rendered diff -- confirm.
.macro KERNEL4x4_M1
|
||||
// v16 += v0 * v8[0]; v29 += v1 * v9[1]
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v29.2s, v1.2s, v9.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v29.2s, v1.2s, v9.s[1]
|
||||
|
||||
ld1 {v12.2s, v13.2s}, [pB] // For next round
|
||||
add pB, pB, #16
|
||||
|
||||
// v20 += v0 * v8[1]; v25 += v1 * v9[0]
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v25.2s, v1.2s, v9.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
fmla v25.2s, v1.2s, v9.s[0]
|
||||
|
||||
ld1 {v4.2s, v5.2s}, [pA] // For next round
|
||||
add pA, pA, #16
|
||||
|
||||
// v24 += v0 * v9[0]; v21 += v1 * v8[1]
fmla v24.2s, v0.2s, v9.2s[0]
|
||||
fmla v21.2s, v1.2s, v8.2s[1]
|
||||
fmla v24.2s, v0.2s, v9.s[0]
|
||||
fmla v21.2s, v1.2s, v8.s[1]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
|
||||
// v28 += v0 * v9[1]; v17 += v1 * v8[0]
fmla v28.2s, v0.2s, v9.2s[1]
|
||||
fmla v17.2s, v1.2s, v8.2s[0]
|
||||
fmla v28.2s, v0.2s, v9.s[1]
|
||||
fmla v17.2s, v1.2s, v8.s[0]
|
||||
.endm
|
||||
|
||||
// KERNEL4x4_M2: counterpart of KERNEL4x4_M1 with the register roles swapped.
// It accumulates from v4/v5 (.2s) times lanes of v12/v13, while loading
// v8/v9 from pB and v0/v1 from pA "for next round" (each pointer advanced by
// 16 bytes) and prefetching [pA, #512].
// NOTE(review): each FMLA pair appears twice (`vN.2s[i]` then `vN.s[i]`);
// this looks like the old/new sides of a rendered diff -- confirm.
.macro KERNEL4x4_M2
|
||||
// v16 += v4 * v12[0]; v29 += v5 * v13[1]
fmla v16.2s, v4.2s, v12.2s[0]
|
||||
fmla v29.2s, v5.2s, v13.2s[1]
|
||||
fmla v16.2s, v4.2s, v12.s[0]
|
||||
fmla v29.2s, v5.2s, v13.s[1]
|
||||
|
||||
ld1 {v8.2s, v9.2s}, [pB] // For next round
|
||||
add pB, pB, #16
|
||||
|
||||
// v20 += v4 * v12[1]; v25 += v5 * v13[0]
fmla v20.2s, v4.2s, v12.2s[1]
|
||||
fmla v25.2s, v5.2s, v13.2s[0]
|
||||
fmla v20.2s, v4.2s, v12.s[1]
|
||||
fmla v25.2s, v5.2s, v13.s[0]
|
||||
|
||||
ld1 {v0.2s, v1.2s}, [pA] // For next round
|
||||
add pA, pA, #16
|
||||
|
||||
// v24 += v4 * v13[0]; v21 += v5 * v12[1]
fmla v24.2s, v4.2s, v13.2s[0]
|
||||
fmla v21.2s, v5.2s, v12.2s[1]
|
||||
fmla v24.2s, v4.2s, v13.s[0]
|
||||
fmla v21.2s, v5.2s, v12.s[1]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
|
||||
// v28 += v4 * v13[1]; v17 += v5 * v12[0]
fmla v28.2s, v4.2s, v13.2s[1]
|
||||
fmla v17.2s, v5.2s, v12.2s[0]
|
||||
fmla v28.2s, v4.2s, v13.s[1]
|
||||
fmla v17.2s, v5.2s, v12.s[0]
|
||||
.endm
|
||||
|
||||
// KERNEL4x4_E: the same eight accumulations as KERNEL4x4_M2 (v4/v5 times
// lanes of v12/v13 into v16, v17, v20, v21, v24, v25, v28, v29) but with no
// loads, pointer updates or prefetches.
// NOTE(review): each FMLA pair appears twice (`vN.2s[i]` then `vN.s[i]`);
// this looks like the old/new sides of a rendered diff -- confirm.
.macro KERNEL4x4_E
|
||||
// v16 += v4 * v12[0]; v29 += v5 * v13[1]
fmla v16.2s, v4.2s, v12.2s[0]
|
||||
fmla v29.2s, v5.2s, v13.2s[1]
|
||||
fmla v16.2s, v4.2s, v12.s[0]
|
||||
fmla v29.2s, v5.2s, v13.s[1]
|
||||
|
||||
// v20 += v4 * v12[1]; v25 += v5 * v13[0]
fmla v20.2s, v4.2s, v12.2s[1]
|
||||
fmla v25.2s, v5.2s, v13.2s[0]
|
||||
fmla v20.2s, v4.2s, v12.s[1]
|
||||
fmla v25.2s, v5.2s, v13.s[0]
|
||||
|
||||
// v24 += v4 * v13[0]; v21 += v5 * v12[1]
fmla v24.2s, v4.2s, v13.2s[0]
|
||||
fmla v21.2s, v5.2s, v12.2s[1]
|
||||
fmla v24.2s, v4.2s, v13.s[0]
|
||||
fmla v21.2s, v5.2s, v12.s[1]
|
||||
|
||||
// v28 += v4 * v13[1]; v17 += v5 * v12[0]
fmla v28.2s, v4.2s, v13.2s[1]
|
||||
fmla v17.2s, v5.2s, v12.2s[0]
|
||||
fmla v28.2s, v4.2s, v13.s[1]
|
||||
fmla v17.2s, v5.2s, v12.s[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_SUB
|
||||
|
|
@ -583,17 +583,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s, v1.2s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v29.2s, v1.2s, v9.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v29.2s, v1.2s, v9.s[1]
|
||||
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v25.2s, v1.2s, v9.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
fmla v25.2s, v1.2s, v9.s[0]
|
||||
|
||||
fmla v24.2s, v0.2s, v9.2s[0]
|
||||
fmla v21.2s, v1.2s, v8.2s[1]
|
||||
fmla v24.2s, v0.2s, v9.s[0]
|
||||
fmla v21.2s, v1.2s, v8.s[1]
|
||||
|
||||
fmla v28.2s, v0.2s, v9.2s[1]
|
||||
fmla v17.2s, v1.2s, v8.2s[0]
|
||||
fmla v28.2s, v0.2s, v9.s[1]
|
||||
fmla v17.2s, v1.2s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
|
|
@ -638,10 +638,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s}, [pA]
|
||||
add pA, pA, #8
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v24.2s, v0.2s, v9.2s[0]
|
||||
fmla v28.2s, v0.2s, v9.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
fmla v24.2s, v0.2s, v9.s[0]
|
||||
fmla v28.2s, v0.2s, v9.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
|
|
@ -729,15 +729,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v3.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.4s, v0.4s, v8.2s[0]
|
||||
fmla v17.4s, v1.4s, v8.2s[0]
|
||||
fmla v18.4s, v2.4s, v8.2s[0]
|
||||
fmla v19.4s, v3.4s, v8.2s[0]
|
||||
fmla v16.4s, v0.4s, v8.s[0]
|
||||
fmla v17.4s, v1.4s, v8.s[0]
|
||||
fmla v18.4s, v2.4s, v8.s[0]
|
||||
fmla v19.4s, v3.4s, v8.s[0]
|
||||
|
||||
fmla v20.4s, v0.4s, v8.2s[1]
|
||||
fmla v21.4s, v1.4s, v8.2s[1]
|
||||
fmla v22.4s, v2.4s, v8.2s[1]
|
||||
fmla v23.4s, v3.4s, v8.2s[1]
|
||||
fmla v20.4s, v0.4s, v8.s[1]
|
||||
fmla v21.4s, v1.4s, v8.s[1]
|
||||
fmla v22.4s, v2.4s, v8.s[1]
|
||||
fmla v23.4s, v3.4s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE16x2
|
||||
|
|
@ -777,11 +777,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v1.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.4s, v0.4s, v8.2s[0]
|
||||
fmla v17.4s, v1.4s, v8.2s[0]
|
||||
fmla v16.4s, v0.4s, v8.s[0]
|
||||
fmla v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
fmla v20.4s, v0.4s, v8.2s[1]
|
||||
fmla v21.4s, v1.4s, v8.2s[1]
|
||||
fmla v20.4s, v0.4s, v8.s[1]
|
||||
fmla v21.4s, v1.4s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x2
|
||||
|
|
@ -817,10 +817,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s, v1.2s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v17.2s, v1.2s, v8.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v21.2s, v1.2s, v8.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v17.2s, v1.2s, v8.s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
fmla v21.2s, v1.2s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
|
|
@ -852,8 +852,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s}, [pA]
|
||||
add pA, pA, #8
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
|
|
@ -882,7 +882,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ldr s0 , [pA]
|
||||
add pA, pA, #4
|
||||
|
||||
fmla v16.2s, v8.2s, v0.2s[0]
|
||||
fmla v16.2s, v8.2s, v0.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
|
|
@ -918,10 +918,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v3.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.4s, v0.4s, v8.2s[0]
|
||||
fmla v17.4s, v1.4s, v8.2s[0]
|
||||
fmla v18.4s, v2.4s, v8.2s[0]
|
||||
fmla v19.4s, v3.4s, v8.2s[0]
|
||||
fmla v16.4s, v0.4s, v8.s[0]
|
||||
fmla v17.4s, v1.4s, v8.s[0]
|
||||
fmla v18.4s, v2.4s, v8.s[0]
|
||||
fmla v19.4s, v3.4s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE16x1
|
||||
|
|
@ -951,8 +951,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v1.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.4s, v0.4s, v8.2s[0]
|
||||
fmla v17.4s, v1.4s, v8.2s[0]
|
||||
fmla v16.4s, v0.4s, v8.s[0]
|
||||
fmla v17.4s, v1.4s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x1
|
||||
|
|
@ -978,8 +978,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s, v1.2s}, [pA]
|
||||
add pA , pA, #16
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v17.2s, v1.2s, v8.2s[0]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v17.2s, v1.2s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x1
|
||||
|
|
@ -1004,7 +1004,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s}, [pA]
|
||||
add pA , pA, #8
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x1
|
||||
|
|
|
|||
|
|
@ -192,164 +192,164 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.4s}, [pA_0]
|
||||
add pA_0, pA_0, #16
|
||||
|
||||
fmul v16.4s, v0.4s, v8.4s[0]
|
||||
fmul v20.4s, v0.4s, v8.4s[1]
|
||||
fmul v16.4s, v0.4s, v8.s[0]
|
||||
fmul v20.4s, v0.4s, v8.s[1]
|
||||
|
||||
ld1 {v2.4s}, [pA_1]
|
||||
add pA_1, pA_1, #16
|
||||
|
||||
fmul v24.4s, v0.4s, v8.4s[2]
|
||||
fmul v28.4s, v0.4s, v8.4s[3]
|
||||
fmul v24.4s, v0.4s, v8.s[2]
|
||||
fmul v28.4s, v0.4s, v8.s[3]
|
||||
|
||||
ld1 {v4.4s}, [pA_2]
|
||||
add pA_2, pA_2, #16
|
||||
|
||||
fmul v17.4s, v2.4s, v8.4s[0]
|
||||
fmul v21.4s, v2.4s, v8.4s[1]
|
||||
fmul v17.4s, v2.4s, v8.s[0]
|
||||
fmul v21.4s, v2.4s, v8.s[1]
|
||||
|
||||
ld1 {v6.4s}, [pA_3]
|
||||
add pA_3, pA_3, #16
|
||||
|
||||
fmul v25.4s, v2.4s, v8.4s[2]
|
||||
fmul v29.4s, v2.4s, v8.4s[3]
|
||||
fmul v25.4s, v2.4s, v8.s[2]
|
||||
fmul v29.4s, v2.4s, v8.s[3]
|
||||
|
||||
ld1 {v12.4s}, [pB] // for next round
|
||||
add pB, pB, #16
|
||||
|
||||
fmul v18.4s, v4.4s, v8.4s[0]
|
||||
fmul v19.4s, v6.4s, v8.4s[0]
|
||||
fmul v18.4s, v4.4s, v8.s[0]
|
||||
fmul v19.4s, v6.4s, v8.s[0]
|
||||
|
||||
ld1 {v1.4s}, [pA_0] // for next round
|
||||
add pA_0, pA_0, #16
|
||||
|
||||
fmul v22.4s, v4.4s, v8.4s[1]
|
||||
fmul v23.4s, v6.4s, v8.4s[1]
|
||||
fmul v22.4s, v4.4s, v8.s[1]
|
||||
fmul v23.4s, v6.4s, v8.s[1]
|
||||
|
||||
ld1 {v3.4s}, [pA_1] // for next round
|
||||
add pA_1, pA_1, #16
|
||||
|
||||
fmul v26.4s, v4.4s, v8.4s[2]
|
||||
fmul v27.4s, v6.4s, v8.4s[2]
|
||||
fmul v26.4s, v4.4s, v8.s[2]
|
||||
fmul v27.4s, v6.4s, v8.s[2]
|
||||
|
||||
ld1 {v5.4s}, [pA_2] // for next round
|
||||
add pA_2, pA_2, #16
|
||||
|
||||
fmul v30.4s, v4.4s, v8.4s[3]
|
||||
fmul v31.4s, v6.4s, v8.4s[3]
|
||||
fmul v30.4s, v4.4s, v8.s[3]
|
||||
fmul v31.4s, v6.4s, v8.s[3]
|
||||
|
||||
ld1 {v7.4s}, [pA_3] // for next round
|
||||
add pA_3, pA_3, #16
|
||||
.endm
|
||||
|
||||
// KERNEL16x4_M2: accumulates into v16-v31 from v1, v3, v5, v7 (.4s) times
// lanes 0-3 of v12, interleaved with loads "for next round": v8 from pB and
// v0, v2, v4, v6 from pA_0..pA_3 (each pointer advanced by 16 bytes). It also
// prefetches [pA_2, #512], [pA_3, #512] and [pB, #512].
// NOTE(review): each FMLA pair appears twice (`v12.4s[i]` then `v12.s[i]`);
// this looks like the old/new sides of a rendered diff -- confirm.
.macro KERNEL16x4_M2
|
||||
// v16/v17 += v1/v3 * v12[0]
fmla v16.4s, v1.4s, v12.4s[0]
|
||||
fmla v17.4s, v3.4s, v12.4s[0]
|
||||
fmla v16.4s, v1.4s, v12.s[0]
|
||||
fmla v17.4s, v3.4s, v12.s[0]
|
||||
|
||||
ld1 {v8.4s}, [pB] // for next round
|
||||
add pB, pB, #16
|
||||
|
||||
// v18/v19 += v5/v7 * v12[0]
fmla v18.4s, v5.4s, v12.4s[0]
|
||||
fmla v19.4s, v7.4s, v12.4s[0]
|
||||
fmla v18.4s, v5.4s, v12.s[0]
|
||||
fmla v19.4s, v7.4s, v12.s[0]
|
||||
|
||||
ld1 {v0.4s}, [pA_0] // for next round
|
||||
add pA_0, pA_0, #16
|
||||
|
||||
// v20/v21 += v1/v3 * v12[1]
fmla v20.4s, v1.4s, v12.4s[1]
|
||||
fmla v21.4s, v3.4s, v12.4s[1]
|
||||
fmla v20.4s, v1.4s, v12.s[1]
|
||||
fmla v21.4s, v3.4s, v12.s[1]
|
||||
|
||||
ld1 {v2.4s}, [pA_1] // for next round
|
||||
add pA_1, pA_1, #16
|
||||
|
||||
// v22/v23 += v5/v7 * v12[1]
fmla v22.4s, v5.4s, v12.4s[1]
|
||||
fmla v23.4s, v7.4s, v12.4s[1]
|
||||
fmla v22.4s, v5.4s, v12.s[1]
|
||||
fmla v23.4s, v7.4s, v12.s[1]
|
||||
|
||||
ld1 {v4.4s}, [pA_2] // for next round
|
||||
add pA_2, pA_2, #16
|
||||
|
||||
// v24/v25 += v1/v3 * v12[2]
fmla v24.4s, v1.4s, v12.4s[2]
|
||||
fmla v25.4s, v3.4s, v12.4s[2]
|
||||
fmla v24.4s, v1.4s, v12.s[2]
|
||||
fmla v25.4s, v3.4s, v12.s[2]
|
||||
|
||||
ld1 {v6.4s}, [pA_3] // for next round
|
||||
add pA_3, pA_3, #16
|
||||
|
||||
// v26/v27 += v5/v7 * v12[2]
fmla v26.4s, v5.4s, v12.4s[2]
|
||||
fmla v27.4s, v7.4s, v12.4s[2]
|
||||
fmla v26.4s, v5.4s, v12.s[2]
|
||||
fmla v27.4s, v7.4s, v12.s[2]
|
||||
|
||||
prfm PLDL1KEEP, [pA_2, #512]
|
||||
|
||||
// v28/v29 += v1/v3 * v12[3]
fmla v28.4s, v1.4s, v12.4s[3]
|
||||
fmla v29.4s, v3.4s, v12.4s[3]
|
||||
fmla v28.4s, v1.4s, v12.s[3]
|
||||
fmla v29.4s, v3.4s, v12.s[3]
|
||||
|
||||
prfm PLDL1KEEP, [pA_3, #512]
|
||||
|
||||
// v30/v31 += v5/v7 * v12[3]
fmla v30.4s, v5.4s, v12.4s[3]
|
||||
fmla v31.4s, v7.4s, v12.4s[3]
|
||||
fmla v30.4s, v5.4s, v12.s[3]
|
||||
fmla v31.4s, v7.4s, v12.s[3]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
.endm
|
||||
|
||||
// KERNEL16x4_M1: accumulates into v16-v31 from v0, v2, v4, v6 (.4s) times
// lanes 0-3 of v8, interleaved with loads "for next round": v12 from pB and
// v1, v3, v5, v7 from pA_0..pA_3 (each pointer advanced by 16 bytes). It also
// prefetches [pA_0, #512] and [pA_1, #512].
// NOTE(review): each FMLA pair appears twice (`v8.4s[i]` then `v8.s[i]`);
// this looks like the old/new sides of a rendered diff -- confirm.
.macro KERNEL16x4_M1
|
||||
// v16/v17 += v0/v2 * v8[0]
fmla v16.4s, v0.4s, v8.4s[0]
|
||||
fmla v17.4s, v2.4s, v8.4s[0]
|
||||
fmla v16.4s, v0.4s, v8.s[0]
|
||||
fmla v17.4s, v2.4s, v8.s[0]
|
||||
|
||||
ld1 {v12.4s}, [pB] // for next round
|
||||
add pB, pB, #16
|
||||
|
||||
// v18/v19 += v4/v6 * v8[0]
fmla v18.4s, v4.4s, v8.4s[0]
|
||||
fmla v19.4s, v6.4s, v8.4s[0]
|
||||
fmla v18.4s, v4.4s, v8.s[0]
|
||||
fmla v19.4s, v6.4s, v8.s[0]
|
||||
|
||||
ld1 {v1.4s}, [pA_0] // for next round
|
||||
add pA_0, pA_0, #16
|
||||
|
||||
// v20/v21 += v0/v2 * v8[1]
fmla v20.4s, v0.4s, v8.4s[1]
|
||||
fmla v21.4s, v2.4s, v8.4s[1]
|
||||
fmla v20.4s, v0.4s, v8.s[1]
|
||||
fmla v21.4s, v2.4s, v8.s[1]
|
||||
|
||||
ld1 {v3.4s}, [pA_1] // for next round
|
||||
add pA_1, pA_1, #16
|
||||
|
||||
// v22/v23 += v4/v6 * v8[1]
fmla v22.4s, v4.4s, v8.4s[1]
|
||||
fmla v23.4s, v6.4s, v8.4s[1]
|
||||
fmla v22.4s, v4.4s, v8.s[1]
|
||||
fmla v23.4s, v6.4s, v8.s[1]
|
||||
|
||||
ld1 {v5.4s}, [pA_2] // for next round
|
||||
add pA_2, pA_2, #16
|
||||
|
||||
// v24/v25 += v0/v2 * v8[2]
fmla v24.4s, v0.4s, v8.4s[2]
|
||||
fmla v25.4s, v2.4s, v8.4s[2]
|
||||
fmla v24.4s, v0.4s, v8.s[2]
|
||||
fmla v25.4s, v2.4s, v8.s[2]
|
||||
|
||||
ld1 {v7.4s}, [pA_3] // for next round
|
||||
add pA_3, pA_3, #16
|
||||
|
||||
// v26/v27 += v4/v6 * v8[2]
fmla v26.4s, v4.4s, v8.4s[2]
|
||||
fmla v27.4s, v6.4s, v8.4s[2]
|
||||
fmla v26.4s, v4.4s, v8.s[2]
|
||||
fmla v27.4s, v6.4s, v8.s[2]
|
||||
|
||||
prfm PLDL1KEEP, [pA_0, #512]
|
||||
|
||||
// v28/v29 += v0/v2 * v8[3]
fmla v28.4s, v0.4s, v8.4s[3]
|
||||
fmla v29.4s, v2.4s, v8.4s[3]
|
||||
fmla v28.4s, v0.4s, v8.s[3]
|
||||
fmla v29.4s, v2.4s, v8.s[3]
|
||||
|
||||
prfm PLDL1KEEP, [pA_1, #512]
|
||||
|
||||
// v30/v31 += v4/v6 * v8[3]
fmla v30.4s, v4.4s, v8.4s[3]
|
||||
fmla v31.4s, v6.4s, v8.4s[3]
|
||||
fmla v30.4s, v4.4s, v8.s[3]
|
||||
fmla v31.4s, v6.4s, v8.s[3]
|
||||
.endm
|
||||
|
||||
// KERNEL16x4_E: sixteen FMLA accumulations into v16-v31 from v1, v3, v5, v7
// (.4s) times lanes 0-3 of v12. It performs no loads, pointer updates or
// prefetches.
// NOTE(review): the first sixteen lines use the `v12.4s[i]` lane spelling and
// the next sixteen repeat them as `v12.s[i]`; this looks like the old/new
// sides of a rendered diff rather than code meant to run twice -- confirm.
.macro KERNEL16x4_E
|
||||
fmla v16.4s, v1.4s, v12.4s[0]
|
||||
fmla v17.4s, v3.4s, v12.4s[0]
|
||||
fmla v18.4s, v5.4s, v12.4s[0]
|
||||
fmla v19.4s, v7.4s, v12.4s[0]
|
||||
fmla v20.4s, v1.4s, v12.4s[1]
|
||||
fmla v21.4s, v3.4s, v12.4s[1]
|
||||
fmla v22.4s, v5.4s, v12.4s[1]
|
||||
fmla v23.4s, v7.4s, v12.4s[1]
|
||||
fmla v24.4s, v1.4s, v12.4s[2]
|
||||
fmla v25.4s, v3.4s, v12.4s[2]
|
||||
fmla v26.4s, v5.4s, v12.4s[2]
|
||||
fmla v27.4s, v7.4s, v12.4s[2]
|
||||
fmla v28.4s, v1.4s, v12.4s[3]
|
||||
fmla v29.4s, v3.4s, v12.4s[3]
|
||||
fmla v30.4s, v5.4s, v12.4s[3]
|
||||
fmla v31.4s, v7.4s, v12.4s[3]
|
||||
// Same sixteen operations written with the `.s[i]` element form.
fmla v16.4s, v1.4s, v12.s[0]
|
||||
fmla v17.4s, v3.4s, v12.s[0]
|
||||
fmla v18.4s, v5.4s, v12.s[0]
|
||||
fmla v19.4s, v7.4s, v12.s[0]
|
||||
fmla v20.4s, v1.4s, v12.s[1]
|
||||
fmla v21.4s, v3.4s, v12.s[1]
|
||||
fmla v22.4s, v5.4s, v12.s[1]
|
||||
fmla v23.4s, v7.4s, v12.s[1]
|
||||
fmla v24.4s, v1.4s, v12.s[2]
|
||||
fmla v25.4s, v3.4s, v12.s[2]
|
||||
fmla v26.4s, v5.4s, v12.s[2]
|
||||
fmla v27.4s, v7.4s, v12.s[2]
|
||||
fmla v28.4s, v1.4s, v12.s[3]
|
||||
fmla v29.4s, v3.4s, v12.s[3]
|
||||
fmla v30.4s, v5.4s, v12.s[3]
|
||||
fmla v31.4s, v7.4s, v12.s[3]
|
||||
.endm
|
||||
|
||||
.macro KERNEL16x4_SUB
|
||||
|
|
@ -359,34 +359,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.4s}, [pA_0]
|
||||
add pA_0, pA_0, #16
|
||||
|
||||
fmla v16.4s, v0.4s, v8.4s[0]
|
||||
fmla v20.4s, v0.4s, v8.4s[1]
|
||||
fmla v24.4s, v0.4s, v8.4s[2]
|
||||
fmla v28.4s, v0.4s, v8.4s[3]
|
||||
fmla v16.4s, v0.4s, v8.s[0]
|
||||
fmla v20.4s, v0.4s, v8.s[1]
|
||||
fmla v24.4s, v0.4s, v8.s[2]
|
||||
fmla v28.4s, v0.4s, v8.s[3]
|
||||
|
||||
ld1 {v2.4s}, [pA_1]
|
||||
add pA_1, pA_1, #16
|
||||
|
||||
fmla v17.4s, v2.4s, v8.4s[0]
|
||||
fmla v21.4s, v2.4s, v8.4s[1]
|
||||
fmla v25.4s, v2.4s, v8.4s[2]
|
||||
fmla v29.4s, v2.4s, v8.4s[3]
|
||||
fmla v17.4s, v2.4s, v8.s[0]
|
||||
fmla v21.4s, v2.4s, v8.s[1]
|
||||
fmla v25.4s, v2.4s, v8.s[2]
|
||||
fmla v29.4s, v2.4s, v8.s[3]
|
||||
|
||||
ld1 {v4.4s}, [pA_2]
|
||||
add pA_2, pA_2, #16
|
||||
|
||||
fmla v18.4s, v4.4s, v8.4s[0]
|
||||
fmla v22.4s, v4.4s, v8.4s[1]
|
||||
fmla v26.4s, v4.4s, v8.4s[2]
|
||||
fmla v30.4s, v4.4s, v8.4s[3]
|
||||
fmla v18.4s, v4.4s, v8.s[0]
|
||||
fmla v22.4s, v4.4s, v8.s[1]
|
||||
fmla v26.4s, v4.4s, v8.s[2]
|
||||
fmla v30.4s, v4.4s, v8.s[3]
|
||||
|
||||
ld1 {v6.4s}, [pA_3]
|
||||
add pA_3, pA_3, #16
|
||||
|
||||
fmla v19.4s, v6.4s, v8.4s[0]
|
||||
fmla v23.4s, v6.4s, v8.4s[1]
|
||||
fmla v27.4s, v6.4s, v8.4s[2]
|
||||
fmla v31.4s, v6.4s, v8.4s[3]
|
||||
fmla v19.4s, v6.4s, v8.s[0]
|
||||
fmla v23.4s, v6.4s, v8.s[1]
|
||||
fmla v27.4s, v6.4s, v8.s[2]
|
||||
fmla v31.4s, v6.4s, v8.s[3]
|
||||
.endm
|
||||
|
||||
.macro SAVE16x4
|
||||
|
|
@ -456,28 +456,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s, v1.2s}, [pA_0]
|
||||
add pA_0, pA_0, #16
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v29.2s, v1.2s, v9.2s[1]
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v25.2s, v1.2s, v9.2s[0]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v29.2s, v1.2s, v9.s[1]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
fmla v25.2s, v1.2s, v9.s[0]
|
||||
|
||||
ld1 {v2.2s, v3.2s}, [pA_1]
|
||||
add pA_1, pA_1, #16
|
||||
|
||||
fmla v24.2s, v0.2s, v9.2s[0]
|
||||
fmla v21.2s, v1.2s, v8.2s[1]
|
||||
fmla v28.2s, v0.2s, v9.2s[1]
|
||||
fmla v17.2s, v1.2s, v8.2s[0]
|
||||
fmla v24.2s, v0.2s, v9.s[0]
|
||||
fmla v21.2s, v1.2s, v8.s[1]
|
||||
fmla v28.2s, v0.2s, v9.s[1]
|
||||
fmla v17.2s, v1.2s, v8.s[0]
|
||||
|
||||
fmla v18.2s, v2.2s, v8.2s[0]
|
||||
fmla v31.2s, v3.2s, v9.2s[1]
|
||||
fmla v22.2s, v2.2s, v8.2s[1]
|
||||
fmla v27.2s, v3.2s, v9.2s[0]
|
||||
fmla v18.2s, v2.2s, v8.s[0]
|
||||
fmla v31.2s, v3.2s, v9.s[1]
|
||||
fmla v22.2s, v2.2s, v8.s[1]
|
||||
fmla v27.2s, v3.2s, v9.s[0]
|
||||
|
||||
fmla v26.2s, v2.2s, v9.2s[0]
|
||||
fmla v23.2s, v3.2s, v8.2s[1]
|
||||
fmla v30.2s, v2.2s, v9.2s[1]
|
||||
fmla v19.2s, v3.2s, v8.2s[0]
|
||||
fmla v26.2s, v2.2s, v9.s[0]
|
||||
fmla v23.2s, v3.2s, v8.s[1]
|
||||
fmla v30.2s, v2.2s, v9.s[1]
|
||||
fmla v19.2s, v3.2s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x4
|
||||
|
|
@ -556,17 +556,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s, v1.2s}, [pA_0]
|
||||
add pA_0, pA_0, #16
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v29.2s, v1.2s, v9.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v29.2s, v1.2s, v9.s[1]
|
||||
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v25.2s, v1.2s, v9.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
fmla v25.2s, v1.2s, v9.s[0]
|
||||
|
||||
fmla v24.2s, v0.2s, v9.2s[0]
|
||||
fmla v21.2s, v1.2s, v8.2s[1]
|
||||
fmla v24.2s, v0.2s, v9.s[0]
|
||||
fmla v21.2s, v1.2s, v8.s[1]
|
||||
|
||||
fmla v28.2s, v0.2s, v9.2s[1]
|
||||
fmla v17.2s, v1.2s, v8.2s[0]
|
||||
fmla v28.2s, v0.2s, v9.s[1]
|
||||
fmla v17.2s, v1.2s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
|
|
@ -614,10 +614,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s}, [pA_0]
|
||||
add pA_0, pA_0, #8
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v24.2s, v0.2s, v9.2s[0]
|
||||
fmla v28.2s, v0.2s, v9.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
fmla v24.2s, v0.2s, v9.s[0]
|
||||
fmla v28.2s, v0.2s, v9.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
|
|
@ -700,10 +700,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s, v1.2s}, [pA_0]
|
||||
add pA_0, pA_0, #16
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v17.2s, v1.2s, v8.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v21.2s, v1.2s, v8.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v17.2s, v1.2s, v8.s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
fmla v21.2s, v1.2s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
|
|
@ -736,8 +736,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s}, [pA_0]
|
||||
add pA_0, pA_0, #8
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
|
|
@ -767,7 +767,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ldr s0 , [pA_0]
|
||||
add pA_0, pA_0, #4
|
||||
|
||||
fmla v16.2s, v8.2s, v0.2s[0]
|
||||
fmla v16.2s, v8.2s, v0.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
|
|
@ -796,8 +796,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s, v1.2s}, [pA_0]
|
||||
add pA_0 , pA_0, #16
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v17.2s, v1.2s, v8.2s[0]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v17.2s, v1.2s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x1
|
||||
|
|
@ -825,7 +825,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s}, [pA_0]
|
||||
add pA_0 , pA_0, #8
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x1
|
||||
|
|
|
|||
|
|
@ -157,22 +157,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v1.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmul v16.4s, v0.4s, v4.4s[0]
|
||||
fmul v17.4s, v1.4s, v4.4s[0]
|
||||
fmul v18.4s, v0.4s, v4.4s[1]
|
||||
fmul v19.4s, v1.4s, v4.4s[1]
|
||||
fmul v20.4s, v0.4s, v4.4s[2]
|
||||
fmul v21.4s, v1.4s, v4.4s[2]
|
||||
fmul v22.4s, v0.4s, v4.4s[3]
|
||||
fmul v23.4s, v1.4s, v4.4s[3]
|
||||
fmul v24.4s, v0.4s, v5.4s[0]
|
||||
fmul v25.4s, v1.4s, v5.4s[0]
|
||||
fmul v26.4s, v0.4s, v5.4s[1]
|
||||
fmul v27.4s, v1.4s, v5.4s[1]
|
||||
fmul v28.4s, v0.4s, v5.4s[2]
|
||||
fmul v29.4s, v1.4s, v5.4s[2]
|
||||
fmul v30.4s, v0.4s, v5.4s[3]
|
||||
fmul v31.4s, v1.4s, v5.4s[3]
|
||||
fmul v16.4s, v0.4s, v4.s[0]
|
||||
fmul v17.4s, v1.4s, v4.s[0]
|
||||
fmul v18.4s, v0.4s, v4.s[1]
|
||||
fmul v19.4s, v1.4s, v4.s[1]
|
||||
fmul v20.4s, v0.4s, v4.s[2]
|
||||
fmul v21.4s, v1.4s, v4.s[2]
|
||||
fmul v22.4s, v0.4s, v4.s[3]
|
||||
fmul v23.4s, v1.4s, v4.s[3]
|
||||
fmul v24.4s, v0.4s, v5.s[0]
|
||||
fmul v25.4s, v1.4s, v5.s[0]
|
||||
fmul v26.4s, v0.4s, v5.s[1]
|
||||
fmul v27.4s, v1.4s, v5.s[1]
|
||||
fmul v28.4s, v0.4s, v5.s[2]
|
||||
fmul v29.4s, v1.4s, v5.s[2]
|
||||
fmul v30.4s, v0.4s, v5.s[3]
|
||||
fmul v31.4s, v1.4s, v5.s[3]
|
||||
|
||||
ld1 {v6.4s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -185,22 +185,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL8x8_M1
|
||||
fmla v16.4s, v0.4s, v4.4s[0]
|
||||
fmla v17.4s, v1.4s, v4.4s[0]
|
||||
fmla v18.4s, v0.4s, v4.4s[1]
|
||||
fmla v19.4s, v1.4s, v4.4s[1]
|
||||
fmla v20.4s, v0.4s, v4.4s[2]
|
||||
fmla v21.4s, v1.4s, v4.4s[2]
|
||||
fmla v22.4s, v0.4s, v4.4s[3]
|
||||
fmla v23.4s, v1.4s, v4.4s[3]
|
||||
fmla v24.4s, v0.4s, v5.4s[0]
|
||||
fmla v25.4s, v1.4s, v5.4s[0]
|
||||
fmla v26.4s, v0.4s, v5.4s[1]
|
||||
fmla v27.4s, v1.4s, v5.4s[1]
|
||||
fmla v28.4s, v0.4s, v5.4s[2]
|
||||
fmla v29.4s, v1.4s, v5.4s[2]
|
||||
fmla v30.4s, v0.4s, v5.4s[3]
|
||||
fmla v31.4s, v1.4s, v5.4s[3]
|
||||
fmla v16.4s, v0.4s, v4.s[0]
|
||||
fmla v17.4s, v1.4s, v4.s[0]
|
||||
fmla v18.4s, v0.4s, v4.s[1]
|
||||
fmla v19.4s, v1.4s, v4.s[1]
|
||||
fmla v20.4s, v0.4s, v4.s[2]
|
||||
fmla v21.4s, v1.4s, v4.s[2]
|
||||
fmla v22.4s, v0.4s, v4.s[3]
|
||||
fmla v23.4s, v1.4s, v4.s[3]
|
||||
fmla v24.4s, v0.4s, v5.s[0]
|
||||
fmla v25.4s, v1.4s, v5.s[0]
|
||||
fmla v26.4s, v0.4s, v5.s[1]
|
||||
fmla v27.4s, v1.4s, v5.s[1]
|
||||
fmla v28.4s, v0.4s, v5.s[2]
|
||||
fmla v29.4s, v1.4s, v5.s[2]
|
||||
fmla v30.4s, v0.4s, v5.s[3]
|
||||
fmla v31.4s, v1.4s, v5.s[3]
|
||||
|
||||
ld1 {v6.4s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -213,22 +213,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL8x8_M2
|
||||
fmla v16.4s, v2.4s, v6.4s[0]
|
||||
fmla v17.4s, v3.4s, v6.4s[0]
|
||||
fmla v18.4s, v2.4s, v6.4s[1]
|
||||
fmla v19.4s, v3.4s, v6.4s[1]
|
||||
fmla v20.4s, v2.4s, v6.4s[2]
|
||||
fmla v21.4s, v3.4s, v6.4s[2]
|
||||
fmla v22.4s, v2.4s, v6.4s[3]
|
||||
fmla v23.4s, v3.4s, v6.4s[3]
|
||||
fmla v24.4s, v2.4s, v7.4s[0]
|
||||
fmla v25.4s, v3.4s, v7.4s[0]
|
||||
fmla v26.4s, v2.4s, v7.4s[1]
|
||||
fmla v27.4s, v3.4s, v7.4s[1]
|
||||
fmla v28.4s, v2.4s, v7.4s[2]
|
||||
fmla v29.4s, v3.4s, v7.4s[2]
|
||||
fmla v30.4s, v2.4s, v7.4s[3]
|
||||
fmla v31.4s, v3.4s, v7.4s[3]
|
||||
fmla v16.4s, v2.4s, v6.s[0]
|
||||
fmla v17.4s, v3.4s, v6.s[0]
|
||||
fmla v18.4s, v2.4s, v6.s[1]
|
||||
fmla v19.4s, v3.4s, v6.s[1]
|
||||
fmla v20.4s, v2.4s, v6.s[2]
|
||||
fmla v21.4s, v3.4s, v6.s[2]
|
||||
fmla v22.4s, v2.4s, v6.s[3]
|
||||
fmla v23.4s, v3.4s, v6.s[3]
|
||||
fmla v24.4s, v2.4s, v7.s[0]
|
||||
fmla v25.4s, v3.4s, v7.s[0]
|
||||
fmla v26.4s, v2.4s, v7.s[1]
|
||||
fmla v27.4s, v3.4s, v7.s[1]
|
||||
fmla v28.4s, v2.4s, v7.s[2]
|
||||
fmla v29.4s, v3.4s, v7.s[2]
|
||||
fmla v30.4s, v2.4s, v7.s[3]
|
||||
fmla v31.4s, v3.4s, v7.s[3]
|
||||
|
||||
ld1 {v4.4s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -241,22 +241,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL8x8_E
|
||||
fmla v16.4s, v2.4s, v6.4s[0]
|
||||
fmla v17.4s, v3.4s, v6.4s[0]
|
||||
fmla v18.4s, v2.4s, v6.4s[1]
|
||||
fmla v19.4s, v3.4s, v6.4s[1]
|
||||
fmla v20.4s, v2.4s, v6.4s[2]
|
||||
fmla v21.4s, v3.4s, v6.4s[2]
|
||||
fmla v22.4s, v2.4s, v6.4s[3]
|
||||
fmla v23.4s, v3.4s, v6.4s[3]
|
||||
fmla v24.4s, v2.4s, v7.4s[0]
|
||||
fmla v25.4s, v3.4s, v7.4s[0]
|
||||
fmla v26.4s, v2.4s, v7.4s[1]
|
||||
fmla v27.4s, v3.4s, v7.4s[1]
|
||||
fmla v28.4s, v2.4s, v7.4s[2]
|
||||
fmla v29.4s, v3.4s, v7.4s[2]
|
||||
fmla v30.4s, v2.4s, v7.4s[3]
|
||||
fmla v31.4s, v3.4s, v7.4s[3]
|
||||
fmla v16.4s, v2.4s, v6.s[0]
|
||||
fmla v17.4s, v3.4s, v6.s[0]
|
||||
fmla v18.4s, v2.4s, v6.s[1]
|
||||
fmla v19.4s, v3.4s, v6.s[1]
|
||||
fmla v20.4s, v2.4s, v6.s[2]
|
||||
fmla v21.4s, v3.4s, v6.s[2]
|
||||
fmla v22.4s, v2.4s, v6.s[3]
|
||||
fmla v23.4s, v3.4s, v6.s[3]
|
||||
fmla v24.4s, v2.4s, v7.s[0]
|
||||
fmla v25.4s, v3.4s, v7.s[0]
|
||||
fmla v26.4s, v2.4s, v7.s[1]
|
||||
fmla v27.4s, v3.4s, v7.s[1]
|
||||
fmla v28.4s, v2.4s, v7.s[2]
|
||||
fmla v29.4s, v3.4s, v7.s[2]
|
||||
fmla v30.4s, v2.4s, v7.s[3]
|
||||
fmla v31.4s, v3.4s, v7.s[3]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x8_SUB
|
||||
|
|
@ -269,22 +269,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v1.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.4s, v0.4s, v4.4s[0]
|
||||
fmla v17.4s, v1.4s, v4.4s[0]
|
||||
fmla v18.4s, v0.4s, v4.4s[1]
|
||||
fmla v19.4s, v1.4s, v4.4s[1]
|
||||
fmla v20.4s, v0.4s, v4.4s[2]
|
||||
fmla v21.4s, v1.4s, v4.4s[2]
|
||||
fmla v22.4s, v0.4s, v4.4s[3]
|
||||
fmla v23.4s, v1.4s, v4.4s[3]
|
||||
fmla v24.4s, v0.4s, v5.4s[0]
|
||||
fmla v25.4s, v1.4s, v5.4s[0]
|
||||
fmla v26.4s, v0.4s, v5.4s[1]
|
||||
fmla v27.4s, v1.4s, v5.4s[1]
|
||||
fmla v28.4s, v0.4s, v5.4s[2]
|
||||
fmla v29.4s, v1.4s, v5.4s[2]
|
||||
fmla v30.4s, v0.4s, v5.4s[3]
|
||||
fmla v31.4s, v1.4s, v5.4s[3]
|
||||
fmla v16.4s, v0.4s, v4.s[0]
|
||||
fmla v17.4s, v1.4s, v4.s[0]
|
||||
fmla v18.4s, v0.4s, v4.s[1]
|
||||
fmla v19.4s, v1.4s, v4.s[1]
|
||||
fmla v20.4s, v0.4s, v4.s[2]
|
||||
fmla v21.4s, v1.4s, v4.s[2]
|
||||
fmla v22.4s, v0.4s, v4.s[3]
|
||||
fmla v23.4s, v1.4s, v4.s[3]
|
||||
fmla v24.4s, v0.4s, v5.s[0]
|
||||
fmla v25.4s, v1.4s, v5.s[0]
|
||||
fmla v26.4s, v0.4s, v5.s[1]
|
||||
fmla v27.4s, v1.4s, v5.s[1]
|
||||
fmla v28.4s, v0.4s, v5.s[2]
|
||||
fmla v29.4s, v1.4s, v5.s[2]
|
||||
fmla v30.4s, v0.4s, v5.s[3]
|
||||
fmla v31.4s, v1.4s, v5.s[3]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x8
|
||||
|
|
@ -367,14 +367,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmul v16.4s, v0.4s, v4.4s[0]
|
||||
fmul v18.4s, v0.4s, v4.4s[1]
|
||||
fmul v20.4s, v0.4s, v4.4s[2]
|
||||
fmul v22.4s, v0.4s, v4.4s[3]
|
||||
fmul v24.4s, v0.4s, v5.4s[0]
|
||||
fmul v26.4s, v0.4s, v5.4s[1]
|
||||
fmul v28.4s, v0.4s, v5.4s[2]
|
||||
fmul v30.4s, v0.4s, v5.4s[3]
|
||||
fmul v16.4s, v0.4s, v4.s[0]
|
||||
fmul v18.4s, v0.4s, v4.s[1]
|
||||
fmul v20.4s, v0.4s, v4.s[2]
|
||||
fmul v22.4s, v0.4s, v4.s[3]
|
||||
fmul v24.4s, v0.4s, v5.s[0]
|
||||
fmul v26.4s, v0.4s, v5.s[1]
|
||||
fmul v28.4s, v0.4s, v5.s[2]
|
||||
fmul v30.4s, v0.4s, v5.s[3]
|
||||
|
||||
ld1 {v6.4s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -385,14 +385,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL4x8_M1
|
||||
fmla v16.4s, v0.4s, v4.4s[0]
|
||||
fmla v18.4s, v0.4s, v4.4s[1]
|
||||
fmla v20.4s, v0.4s, v4.4s[2]
|
||||
fmla v22.4s, v0.4s, v4.4s[3]
|
||||
fmla v24.4s, v0.4s, v5.4s[0]
|
||||
fmla v26.4s, v0.4s, v5.4s[1]
|
||||
fmla v28.4s, v0.4s, v5.4s[2]
|
||||
fmla v30.4s, v0.4s, v5.4s[3]
|
||||
fmla v16.4s, v0.4s, v4.s[0]
|
||||
fmla v18.4s, v0.4s, v4.s[1]
|
||||
fmla v20.4s, v0.4s, v4.s[2]
|
||||
fmla v22.4s, v0.4s, v4.s[3]
|
||||
fmla v24.4s, v0.4s, v5.s[0]
|
||||
fmla v26.4s, v0.4s, v5.s[1]
|
||||
fmla v28.4s, v0.4s, v5.s[2]
|
||||
fmla v30.4s, v0.4s, v5.s[3]
|
||||
|
||||
ld1 {v6.4s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -403,14 +403,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL4x8_M2
|
||||
fmla v16.4s, v2.4s, v6.4s[0]
|
||||
fmla v18.4s, v2.4s, v6.4s[1]
|
||||
fmla v20.4s, v2.4s, v6.4s[2]
|
||||
fmla v22.4s, v2.4s, v6.4s[3]
|
||||
fmla v24.4s, v2.4s, v7.4s[0]
|
||||
fmla v26.4s, v2.4s, v7.4s[1]
|
||||
fmla v28.4s, v2.4s, v7.4s[2]
|
||||
fmla v30.4s, v2.4s, v7.4s[3]
|
||||
fmla v16.4s, v2.4s, v6.s[0]
|
||||
fmla v18.4s, v2.4s, v6.s[1]
|
||||
fmla v20.4s, v2.4s, v6.s[2]
|
||||
fmla v22.4s, v2.4s, v6.s[3]
|
||||
fmla v24.4s, v2.4s, v7.s[0]
|
||||
fmla v26.4s, v2.4s, v7.s[1]
|
||||
fmla v28.4s, v2.4s, v7.s[2]
|
||||
fmla v30.4s, v2.4s, v7.s[3]
|
||||
|
||||
ld1 {v4.4s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -421,14 +421,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL4x8_E
|
||||
fmla v16.4s, v2.4s, v6.4s[0]
|
||||
fmla v18.4s, v2.4s, v6.4s[1]
|
||||
fmla v20.4s, v2.4s, v6.4s[2]
|
||||
fmla v22.4s, v2.4s, v6.4s[3]
|
||||
fmla v24.4s, v2.4s, v7.4s[0]
|
||||
fmla v26.4s, v2.4s, v7.4s[1]
|
||||
fmla v28.4s, v2.4s, v7.4s[2]
|
||||
fmla v30.4s, v2.4s, v7.4s[3]
|
||||
fmla v16.4s, v2.4s, v6.s[0]
|
||||
fmla v18.4s, v2.4s, v6.s[1]
|
||||
fmla v20.4s, v2.4s, v6.s[2]
|
||||
fmla v22.4s, v2.4s, v6.s[3]
|
||||
fmla v24.4s, v2.4s, v7.s[0]
|
||||
fmla v26.4s, v2.4s, v7.s[1]
|
||||
fmla v28.4s, v2.4s, v7.s[2]
|
||||
fmla v30.4s, v2.4s, v7.s[3]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x8_SUB
|
||||
|
|
@ -439,14 +439,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.4s, v0.4s, v4.4s[0]
|
||||
fmla v18.4s, v0.4s, v4.4s[1]
|
||||
fmla v20.4s, v0.4s, v4.4s[2]
|
||||
fmla v22.4s, v0.4s, v4.4s[3]
|
||||
fmla v24.4s, v0.4s, v5.4s[0]
|
||||
fmla v26.4s, v0.4s, v5.4s[1]
|
||||
fmla v28.4s, v0.4s, v5.4s[2]
|
||||
fmla v30.4s, v0.4s, v5.4s[3]
|
||||
fmla v16.4s, v0.4s, v4.s[0]
|
||||
fmla v18.4s, v0.4s, v4.s[1]
|
||||
fmla v20.4s, v0.4s, v4.s[2]
|
||||
fmla v22.4s, v0.4s, v4.s[3]
|
||||
fmla v24.4s, v0.4s, v5.s[0]
|
||||
fmla v26.4s, v0.4s, v5.s[1]
|
||||
fmla v28.4s, v0.4s, v5.s[2]
|
||||
fmla v30.4s, v0.4s, v5.s[3]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x8
|
||||
|
|
@ -520,14 +520,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s}, [pA]
|
||||
add pA, pA, #8
|
||||
|
||||
fmla v16.2s, v0.2s, v4.4s[0]
|
||||
fmla v18.2s, v0.2s, v4.4s[1]
|
||||
fmla v20.2s, v0.2s, v4.4s[2]
|
||||
fmla v22.2s, v0.2s, v4.4s[3]
|
||||
fmla v24.2s, v0.2s, v5.4s[0]
|
||||
fmla v26.2s, v0.2s, v5.4s[1]
|
||||
fmla v28.2s, v0.2s, v5.4s[2]
|
||||
fmla v30.2s, v0.2s, v5.4s[3]
|
||||
fmla v16.2s, v0.2s, v4.s[0]
|
||||
fmla v18.2s, v0.2s, v4.s[1]
|
||||
fmla v20.2s, v0.2s, v4.s[2]
|
||||
fmla v22.2s, v0.2s, v4.s[3]
|
||||
fmla v24.2s, v0.2s, v5.s[0]
|
||||
fmla v26.2s, v0.2s, v5.s[1]
|
||||
fmla v28.2s, v0.2s, v5.s[2]
|
||||
fmla v30.2s, v0.2s, v5.s[3]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x8
|
||||
|
|
@ -601,14 +601,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ldr s0, [pA]
|
||||
add pA, pA, #4
|
||||
|
||||
fmla s16, s0, v4.4s[0]
|
||||
fmla s18, s0, v4.4s[1]
|
||||
fmla s20, s0, v4.4s[2]
|
||||
fmla s22, s0, v4.4s[3]
|
||||
fmla s24, s0, v5.4s[0]
|
||||
fmla s26, s0, v5.4s[1]
|
||||
fmla s28, s0, v5.4s[2]
|
||||
fmla s30, s0, v5.4s[3]
|
||||
fmla s16, s0, v4.s[0]
|
||||
fmla s18, s0, v4.s[1]
|
||||
fmla s20, s0, v4.s[2]
|
||||
fmla s22, s0, v4.s[3]
|
||||
fmla s24, s0, v5.s[0]
|
||||
fmla s26, s0, v5.s[1]
|
||||
fmla s28, s0, v5.s[2]
|
||||
fmla s30, s0, v5.s[3]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x8
|
||||
|
|
@ -682,14 +682,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v1.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmul v16.4s, v0.4s, v8.2s[0]
|
||||
fmul v17.4s, v1.4s, v8.2s[0]
|
||||
fmul v20.4s, v0.4s, v8.2s[1]
|
||||
fmul v21.4s, v1.4s, v8.2s[1]
|
||||
fmul v24.4s, v0.4s, v9.2s[0]
|
||||
fmul v25.4s, v1.4s, v9.2s[0]
|
||||
fmul v28.4s, v0.4s, v9.2s[1]
|
||||
fmul v29.4s, v1.4s, v9.2s[1]
|
||||
fmul v16.4s, v0.4s, v8.s[0]
|
||||
fmul v17.4s, v1.4s, v8.s[0]
|
||||
fmul v20.4s, v0.4s, v8.s[1]
|
||||
fmul v21.4s, v1.4s, v8.s[1]
|
||||
fmul v24.4s, v0.4s, v9.s[0]
|
||||
fmul v25.4s, v1.4s, v9.s[0]
|
||||
fmul v28.4s, v0.4s, v9.s[1]
|
||||
fmul v29.4s, v1.4s, v9.s[1]
|
||||
|
||||
ld1 {v12.2s, v13.2s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -700,14 +700,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL8x4_M1
|
||||
fmla v16.4s, v0.4s, v8.2s[0]
|
||||
fmla v17.4s, v1.4s, v8.2s[0]
|
||||
fmla v20.4s, v0.4s, v8.2s[1]
|
||||
fmla v21.4s, v1.4s, v8.2s[1]
|
||||
fmla v24.4s, v0.4s, v9.2s[0]
|
||||
fmla v25.4s, v1.4s, v9.2s[0]
|
||||
fmla v28.4s, v0.4s, v9.2s[1]
|
||||
fmla v29.4s, v1.4s, v9.2s[1]
|
||||
fmla v16.4s, v0.4s, v8.s[0]
|
||||
fmla v17.4s, v1.4s, v8.s[0]
|
||||
fmla v20.4s, v0.4s, v8.s[1]
|
||||
fmla v21.4s, v1.4s, v8.s[1]
|
||||
fmla v24.4s, v0.4s, v9.s[0]
|
||||
fmla v25.4s, v1.4s, v9.s[0]
|
||||
fmla v28.4s, v0.4s, v9.s[1]
|
||||
fmla v29.4s, v1.4s, v9.s[1]
|
||||
|
||||
ld1 {v12.2s, v13.2s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -718,14 +718,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL8x4_M2
|
||||
fmla v16.4s, v4.4s, v12.2s[0]
|
||||
fmla v17.4s, v5.4s, v12.2s[0]
|
||||
fmla v20.4s, v4.4s, v12.2s[1]
|
||||
fmla v21.4s, v5.4s, v12.2s[1]
|
||||
fmla v24.4s, v4.4s, v13.2s[0]
|
||||
fmla v25.4s, v5.4s, v13.2s[0]
|
||||
fmla v28.4s, v4.4s, v13.2s[1]
|
||||
fmla v29.4s, v5.4s, v13.2s[1]
|
||||
fmla v16.4s, v4.4s, v12.s[0]
|
||||
fmla v17.4s, v5.4s, v12.s[0]
|
||||
fmla v20.4s, v4.4s, v12.s[1]
|
||||
fmla v21.4s, v5.4s, v12.s[1]
|
||||
fmla v24.4s, v4.4s, v13.s[0]
|
||||
fmla v25.4s, v5.4s, v13.s[0]
|
||||
fmla v28.4s, v4.4s, v13.s[1]
|
||||
fmla v29.4s, v5.4s, v13.s[1]
|
||||
|
||||
ld1 {v8.2s, v9.2s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -736,14 +736,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL8x4_E
|
||||
fmla v16.4s, v4.4s, v12.2s[0]
|
||||
fmla v17.4s, v5.4s, v12.2s[0]
|
||||
fmla v20.4s, v4.4s, v12.2s[1]
|
||||
fmla v21.4s, v5.4s, v12.2s[1]
|
||||
fmla v24.4s, v4.4s, v13.2s[0]
|
||||
fmla v25.4s, v5.4s, v13.2s[0]
|
||||
fmla v28.4s, v4.4s, v13.2s[1]
|
||||
fmla v29.4s, v5.4s, v13.2s[1]
|
||||
fmla v16.4s, v4.4s, v12.s[0]
|
||||
fmla v17.4s, v5.4s, v12.s[0]
|
||||
fmla v20.4s, v4.4s, v12.s[1]
|
||||
fmla v21.4s, v5.4s, v12.s[1]
|
||||
fmla v24.4s, v4.4s, v13.s[0]
|
||||
fmla v25.4s, v5.4s, v13.s[0]
|
||||
fmla v28.4s, v4.4s, v13.s[1]
|
||||
fmla v29.4s, v5.4s, v13.s[1]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_SUB
|
||||
|
|
@ -754,14 +754,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v1.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.4s, v0.4s, v8.2s[0]
|
||||
fmla v17.4s, v1.4s, v8.2s[0]
|
||||
fmla v20.4s, v0.4s, v8.2s[1]
|
||||
fmla v21.4s, v1.4s, v8.2s[1]
|
||||
fmla v24.4s, v0.4s, v9.2s[0]
|
||||
fmla v25.4s, v1.4s, v9.2s[0]
|
||||
fmla v28.4s, v0.4s, v9.2s[1]
|
||||
fmla v29.4s, v1.4s, v9.2s[1]
|
||||
fmla v16.4s, v0.4s, v8.s[0]
|
||||
fmla v17.4s, v1.4s, v8.s[0]
|
||||
fmla v20.4s, v0.4s, v8.s[1]
|
||||
fmla v21.4s, v1.4s, v8.s[1]
|
||||
fmla v24.4s, v0.4s, v9.s[0]
|
||||
fmla v25.4s, v1.4s, v9.s[0]
|
||||
fmla v28.4s, v0.4s, v9.s[1]
|
||||
fmla v29.4s, v1.4s, v9.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x4
|
||||
|
|
@ -814,17 +814,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s, v1.2s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmul v16.2s, v0.2s, v8.2s[0]
|
||||
fmul v29.2s, v1.2s, v9.2s[1]
|
||||
fmul v16.2s, v0.2s, v8.s[0]
|
||||
fmul v29.2s, v1.2s, v9.s[1]
|
||||
|
||||
fmul v20.2s, v0.2s, v8.2s[1]
|
||||
fmul v25.2s, v1.2s, v9.2s[0]
|
||||
fmul v20.2s, v0.2s, v8.s[1]
|
||||
fmul v25.2s, v1.2s, v9.s[0]
|
||||
|
||||
fmul v24.2s, v0.2s, v9.2s[0]
|
||||
fmul v21.2s, v1.2s, v8.2s[1]
|
||||
fmul v24.2s, v0.2s, v9.s[0]
|
||||
fmul v21.2s, v1.2s, v8.s[1]
|
||||
|
||||
fmul v28.2s, v0.2s, v9.2s[1]
|
||||
fmul v17.2s, v1.2s, v8.2s[0]
|
||||
fmul v28.2s, v0.2s, v9.s[1]
|
||||
fmul v17.2s, v1.2s, v8.s[0]
|
||||
|
||||
ld1 {v12.2s, v13.2s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -833,61 +833,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL4x4_M1
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v29.2s, v1.2s, v9.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v29.2s, v1.2s, v9.s[1]
|
||||
|
||||
ld1 {v12.2s, v13.2s}, [pB] // For next round
|
||||
add pB, pB, #16
|
||||
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v25.2s, v1.2s, v9.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
fmla v25.2s, v1.2s, v9.s[0]
|
||||
|
||||
ld1 {v4.2s, v5.2s}, [pA] // For next round
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v24.2s, v0.2s, v9.2s[0]
|
||||
fmla v21.2s, v1.2s, v8.2s[1]
|
||||
fmla v24.2s, v0.2s, v9.s[0]
|
||||
fmla v21.2s, v1.2s, v8.s[1]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
|
||||
fmla v28.2s, v0.2s, v9.2s[1]
|
||||
fmla v17.2s, v1.2s, v8.2s[0]
|
||||
fmla v28.2s, v0.2s, v9.s[1]
|
||||
fmla v17.2s, v1.2s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_M2
|
||||
fmla v16.2s, v4.2s, v12.2s[0]
|
||||
fmla v29.2s, v5.2s, v13.2s[1]
|
||||
fmla v16.2s, v4.2s, v12.s[0]
|
||||
fmla v29.2s, v5.2s, v13.s[1]
|
||||
|
||||
ld1 {v8.2s, v9.2s}, [pB] // For next round
|
||||
add pB, pB, #16
|
||||
|
||||
fmla v20.2s, v4.2s, v12.2s[1]
|
||||
fmla v25.2s, v5.2s, v13.2s[0]
|
||||
fmla v20.2s, v4.2s, v12.s[1]
|
||||
fmla v25.2s, v5.2s, v13.s[0]
|
||||
|
||||
ld1 {v0.2s, v1.2s}, [pA] // For next round
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v24.2s, v4.2s, v13.2s[0]
|
||||
fmla v21.2s, v5.2s, v12.2s[1]
|
||||
fmla v24.2s, v4.2s, v13.s[0]
|
||||
fmla v21.2s, v5.2s, v12.s[1]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
|
||||
fmla v28.2s, v4.2s, v13.2s[1]
|
||||
fmla v17.2s, v5.2s, v12.2s[0]
|
||||
fmla v28.2s, v4.2s, v13.s[1]
|
||||
fmla v17.2s, v5.2s, v12.s[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_E
|
||||
fmla v16.2s, v4.2s, v12.2s[0]
|
||||
fmla v29.2s, v5.2s, v13.2s[1]
|
||||
fmla v16.2s, v4.2s, v12.s[0]
|
||||
fmla v29.2s, v5.2s, v13.s[1]
|
||||
|
||||
fmla v20.2s, v4.2s, v12.2s[1]
|
||||
fmla v25.2s, v5.2s, v13.2s[0]
|
||||
fmla v20.2s, v4.2s, v12.s[1]
|
||||
fmla v25.2s, v5.2s, v13.s[0]
|
||||
|
||||
fmla v24.2s, v4.2s, v13.2s[0]
|
||||
fmla v21.2s, v5.2s, v12.2s[1]
|
||||
fmla v24.2s, v4.2s, v13.s[0]
|
||||
fmla v21.2s, v5.2s, v12.s[1]
|
||||
|
||||
fmla v28.2s, v4.2s, v13.2s[1]
|
||||
fmla v17.2s, v5.2s, v12.2s[0]
|
||||
fmla v28.2s, v4.2s, v13.s[1]
|
||||
fmla v17.2s, v5.2s, v12.s[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_SUB
|
||||
|
|
@ -896,17 +896,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s, v1.2s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v29.2s, v1.2s, v9.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v29.2s, v1.2s, v9.s[1]
|
||||
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v25.2s, v1.2s, v9.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
fmla v25.2s, v1.2s, v9.s[0]
|
||||
|
||||
fmla v24.2s, v0.2s, v9.2s[0]
|
||||
fmla v21.2s, v1.2s, v8.2s[1]
|
||||
fmla v24.2s, v0.2s, v9.s[0]
|
||||
fmla v21.2s, v1.2s, v8.s[1]
|
||||
|
||||
fmla v28.2s, v0.2s, v9.2s[1]
|
||||
fmla v17.2s, v1.2s, v8.2s[0]
|
||||
fmla v28.2s, v0.2s, v9.s[1]
|
||||
fmla v17.2s, v1.2s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
|
|
@ -951,10 +951,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s}, [pA]
|
||||
add pA, pA, #8
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v24.2s, v0.2s, v9.2s[0]
|
||||
fmla v28.2s, v0.2s, v9.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
fmla v24.2s, v0.2s, v9.s[0]
|
||||
fmla v28.2s, v0.2s, v9.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
|
|
@ -1034,11 +1034,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v1.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.4s, v0.4s, v8.2s[0]
|
||||
fmla v17.4s, v1.4s, v8.2s[0]
|
||||
fmla v16.4s, v0.4s, v8.s[0]
|
||||
fmla v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
fmla v20.4s, v0.4s, v8.2s[1]
|
||||
fmla v21.4s, v1.4s, v8.2s[1]
|
||||
fmla v20.4s, v0.4s, v8.s[1]
|
||||
fmla v21.4s, v1.4s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x2
|
||||
|
|
@ -1074,10 +1074,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s, v1.2s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v17.2s, v1.2s, v8.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v21.2s, v1.2s, v8.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v17.2s, v1.2s, v8.s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
fmla v21.2s, v1.2s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
|
|
@ -1109,8 +1109,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s}, [pA]
|
||||
add pA, pA, #8
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
|
|
@ -1139,7 +1139,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ldr s0 , [pA]
|
||||
add pA, pA, #4
|
||||
|
||||
fmla v16.2s, v8.2s, v0.2s[0]
|
||||
fmla v16.2s, v8.2s, v0.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
|
|
@ -1169,8 +1169,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v1.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.4s, v0.4s, v8.2s[0]
|
||||
fmla v17.4s, v1.4s, v8.2s[0]
|
||||
fmla v16.4s, v0.4s, v8.s[0]
|
||||
fmla v17.4s, v1.4s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x1
|
||||
|
|
@ -1196,8 +1196,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s, v1.2s}, [pA]
|
||||
add pA , pA, #16
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v17.2s, v1.2s, v8.2s[0]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v17.2s, v1.2s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x1
|
||||
|
|
@ -1222,7 +1222,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s}, [pA]
|
||||
add pA , pA, #8
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x1
|
||||
|
|
|
|||
|
|
@ -161,25 +161,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v3.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmul v16.4s, v0.4s, v8.2s[0]
|
||||
fmul v17.4s, v1.4s, v8.2s[0]
|
||||
fmul v18.4s, v2.4s, v8.2s[0]
|
||||
fmul v19.4s, v3.4s, v8.2s[0]
|
||||
fmul v16.4s, v0.4s, v8.s[0]
|
||||
fmul v17.4s, v1.4s, v8.s[0]
|
||||
fmul v18.4s, v2.4s, v8.s[0]
|
||||
fmul v19.4s, v3.4s, v8.s[0]
|
||||
|
||||
fmul v20.4s, v0.4s, v8.2s[1]
|
||||
fmul v21.4s, v1.4s, v8.2s[1]
|
||||
fmul v22.4s, v2.4s, v8.2s[1]
|
||||
fmul v23.4s, v3.4s, v8.2s[1]
|
||||
fmul v20.4s, v0.4s, v8.s[1]
|
||||
fmul v21.4s, v1.4s, v8.s[1]
|
||||
fmul v22.4s, v2.4s, v8.s[1]
|
||||
fmul v23.4s, v3.4s, v8.s[1]
|
||||
|
||||
fmul v24.4s, v0.4s, v9.2s[0]
|
||||
fmul v25.4s, v1.4s, v9.2s[0]
|
||||
fmul v26.4s, v2.4s, v9.2s[0]
|
||||
fmul v27.4s, v3.4s, v9.2s[0]
|
||||
fmul v24.4s, v0.4s, v9.s[0]
|
||||
fmul v25.4s, v1.4s, v9.s[0]
|
||||
fmul v26.4s, v2.4s, v9.s[0]
|
||||
fmul v27.4s, v3.4s, v9.s[0]
|
||||
|
||||
fmul v28.4s, v0.4s, v9.2s[1]
|
||||
fmul v29.4s, v1.4s, v9.2s[1]
|
||||
fmul v30.4s, v2.4s, v9.2s[1]
|
||||
fmul v31.4s, v3.4s, v9.2s[1]
|
||||
fmul v28.4s, v0.4s, v9.s[1]
|
||||
fmul v29.4s, v1.4s, v9.s[1]
|
||||
fmul v30.4s, v2.4s, v9.s[1]
|
||||
fmul v31.4s, v3.4s, v9.s[1]
|
||||
|
||||
ld1 {v12.2s, v13.2s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -194,25 +194,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL16x4_M1
|
||||
fmla v16.4s, v0.4s, v8.2s[0]
|
||||
fmla v17.4s, v1.4s, v8.2s[0]
|
||||
fmla v18.4s, v2.4s, v8.2s[0]
|
||||
fmla v19.4s, v3.4s, v8.2s[0]
|
||||
fmla v16.4s, v0.4s, v8.s[0]
|
||||
fmla v17.4s, v1.4s, v8.s[0]
|
||||
fmla v18.4s, v2.4s, v8.s[0]
|
||||
fmla v19.4s, v3.4s, v8.s[0]
|
||||
|
||||
fmla v20.4s, v0.4s, v8.2s[1]
|
||||
fmla v21.4s, v1.4s, v8.2s[1]
|
||||
fmla v22.4s, v2.4s, v8.2s[1]
|
||||
fmla v23.4s, v3.4s, v8.2s[1]
|
||||
fmla v20.4s, v0.4s, v8.s[1]
|
||||
fmla v21.4s, v1.4s, v8.s[1]
|
||||
fmla v22.4s, v2.4s, v8.s[1]
|
||||
fmla v23.4s, v3.4s, v8.s[1]
|
||||
|
||||
fmla v24.4s, v0.4s, v9.2s[0]
|
||||
fmla v25.4s, v1.4s, v9.2s[0]
|
||||
fmla v26.4s, v2.4s, v9.2s[0]
|
||||
fmla v27.4s, v3.4s, v9.2s[0]
|
||||
fmla v24.4s, v0.4s, v9.s[0]
|
||||
fmla v25.4s, v1.4s, v9.s[0]
|
||||
fmla v26.4s, v2.4s, v9.s[0]
|
||||
fmla v27.4s, v3.4s, v9.s[0]
|
||||
|
||||
fmla v28.4s, v0.4s, v9.2s[1]
|
||||
fmla v29.4s, v1.4s, v9.2s[1]
|
||||
fmla v30.4s, v2.4s, v9.2s[1]
|
||||
fmla v31.4s, v3.4s, v9.2s[1]
|
||||
fmla v28.4s, v0.4s, v9.s[1]
|
||||
fmla v29.4s, v1.4s, v9.s[1]
|
||||
fmla v30.4s, v2.4s, v9.s[1]
|
||||
fmla v31.4s, v3.4s, v9.s[1]
|
||||
|
||||
ld1 {v12.2s, v13.2s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -227,25 +227,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL16x4_M2
|
||||
fmla v16.4s, v4.4s, v12.2s[0]
|
||||
fmla v17.4s, v5.4s, v12.2s[0]
|
||||
fmla v18.4s, v6.4s, v12.2s[0]
|
||||
fmla v19.4s, v7.4s, v12.2s[0]
|
||||
fmla v16.4s, v4.4s, v12.s[0]
|
||||
fmla v17.4s, v5.4s, v12.s[0]
|
||||
fmla v18.4s, v6.4s, v12.s[0]
|
||||
fmla v19.4s, v7.4s, v12.s[0]
|
||||
|
||||
fmla v20.4s, v4.4s, v12.2s[1]
|
||||
fmla v21.4s, v5.4s, v12.2s[1]
|
||||
fmla v22.4s, v6.4s, v12.2s[1]
|
||||
fmla v23.4s, v7.4s, v12.2s[1]
|
||||
fmla v20.4s, v4.4s, v12.s[1]
|
||||
fmla v21.4s, v5.4s, v12.s[1]
|
||||
fmla v22.4s, v6.4s, v12.s[1]
|
||||
fmla v23.4s, v7.4s, v12.s[1]
|
||||
|
||||
fmla v24.4s, v4.4s, v13.2s[0]
|
||||
fmla v25.4s, v5.4s, v13.2s[0]
|
||||
fmla v26.4s, v6.4s, v13.2s[0]
|
||||
fmla v27.4s, v7.4s, v13.2s[0]
|
||||
fmla v24.4s, v4.4s, v13.s[0]
|
||||
fmla v25.4s, v5.4s, v13.s[0]
|
||||
fmla v26.4s, v6.4s, v13.s[0]
|
||||
fmla v27.4s, v7.4s, v13.s[0]
|
||||
|
||||
fmla v28.4s, v4.4s, v13.2s[1]
|
||||
fmla v29.4s, v5.4s, v13.2s[1]
|
||||
fmla v30.4s, v6.4s, v13.2s[1]
|
||||
fmla v31.4s, v7.4s, v13.2s[1]
|
||||
fmla v28.4s, v4.4s, v13.s[1]
|
||||
fmla v29.4s, v5.4s, v13.s[1]
|
||||
fmla v30.4s, v6.4s, v13.s[1]
|
||||
fmla v31.4s, v7.4s, v13.s[1]
|
||||
|
||||
ld1 {v8.2s, v9.2s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -260,25 +260,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL16x4_E
|
||||
fmla v16.4s, v4.4s, v12.2s[0]
|
||||
fmla v17.4s, v5.4s, v12.2s[0]
|
||||
fmla v18.4s, v6.4s, v12.2s[0]
|
||||
fmla v19.4s, v7.4s, v12.2s[0]
|
||||
fmla v16.4s, v4.4s, v12.s[0]
|
||||
fmla v17.4s, v5.4s, v12.s[0]
|
||||
fmla v18.4s, v6.4s, v12.s[0]
|
||||
fmla v19.4s, v7.4s, v12.s[0]
|
||||
|
||||
fmla v20.4s, v4.4s, v12.2s[1]
|
||||
fmla v21.4s, v5.4s, v12.2s[1]
|
||||
fmla v22.4s, v6.4s, v12.2s[1]
|
||||
fmla v23.4s, v7.4s, v12.2s[1]
|
||||
fmla v20.4s, v4.4s, v12.s[1]
|
||||
fmla v21.4s, v5.4s, v12.s[1]
|
||||
fmla v22.4s, v6.4s, v12.s[1]
|
||||
fmla v23.4s, v7.4s, v12.s[1]
|
||||
|
||||
fmla v24.4s, v4.4s, v13.2s[0]
|
||||
fmla v25.4s, v5.4s, v13.2s[0]
|
||||
fmla v26.4s, v6.4s, v13.2s[0]
|
||||
fmla v27.4s, v7.4s, v13.2s[0]
|
||||
fmla v24.4s, v4.4s, v13.s[0]
|
||||
fmla v25.4s, v5.4s, v13.s[0]
|
||||
fmla v26.4s, v6.4s, v13.s[0]
|
||||
fmla v27.4s, v7.4s, v13.s[0]
|
||||
|
||||
fmla v28.4s, v4.4s, v13.2s[1]
|
||||
fmla v29.4s, v5.4s, v13.2s[1]
|
||||
fmla v30.4s, v6.4s, v13.2s[1]
|
||||
fmla v31.4s, v7.4s, v13.2s[1]
|
||||
fmla v28.4s, v4.4s, v13.s[1]
|
||||
fmla v29.4s, v5.4s, v13.s[1]
|
||||
fmla v30.4s, v6.4s, v13.s[1]
|
||||
fmla v31.4s, v7.4s, v13.s[1]
|
||||
.endm
|
||||
|
||||
.macro KERNEL16x4_SUB
|
||||
|
|
@ -293,25 +293,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v3.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.4s, v0.4s, v8.2s[0]
|
||||
fmla v17.4s, v1.4s, v8.2s[0]
|
||||
fmla v18.4s, v2.4s, v8.2s[0]
|
||||
fmla v19.4s, v3.4s, v8.2s[0]
|
||||
fmla v16.4s, v0.4s, v8.s[0]
|
||||
fmla v17.4s, v1.4s, v8.s[0]
|
||||
fmla v18.4s, v2.4s, v8.s[0]
|
||||
fmla v19.4s, v3.4s, v8.s[0]
|
||||
|
||||
fmla v20.4s, v0.4s, v8.2s[1]
|
||||
fmla v21.4s, v1.4s, v8.2s[1]
|
||||
fmla v22.4s, v2.4s, v8.2s[1]
|
||||
fmla v23.4s, v3.4s, v8.2s[1]
|
||||
fmla v20.4s, v0.4s, v8.s[1]
|
||||
fmla v21.4s, v1.4s, v8.s[1]
|
||||
fmla v22.4s, v2.4s, v8.s[1]
|
||||
fmla v23.4s, v3.4s, v8.s[1]
|
||||
|
||||
fmla v24.4s, v0.4s, v9.2s[0]
|
||||
fmla v25.4s, v1.4s, v9.2s[0]
|
||||
fmla v26.4s, v2.4s, v9.2s[0]
|
||||
fmla v27.4s, v3.4s, v9.2s[0]
|
||||
fmla v24.4s, v0.4s, v9.s[0]
|
||||
fmla v25.4s, v1.4s, v9.s[0]
|
||||
fmla v26.4s, v2.4s, v9.s[0]
|
||||
fmla v27.4s, v3.4s, v9.s[0]
|
||||
|
||||
fmla v28.4s, v0.4s, v9.2s[1]
|
||||
fmla v29.4s, v1.4s, v9.2s[1]
|
||||
fmla v30.4s, v2.4s, v9.2s[1]
|
||||
fmla v31.4s, v3.4s, v9.2s[1]
|
||||
fmla v28.4s, v0.4s, v9.s[1]
|
||||
fmla v29.4s, v1.4s, v9.s[1]
|
||||
fmla v30.4s, v2.4s, v9.s[1]
|
||||
fmla v31.4s, v3.4s, v9.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE16x4
|
||||
|
|
@ -369,14 +369,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v1.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmul v16.4s, v0.4s, v8.2s[0]
|
||||
fmul v17.4s, v1.4s, v8.2s[0]
|
||||
fmul v20.4s, v0.4s, v8.2s[1]
|
||||
fmul v21.4s, v1.4s, v8.2s[1]
|
||||
fmul v24.4s, v0.4s, v9.2s[0]
|
||||
fmul v25.4s, v1.4s, v9.2s[0]
|
||||
fmul v28.4s, v0.4s, v9.2s[1]
|
||||
fmul v29.4s, v1.4s, v9.2s[1]
|
||||
fmul v16.4s, v0.4s, v8.s[0]
|
||||
fmul v17.4s, v1.4s, v8.s[0]
|
||||
fmul v20.4s, v0.4s, v8.s[1]
|
||||
fmul v21.4s, v1.4s, v8.s[1]
|
||||
fmul v24.4s, v0.4s, v9.s[0]
|
||||
fmul v25.4s, v1.4s, v9.s[0]
|
||||
fmul v28.4s, v0.4s, v9.s[1]
|
||||
fmul v29.4s, v1.4s, v9.s[1]
|
||||
|
||||
ld1 {v12.2s, v13.2s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -387,14 +387,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL8x4_M1
|
||||
fmla v16.4s, v0.4s, v8.2s[0]
|
||||
fmla v17.4s, v1.4s, v8.2s[0]
|
||||
fmla v20.4s, v0.4s, v8.2s[1]
|
||||
fmla v21.4s, v1.4s, v8.2s[1]
|
||||
fmla v24.4s, v0.4s, v9.2s[0]
|
||||
fmla v25.4s, v1.4s, v9.2s[0]
|
||||
fmla v28.4s, v0.4s, v9.2s[1]
|
||||
fmla v29.4s, v1.4s, v9.2s[1]
|
||||
fmla v16.4s, v0.4s, v8.s[0]
|
||||
fmla v17.4s, v1.4s, v8.s[0]
|
||||
fmla v20.4s, v0.4s, v8.s[1]
|
||||
fmla v21.4s, v1.4s, v8.s[1]
|
||||
fmla v24.4s, v0.4s, v9.s[0]
|
||||
fmla v25.4s, v1.4s, v9.s[0]
|
||||
fmla v28.4s, v0.4s, v9.s[1]
|
||||
fmla v29.4s, v1.4s, v9.s[1]
|
||||
|
||||
ld1 {v12.2s, v13.2s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -405,14 +405,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL8x4_M2
|
||||
fmla v16.4s, v4.4s, v12.2s[0]
|
||||
fmla v17.4s, v5.4s, v12.2s[0]
|
||||
fmla v20.4s, v4.4s, v12.2s[1]
|
||||
fmla v21.4s, v5.4s, v12.2s[1]
|
||||
fmla v24.4s, v4.4s, v13.2s[0]
|
||||
fmla v25.4s, v5.4s, v13.2s[0]
|
||||
fmla v28.4s, v4.4s, v13.2s[1]
|
||||
fmla v29.4s, v5.4s, v13.2s[1]
|
||||
fmla v16.4s, v4.4s, v12.s[0]
|
||||
fmla v17.4s, v5.4s, v12.s[0]
|
||||
fmla v20.4s, v4.4s, v12.s[1]
|
||||
fmla v21.4s, v5.4s, v12.s[1]
|
||||
fmla v24.4s, v4.4s, v13.s[0]
|
||||
fmla v25.4s, v5.4s, v13.s[0]
|
||||
fmla v28.4s, v4.4s, v13.s[1]
|
||||
fmla v29.4s, v5.4s, v13.s[1]
|
||||
|
||||
ld1 {v8.2s, v9.2s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -423,14 +423,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL8x4_E
|
||||
fmla v16.4s, v4.4s, v12.2s[0]
|
||||
fmla v17.4s, v5.4s, v12.2s[0]
|
||||
fmla v20.4s, v4.4s, v12.2s[1]
|
||||
fmla v21.4s, v5.4s, v12.2s[1]
|
||||
fmla v24.4s, v4.4s, v13.2s[0]
|
||||
fmla v25.4s, v5.4s, v13.2s[0]
|
||||
fmla v28.4s, v4.4s, v13.2s[1]
|
||||
fmla v29.4s, v5.4s, v13.2s[1]
|
||||
fmla v16.4s, v4.4s, v12.s[0]
|
||||
fmla v17.4s, v5.4s, v12.s[0]
|
||||
fmla v20.4s, v4.4s, v12.s[1]
|
||||
fmla v21.4s, v5.4s, v12.s[1]
|
||||
fmla v24.4s, v4.4s, v13.s[0]
|
||||
fmla v25.4s, v5.4s, v13.s[0]
|
||||
fmla v28.4s, v4.4s, v13.s[1]
|
||||
fmla v29.4s, v5.4s, v13.s[1]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_SUB
|
||||
|
|
@ -441,14 +441,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v1.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.4s, v0.4s, v8.2s[0]
|
||||
fmla v17.4s, v1.4s, v8.2s[0]
|
||||
fmla v20.4s, v0.4s, v8.2s[1]
|
||||
fmla v21.4s, v1.4s, v8.2s[1]
|
||||
fmla v24.4s, v0.4s, v9.2s[0]
|
||||
fmla v25.4s, v1.4s, v9.2s[0]
|
||||
fmla v28.4s, v0.4s, v9.2s[1]
|
||||
fmla v29.4s, v1.4s, v9.2s[1]
|
||||
fmla v16.4s, v0.4s, v8.s[0]
|
||||
fmla v17.4s, v1.4s, v8.s[0]
|
||||
fmla v20.4s, v0.4s, v8.s[1]
|
||||
fmla v21.4s, v1.4s, v8.s[1]
|
||||
fmla v24.4s, v0.4s, v9.s[0]
|
||||
fmla v25.4s, v1.4s, v9.s[0]
|
||||
fmla v28.4s, v0.4s, v9.s[1]
|
||||
fmla v29.4s, v1.4s, v9.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x4
|
||||
|
|
@ -496,17 +496,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s, v1.2s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmul v16.2s, v0.2s, v8.2s[0]
|
||||
fmul v29.2s, v1.2s, v9.2s[1]
|
||||
fmul v16.2s, v0.2s, v8.s[0]
|
||||
fmul v29.2s, v1.2s, v9.s[1]
|
||||
|
||||
fmul v20.2s, v0.2s, v8.2s[1]
|
||||
fmul v25.2s, v1.2s, v9.2s[0]
|
||||
fmul v20.2s, v0.2s, v8.s[1]
|
||||
fmul v25.2s, v1.2s, v9.s[0]
|
||||
|
||||
fmul v24.2s, v0.2s, v9.2s[0]
|
||||
fmul v21.2s, v1.2s, v8.2s[1]
|
||||
fmul v24.2s, v0.2s, v9.s[0]
|
||||
fmul v21.2s, v1.2s, v8.s[1]
|
||||
|
||||
fmul v28.2s, v0.2s, v9.2s[1]
|
||||
fmul v17.2s, v1.2s, v8.2s[0]
|
||||
fmul v28.2s, v0.2s, v9.s[1]
|
||||
fmul v17.2s, v1.2s, v8.s[0]
|
||||
|
||||
ld1 {v12.2s, v13.2s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -515,61 +515,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL4x4_M1
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v29.2s, v1.2s, v9.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v29.2s, v1.2s, v9.s[1]
|
||||
|
||||
ld1 {v12.2s, v13.2s}, [pB] // For next round
|
||||
add pB, pB, #16
|
||||
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v25.2s, v1.2s, v9.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
fmla v25.2s, v1.2s, v9.s[0]
|
||||
|
||||
ld1 {v4.2s, v5.2s}, [pA] // For next round
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v24.2s, v0.2s, v9.2s[0]
|
||||
fmla v21.2s, v1.2s, v8.2s[1]
|
||||
fmla v24.2s, v0.2s, v9.s[0]
|
||||
fmla v21.2s, v1.2s, v8.s[1]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
|
||||
fmla v28.2s, v0.2s, v9.2s[1]
|
||||
fmla v17.2s, v1.2s, v8.2s[0]
|
||||
fmla v28.2s, v0.2s, v9.s[1]
|
||||
fmla v17.2s, v1.2s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_M2
|
||||
fmla v16.2s, v4.2s, v12.2s[0]
|
||||
fmla v29.2s, v5.2s, v13.2s[1]
|
||||
fmla v16.2s, v4.2s, v12.s[0]
|
||||
fmla v29.2s, v5.2s, v13.s[1]
|
||||
|
||||
ld1 {v8.2s, v9.2s}, [pB] // For next round
|
||||
add pB, pB, #16
|
||||
|
||||
fmla v20.2s, v4.2s, v12.2s[1]
|
||||
fmla v25.2s, v5.2s, v13.2s[0]
|
||||
fmla v20.2s, v4.2s, v12.s[1]
|
||||
fmla v25.2s, v5.2s, v13.s[0]
|
||||
|
||||
ld1 {v0.2s, v1.2s}, [pA] // For next round
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v24.2s, v4.2s, v13.2s[0]
|
||||
fmla v21.2s, v5.2s, v12.2s[1]
|
||||
fmla v24.2s, v4.2s, v13.s[0]
|
||||
fmla v21.2s, v5.2s, v12.s[1]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
|
||||
fmla v28.2s, v4.2s, v13.2s[1]
|
||||
fmla v17.2s, v5.2s, v12.2s[0]
|
||||
fmla v28.2s, v4.2s, v13.s[1]
|
||||
fmla v17.2s, v5.2s, v12.s[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_E
|
||||
fmla v16.2s, v4.2s, v12.2s[0]
|
||||
fmla v29.2s, v5.2s, v13.2s[1]
|
||||
fmla v16.2s, v4.2s, v12.s[0]
|
||||
fmla v29.2s, v5.2s, v13.s[1]
|
||||
|
||||
fmla v20.2s, v4.2s, v12.2s[1]
|
||||
fmla v25.2s, v5.2s, v13.2s[0]
|
||||
fmla v20.2s, v4.2s, v12.s[1]
|
||||
fmla v25.2s, v5.2s, v13.s[0]
|
||||
|
||||
fmla v24.2s, v4.2s, v13.2s[0]
|
||||
fmla v21.2s, v5.2s, v12.2s[1]
|
||||
fmla v24.2s, v4.2s, v13.s[0]
|
||||
fmla v21.2s, v5.2s, v12.s[1]
|
||||
|
||||
fmla v28.2s, v4.2s, v13.2s[1]
|
||||
fmla v17.2s, v5.2s, v12.2s[0]
|
||||
fmla v28.2s, v4.2s, v13.s[1]
|
||||
fmla v17.2s, v5.2s, v12.s[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_SUB
|
||||
|
|
@ -578,17 +578,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s, v1.2s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v29.2s, v1.2s, v9.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v29.2s, v1.2s, v9.s[1]
|
||||
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v25.2s, v1.2s, v9.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
fmla v25.2s, v1.2s, v9.s[0]
|
||||
|
||||
fmla v24.2s, v0.2s, v9.2s[0]
|
||||
fmla v21.2s, v1.2s, v8.2s[1]
|
||||
fmla v24.2s, v0.2s, v9.s[0]
|
||||
fmla v21.2s, v1.2s, v8.s[1]
|
||||
|
||||
fmla v28.2s, v0.2s, v9.2s[1]
|
||||
fmla v17.2s, v1.2s, v8.2s[0]
|
||||
fmla v28.2s, v0.2s, v9.s[1]
|
||||
fmla v17.2s, v1.2s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
|
|
@ -633,10 +633,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s}, [pA]
|
||||
add pA, pA, #8
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v24.2s, v0.2s, v9.2s[0]
|
||||
fmla v28.2s, v0.2s, v9.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
fmla v24.2s, v0.2s, v9.s[0]
|
||||
fmla v28.2s, v0.2s, v9.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
|
|
@ -718,15 +718,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v3.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.4s, v0.4s, v8.2s[0]
|
||||
fmla v17.4s, v1.4s, v8.2s[0]
|
||||
fmla v18.4s, v2.4s, v8.2s[0]
|
||||
fmla v19.4s, v3.4s, v8.2s[0]
|
||||
fmla v16.4s, v0.4s, v8.s[0]
|
||||
fmla v17.4s, v1.4s, v8.s[0]
|
||||
fmla v18.4s, v2.4s, v8.s[0]
|
||||
fmla v19.4s, v3.4s, v8.s[0]
|
||||
|
||||
fmla v20.4s, v0.4s, v8.2s[1]
|
||||
fmla v21.4s, v1.4s, v8.2s[1]
|
||||
fmla v22.4s, v2.4s, v8.2s[1]
|
||||
fmla v23.4s, v3.4s, v8.2s[1]
|
||||
fmla v20.4s, v0.4s, v8.s[1]
|
||||
fmla v21.4s, v1.4s, v8.s[1]
|
||||
fmla v22.4s, v2.4s, v8.s[1]
|
||||
fmla v23.4s, v3.4s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE16x2
|
||||
|
|
@ -764,11 +764,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v1.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.4s, v0.4s, v8.2s[0]
|
||||
fmla v17.4s, v1.4s, v8.2s[0]
|
||||
fmla v16.4s, v0.4s, v8.s[0]
|
||||
fmla v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
fmla v20.4s, v0.4s, v8.2s[1]
|
||||
fmla v21.4s, v1.4s, v8.2s[1]
|
||||
fmla v20.4s, v0.4s, v8.s[1]
|
||||
fmla v21.4s, v1.4s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x2
|
||||
|
|
@ -802,10 +802,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s, v1.2s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v17.2s, v1.2s, v8.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v21.2s, v1.2s, v8.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v17.2s, v1.2s, v8.s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
fmla v21.2s, v1.2s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
|
|
@ -837,8 +837,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s}, [pA]
|
||||
add pA, pA, #8
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
|
|
@ -866,7 +866,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ldr s0 , [pA]
|
||||
add pA, pA, #4
|
||||
|
||||
fmla v16.2s, v8.2s, v0.2s[0]
|
||||
fmla v16.2s, v8.2s, v0.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
|
|
@ -901,10 +901,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v3.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.4s, v0.4s, v8.2s[0]
|
||||
fmla v17.4s, v1.4s, v8.2s[0]
|
||||
fmla v18.4s, v2.4s, v8.2s[0]
|
||||
fmla v19.4s, v3.4s, v8.2s[0]
|
||||
fmla v16.4s, v0.4s, v8.s[0]
|
||||
fmla v17.4s, v1.4s, v8.s[0]
|
||||
fmla v18.4s, v2.4s, v8.s[0]
|
||||
fmla v19.4s, v3.4s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE16x1
|
||||
|
|
@ -934,8 +934,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v1.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.4s, v0.4s, v8.2s[0]
|
||||
fmla v17.4s, v1.4s, v8.2s[0]
|
||||
fmla v16.4s, v0.4s, v8.s[0]
|
||||
fmla v17.4s, v1.4s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x1
|
||||
|
|
@ -961,8 +961,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s, v1.2s}, [pA]
|
||||
add pA , pA, #16
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v17.2s, v1.2s, v8.2s[0]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v17.2s, v1.2s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x1
|
||||
|
|
@ -987,7 +987,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s}, [pA]
|
||||
add pA , pA, #8
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x1
|
||||
|
|
|
|||
|
|
@ -147,17 +147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s, v1.2s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmul v16.2s, v0.2s, v8.2s[0]
|
||||
fmul v29.2s, v1.2s, v9.2s[1]
|
||||
fmul v16.2s, v0.2s, v8.s[0]
|
||||
fmul v29.2s, v1.2s, v9.s[1]
|
||||
|
||||
fmul v20.2s, v0.2s, v8.2s[1]
|
||||
fmul v25.2s, v1.2s, v9.2s[0]
|
||||
fmul v20.2s, v0.2s, v8.s[1]
|
||||
fmul v25.2s, v1.2s, v9.s[0]
|
||||
|
||||
fmul v24.2s, v0.2s, v9.2s[0]
|
||||
fmul v21.2s, v1.2s, v8.2s[1]
|
||||
fmul v24.2s, v0.2s, v9.s[0]
|
||||
fmul v21.2s, v1.2s, v8.s[1]
|
||||
|
||||
fmul v28.2s, v0.2s, v9.2s[1]
|
||||
fmul v17.2s, v1.2s, v8.2s[0]
|
||||
fmul v28.2s, v0.2s, v9.s[1]
|
||||
fmul v17.2s, v1.2s, v8.s[0]
|
||||
|
||||
ld1 {v12.2s, v13.2s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -166,61 +166,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL4x4_M1
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v29.2s, v1.2s, v9.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v29.2s, v1.2s, v9.s[1]
|
||||
|
||||
ld1 {v12.2s, v13.2s}, [pB] // For next round
|
||||
add pB, pB, #16
|
||||
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v25.2s, v1.2s, v9.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
fmla v25.2s, v1.2s, v9.s[0]
|
||||
|
||||
ld1 {v4.2s, v5.2s}, [pA] // For next round
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v24.2s, v0.2s, v9.2s[0]
|
||||
fmla v21.2s, v1.2s, v8.2s[1]
|
||||
fmla v24.2s, v0.2s, v9.s[0]
|
||||
fmla v21.2s, v1.2s, v8.s[1]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
|
||||
fmla v28.2s, v0.2s, v9.2s[1]
|
||||
fmla v17.2s, v1.2s, v8.2s[0]
|
||||
fmla v28.2s, v0.2s, v9.s[1]
|
||||
fmla v17.2s, v1.2s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_M2
|
||||
fmla v16.2s, v4.2s, v12.2s[0]
|
||||
fmla v29.2s, v5.2s, v13.2s[1]
|
||||
fmla v16.2s, v4.2s, v12.s[0]
|
||||
fmla v29.2s, v5.2s, v13.s[1]
|
||||
|
||||
ld1 {v8.2s, v9.2s}, [pB] // For next round
|
||||
add pB, pB, #16
|
||||
|
||||
fmla v20.2s, v4.2s, v12.2s[1]
|
||||
fmla v25.2s, v5.2s, v13.2s[0]
|
||||
fmla v20.2s, v4.2s, v12.s[1]
|
||||
fmla v25.2s, v5.2s, v13.s[0]
|
||||
|
||||
ld1 {v0.2s, v1.2s}, [pA] // For next round
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v24.2s, v4.2s, v13.2s[0]
|
||||
fmla v21.2s, v5.2s, v12.2s[1]
|
||||
fmla v24.2s, v4.2s, v13.s[0]
|
||||
fmla v21.2s, v5.2s, v12.s[1]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
|
||||
fmla v28.2s, v4.2s, v13.2s[1]
|
||||
fmla v17.2s, v5.2s, v12.2s[0]
|
||||
fmla v28.2s, v4.2s, v13.s[1]
|
||||
fmla v17.2s, v5.2s, v12.s[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_E
|
||||
fmla v16.2s, v4.2s, v12.2s[0]
|
||||
fmla v29.2s, v5.2s, v13.2s[1]
|
||||
fmla v16.2s, v4.2s, v12.s[0]
|
||||
fmla v29.2s, v5.2s, v13.s[1]
|
||||
|
||||
fmla v20.2s, v4.2s, v12.2s[1]
|
||||
fmla v25.2s, v5.2s, v13.2s[0]
|
||||
fmla v20.2s, v4.2s, v12.s[1]
|
||||
fmla v25.2s, v5.2s, v13.s[0]
|
||||
|
||||
fmla v24.2s, v4.2s, v13.2s[0]
|
||||
fmla v21.2s, v5.2s, v12.2s[1]
|
||||
fmla v24.2s, v4.2s, v13.s[0]
|
||||
fmla v21.2s, v5.2s, v12.s[1]
|
||||
|
||||
fmla v28.2s, v4.2s, v13.2s[1]
|
||||
fmla v17.2s, v5.2s, v12.2s[0]
|
||||
fmla v28.2s, v4.2s, v13.s[1]
|
||||
fmla v17.2s, v5.2s, v12.s[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_SUB
|
||||
|
|
@ -229,17 +229,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s, v1.2s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v29.2s, v1.2s, v9.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v29.2s, v1.2s, v9.s[1]
|
||||
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v25.2s, v1.2s, v9.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
fmla v25.2s, v1.2s, v9.s[0]
|
||||
|
||||
fmla v24.2s, v0.2s, v9.2s[0]
|
||||
fmla v21.2s, v1.2s, v8.2s[1]
|
||||
fmla v24.2s, v0.2s, v9.s[0]
|
||||
fmla v21.2s, v1.2s, v8.s[1]
|
||||
|
||||
fmla v28.2s, v0.2s, v9.2s[1]
|
||||
fmla v17.2s, v1.2s, v8.2s[0]
|
||||
fmla v28.2s, v0.2s, v9.s[1]
|
||||
fmla v17.2s, v1.2s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
|
|
@ -280,10 +280,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s}, [pA]
|
||||
add pA, pA, #8
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v24.2s, v0.2s, v9.2s[0]
|
||||
fmla v28.2s, v0.2s, v9.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
fmla v24.2s, v0.2s, v9.s[0]
|
||||
fmla v28.2s, v0.2s, v9.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
|
|
@ -353,10 +353,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s, v1.2s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v17.2s, v1.2s, v8.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v21.2s, v1.2s, v8.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v17.2s, v1.2s, v8.s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
fmla v21.2s, v1.2s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
|
|
@ -386,8 +386,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s}, [pA]
|
||||
add pA, pA, #8
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
|
|
@ -414,7 +414,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ldr s0 , [pA]
|
||||
add pA, pA, #4
|
||||
|
||||
fmla v16.2s, v8.2s, v0.2s[0]
|
||||
fmla v16.2s, v8.2s, v0.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
|
|
@ -440,8 +440,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s, v1.2s}, [pA]
|
||||
add pA , pA, #16
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v17.2s, v1.2s, v8.2s[0]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v17.2s, v1.2s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x1
|
||||
|
|
@ -468,7 +468,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s}, [pA]
|
||||
add pA , pA, #8
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x1
|
||||
|
|
|
|||
|
|
@ -159,22 +159,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v1.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmul v16.4s, v0.4s, v4.4s[0]
|
||||
fmul v17.4s, v1.4s, v4.4s[0]
|
||||
fmul v18.4s, v0.4s, v4.4s[1]
|
||||
fmul v19.4s, v1.4s, v4.4s[1]
|
||||
fmul v20.4s, v0.4s, v4.4s[2]
|
||||
fmul v21.4s, v1.4s, v4.4s[2]
|
||||
fmul v22.4s, v0.4s, v4.4s[3]
|
||||
fmul v23.4s, v1.4s, v4.4s[3]
|
||||
fmul v24.4s, v0.4s, v5.4s[0]
|
||||
fmul v25.4s, v1.4s, v5.4s[0]
|
||||
fmul v26.4s, v0.4s, v5.4s[1]
|
||||
fmul v27.4s, v1.4s, v5.4s[1]
|
||||
fmul v28.4s, v0.4s, v5.4s[2]
|
||||
fmul v29.4s, v1.4s, v5.4s[2]
|
||||
fmul v30.4s, v0.4s, v5.4s[3]
|
||||
fmul v31.4s, v1.4s, v5.4s[3]
|
||||
fmul v16.4s, v0.4s, v4.s[0]
|
||||
fmul v17.4s, v1.4s, v4.s[0]
|
||||
fmul v18.4s, v0.4s, v4.s[1]
|
||||
fmul v19.4s, v1.4s, v4.s[1]
|
||||
fmul v20.4s, v0.4s, v4.s[2]
|
||||
fmul v21.4s, v1.4s, v4.s[2]
|
||||
fmul v22.4s, v0.4s, v4.s[3]
|
||||
fmul v23.4s, v1.4s, v4.s[3]
|
||||
fmul v24.4s, v0.4s, v5.s[0]
|
||||
fmul v25.4s, v1.4s, v5.s[0]
|
||||
fmul v26.4s, v0.4s, v5.s[1]
|
||||
fmul v27.4s, v1.4s, v5.s[1]
|
||||
fmul v28.4s, v0.4s, v5.s[2]
|
||||
fmul v29.4s, v1.4s, v5.s[2]
|
||||
fmul v30.4s, v0.4s, v5.s[3]
|
||||
fmul v31.4s, v1.4s, v5.s[3]
|
||||
|
||||
ld1 {v6.4s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -187,22 +187,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL8x8_M1
|
||||
fmla v16.4s, v0.4s, v4.4s[0]
|
||||
fmla v17.4s, v1.4s, v4.4s[0]
|
||||
fmla v18.4s, v0.4s, v4.4s[1]
|
||||
fmla v19.4s, v1.4s, v4.4s[1]
|
||||
fmla v20.4s, v0.4s, v4.4s[2]
|
||||
fmla v21.4s, v1.4s, v4.4s[2]
|
||||
fmla v22.4s, v0.4s, v4.4s[3]
|
||||
fmla v23.4s, v1.4s, v4.4s[3]
|
||||
fmla v24.4s, v0.4s, v5.4s[0]
|
||||
fmla v25.4s, v1.4s, v5.4s[0]
|
||||
fmla v26.4s, v0.4s, v5.4s[1]
|
||||
fmla v27.4s, v1.4s, v5.4s[1]
|
||||
fmla v28.4s, v0.4s, v5.4s[2]
|
||||
fmla v29.4s, v1.4s, v5.4s[2]
|
||||
fmla v30.4s, v0.4s, v5.4s[3]
|
||||
fmla v31.4s, v1.4s, v5.4s[3]
|
||||
fmla v16.4s, v0.4s, v4.s[0]
|
||||
fmla v17.4s, v1.4s, v4.s[0]
|
||||
fmla v18.4s, v0.4s, v4.s[1]
|
||||
fmla v19.4s, v1.4s, v4.s[1]
|
||||
fmla v20.4s, v0.4s, v4.s[2]
|
||||
fmla v21.4s, v1.4s, v4.s[2]
|
||||
fmla v22.4s, v0.4s, v4.s[3]
|
||||
fmla v23.4s, v1.4s, v4.s[3]
|
||||
fmla v24.4s, v0.4s, v5.s[0]
|
||||
fmla v25.4s, v1.4s, v5.s[0]
|
||||
fmla v26.4s, v0.4s, v5.s[1]
|
||||
fmla v27.4s, v1.4s, v5.s[1]
|
||||
fmla v28.4s, v0.4s, v5.s[2]
|
||||
fmla v29.4s, v1.4s, v5.s[2]
|
||||
fmla v30.4s, v0.4s, v5.s[3]
|
||||
fmla v31.4s, v1.4s, v5.s[3]
|
||||
|
||||
ld1 {v6.4s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -215,22 +215,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL8x8_M2
|
||||
fmla v16.4s, v2.4s, v6.4s[0]
|
||||
fmla v17.4s, v3.4s, v6.4s[0]
|
||||
fmla v18.4s, v2.4s, v6.4s[1]
|
||||
fmla v19.4s, v3.4s, v6.4s[1]
|
||||
fmla v20.4s, v2.4s, v6.4s[2]
|
||||
fmla v21.4s, v3.4s, v6.4s[2]
|
||||
fmla v22.4s, v2.4s, v6.4s[3]
|
||||
fmla v23.4s, v3.4s, v6.4s[3]
|
||||
fmla v24.4s, v2.4s, v7.4s[0]
|
||||
fmla v25.4s, v3.4s, v7.4s[0]
|
||||
fmla v26.4s, v2.4s, v7.4s[1]
|
||||
fmla v27.4s, v3.4s, v7.4s[1]
|
||||
fmla v28.4s, v2.4s, v7.4s[2]
|
||||
fmla v29.4s, v3.4s, v7.4s[2]
|
||||
fmla v30.4s, v2.4s, v7.4s[3]
|
||||
fmla v31.4s, v3.4s, v7.4s[3]
|
||||
fmla v16.4s, v2.4s, v6.s[0]
|
||||
fmla v17.4s, v3.4s, v6.s[0]
|
||||
fmla v18.4s, v2.4s, v6.s[1]
|
||||
fmla v19.4s, v3.4s, v6.s[1]
|
||||
fmla v20.4s, v2.4s, v6.s[2]
|
||||
fmla v21.4s, v3.4s, v6.s[2]
|
||||
fmla v22.4s, v2.4s, v6.s[3]
|
||||
fmla v23.4s, v3.4s, v6.s[3]
|
||||
fmla v24.4s, v2.4s, v7.s[0]
|
||||
fmla v25.4s, v3.4s, v7.s[0]
|
||||
fmla v26.4s, v2.4s, v7.s[1]
|
||||
fmla v27.4s, v3.4s, v7.s[1]
|
||||
fmla v28.4s, v2.4s, v7.s[2]
|
||||
fmla v29.4s, v3.4s, v7.s[2]
|
||||
fmla v30.4s, v2.4s, v7.s[3]
|
||||
fmla v31.4s, v3.4s, v7.s[3]
|
||||
|
||||
ld1 {v4.4s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -243,22 +243,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL8x8_E
|
||||
fmla v16.4s, v2.4s, v6.4s[0]
|
||||
fmla v17.4s, v3.4s, v6.4s[0]
|
||||
fmla v18.4s, v2.4s, v6.4s[1]
|
||||
fmla v19.4s, v3.4s, v6.4s[1]
|
||||
fmla v20.4s, v2.4s, v6.4s[2]
|
||||
fmla v21.4s, v3.4s, v6.4s[2]
|
||||
fmla v22.4s, v2.4s, v6.4s[3]
|
||||
fmla v23.4s, v3.4s, v6.4s[3]
|
||||
fmla v24.4s, v2.4s, v7.4s[0]
|
||||
fmla v25.4s, v3.4s, v7.4s[0]
|
||||
fmla v26.4s, v2.4s, v7.4s[1]
|
||||
fmla v27.4s, v3.4s, v7.4s[1]
|
||||
fmla v28.4s, v2.4s, v7.4s[2]
|
||||
fmla v29.4s, v3.4s, v7.4s[2]
|
||||
fmla v30.4s, v2.4s, v7.4s[3]
|
||||
fmla v31.4s, v3.4s, v7.4s[3]
|
||||
fmla v16.4s, v2.4s, v6.s[0]
|
||||
fmla v17.4s, v3.4s, v6.s[0]
|
||||
fmla v18.4s, v2.4s, v6.s[1]
|
||||
fmla v19.4s, v3.4s, v6.s[1]
|
||||
fmla v20.4s, v2.4s, v6.s[2]
|
||||
fmla v21.4s, v3.4s, v6.s[2]
|
||||
fmla v22.4s, v2.4s, v6.s[3]
|
||||
fmla v23.4s, v3.4s, v6.s[3]
|
||||
fmla v24.4s, v2.4s, v7.s[0]
|
||||
fmla v25.4s, v3.4s, v7.s[0]
|
||||
fmla v26.4s, v2.4s, v7.s[1]
|
||||
fmla v27.4s, v3.4s, v7.s[1]
|
||||
fmla v28.4s, v2.4s, v7.s[2]
|
||||
fmla v29.4s, v3.4s, v7.s[2]
|
||||
fmla v30.4s, v2.4s, v7.s[3]
|
||||
fmla v31.4s, v3.4s, v7.s[3]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x8_SUB
|
||||
|
|
@ -271,22 +271,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v1.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.4s, v0.4s, v4.4s[0]
|
||||
fmla v17.4s, v1.4s, v4.4s[0]
|
||||
fmla v18.4s, v0.4s, v4.4s[1]
|
||||
fmla v19.4s, v1.4s, v4.4s[1]
|
||||
fmla v20.4s, v0.4s, v4.4s[2]
|
||||
fmla v21.4s, v1.4s, v4.4s[2]
|
||||
fmla v22.4s, v0.4s, v4.4s[3]
|
||||
fmla v23.4s, v1.4s, v4.4s[3]
|
||||
fmla v24.4s, v0.4s, v5.4s[0]
|
||||
fmla v25.4s, v1.4s, v5.4s[0]
|
||||
fmla v26.4s, v0.4s, v5.4s[1]
|
||||
fmla v27.4s, v1.4s, v5.4s[1]
|
||||
fmla v28.4s, v0.4s, v5.4s[2]
|
||||
fmla v29.4s, v1.4s, v5.4s[2]
|
||||
fmla v30.4s, v0.4s, v5.4s[3]
|
||||
fmla v31.4s, v1.4s, v5.4s[3]
|
||||
fmla v16.4s, v0.4s, v4.s[0]
|
||||
fmla v17.4s, v1.4s, v4.s[0]
|
||||
fmla v18.4s, v0.4s, v4.s[1]
|
||||
fmla v19.4s, v1.4s, v4.s[1]
|
||||
fmla v20.4s, v0.4s, v4.s[2]
|
||||
fmla v21.4s, v1.4s, v4.s[2]
|
||||
fmla v22.4s, v0.4s, v4.s[3]
|
||||
fmla v23.4s, v1.4s, v4.s[3]
|
||||
fmla v24.4s, v0.4s, v5.s[0]
|
||||
fmla v25.4s, v1.4s, v5.s[0]
|
||||
fmla v26.4s, v0.4s, v5.s[1]
|
||||
fmla v27.4s, v1.4s, v5.s[1]
|
||||
fmla v28.4s, v0.4s, v5.s[2]
|
||||
fmla v29.4s, v1.4s, v5.s[2]
|
||||
fmla v30.4s, v0.4s, v5.s[3]
|
||||
fmla v31.4s, v1.4s, v5.s[3]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x8
|
||||
|
|
@ -361,14 +361,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmul v16.4s, v0.4s, v4.4s[0]
|
||||
fmul v18.4s, v0.4s, v4.4s[1]
|
||||
fmul v20.4s, v0.4s, v4.4s[2]
|
||||
fmul v22.4s, v0.4s, v4.4s[3]
|
||||
fmul v24.4s, v0.4s, v5.4s[0]
|
||||
fmul v26.4s, v0.4s, v5.4s[1]
|
||||
fmul v28.4s, v0.4s, v5.4s[2]
|
||||
fmul v30.4s, v0.4s, v5.4s[3]
|
||||
fmul v16.4s, v0.4s, v4.s[0]
|
||||
fmul v18.4s, v0.4s, v4.s[1]
|
||||
fmul v20.4s, v0.4s, v4.s[2]
|
||||
fmul v22.4s, v0.4s, v4.s[3]
|
||||
fmul v24.4s, v0.4s, v5.s[0]
|
||||
fmul v26.4s, v0.4s, v5.s[1]
|
||||
fmul v28.4s, v0.4s, v5.s[2]
|
||||
fmul v30.4s, v0.4s, v5.s[3]
|
||||
|
||||
ld1 {v6.4s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -379,14 +379,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL4x8_M1
|
||||
fmla v16.4s, v0.4s, v4.4s[0]
|
||||
fmla v18.4s, v0.4s, v4.4s[1]
|
||||
fmla v20.4s, v0.4s, v4.4s[2]
|
||||
fmla v22.4s, v0.4s, v4.4s[3]
|
||||
fmla v24.4s, v0.4s, v5.4s[0]
|
||||
fmla v26.4s, v0.4s, v5.4s[1]
|
||||
fmla v28.4s, v0.4s, v5.4s[2]
|
||||
fmla v30.4s, v0.4s, v5.4s[3]
|
||||
fmla v16.4s, v0.4s, v4.s[0]
|
||||
fmla v18.4s, v0.4s, v4.s[1]
|
||||
fmla v20.4s, v0.4s, v4.s[2]
|
||||
fmla v22.4s, v0.4s, v4.s[3]
|
||||
fmla v24.4s, v0.4s, v5.s[0]
|
||||
fmla v26.4s, v0.4s, v5.s[1]
|
||||
fmla v28.4s, v0.4s, v5.s[2]
|
||||
fmla v30.4s, v0.4s, v5.s[3]
|
||||
|
||||
ld1 {v6.4s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -397,14 +397,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL4x8_M2
|
||||
fmla v16.4s, v2.4s, v6.4s[0]
|
||||
fmla v18.4s, v2.4s, v6.4s[1]
|
||||
fmla v20.4s, v2.4s, v6.4s[2]
|
||||
fmla v22.4s, v2.4s, v6.4s[3]
|
||||
fmla v24.4s, v2.4s, v7.4s[0]
|
||||
fmla v26.4s, v2.4s, v7.4s[1]
|
||||
fmla v28.4s, v2.4s, v7.4s[2]
|
||||
fmla v30.4s, v2.4s, v7.4s[3]
|
||||
fmla v16.4s, v2.4s, v6.s[0]
|
||||
fmla v18.4s, v2.4s, v6.s[1]
|
||||
fmla v20.4s, v2.4s, v6.s[2]
|
||||
fmla v22.4s, v2.4s, v6.s[3]
|
||||
fmla v24.4s, v2.4s, v7.s[0]
|
||||
fmla v26.4s, v2.4s, v7.s[1]
|
||||
fmla v28.4s, v2.4s, v7.s[2]
|
||||
fmla v30.4s, v2.4s, v7.s[3]
|
||||
|
||||
ld1 {v4.4s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -415,14 +415,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL4x8_E
|
||||
fmla v16.4s, v2.4s, v6.4s[0]
|
||||
fmla v18.4s, v2.4s, v6.4s[1]
|
||||
fmla v20.4s, v2.4s, v6.4s[2]
|
||||
fmla v22.4s, v2.4s, v6.4s[3]
|
||||
fmla v24.4s, v2.4s, v7.4s[0]
|
||||
fmla v26.4s, v2.4s, v7.4s[1]
|
||||
fmla v28.4s, v2.4s, v7.4s[2]
|
||||
fmla v30.4s, v2.4s, v7.4s[3]
|
||||
fmla v16.4s, v2.4s, v6.s[0]
|
||||
fmla v18.4s, v2.4s, v6.s[1]
|
||||
fmla v20.4s, v2.4s, v6.s[2]
|
||||
fmla v22.4s, v2.4s, v6.s[3]
|
||||
fmla v24.4s, v2.4s, v7.s[0]
|
||||
fmla v26.4s, v2.4s, v7.s[1]
|
||||
fmla v28.4s, v2.4s, v7.s[2]
|
||||
fmla v30.4s, v2.4s, v7.s[3]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x8_SUB
|
||||
|
|
@ -433,14 +433,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.4s, v0.4s, v4.4s[0]
|
||||
fmla v18.4s, v0.4s, v4.4s[1]
|
||||
fmla v20.4s, v0.4s, v4.4s[2]
|
||||
fmla v22.4s, v0.4s, v4.4s[3]
|
||||
fmla v24.4s, v0.4s, v5.4s[0]
|
||||
fmla v26.4s, v0.4s, v5.4s[1]
|
||||
fmla v28.4s, v0.4s, v5.4s[2]
|
||||
fmla v30.4s, v0.4s, v5.4s[3]
|
||||
fmla v16.4s, v0.4s, v4.s[0]
|
||||
fmla v18.4s, v0.4s, v4.s[1]
|
||||
fmla v20.4s, v0.4s, v4.s[2]
|
||||
fmla v22.4s, v0.4s, v4.s[3]
|
||||
fmla v24.4s, v0.4s, v5.s[0]
|
||||
fmla v26.4s, v0.4s, v5.s[1]
|
||||
fmla v28.4s, v0.4s, v5.s[2]
|
||||
fmla v30.4s, v0.4s, v5.s[3]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x8
|
||||
|
|
@ -514,14 +514,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s}, [pA]
|
||||
add pA, pA, #8
|
||||
|
||||
fmla v16.2s, v0.2s, v4.4s[0]
|
||||
fmla v18.2s, v0.2s, v4.4s[1]
|
||||
fmla v20.2s, v0.2s, v4.4s[2]
|
||||
fmla v22.2s, v0.2s, v4.4s[3]
|
||||
fmla v24.2s, v0.2s, v5.4s[0]
|
||||
fmla v26.2s, v0.2s, v5.4s[1]
|
||||
fmla v28.2s, v0.2s, v5.4s[2]
|
||||
fmla v30.2s, v0.2s, v5.4s[3]
|
||||
fmla v16.2s, v0.2s, v4.s[0]
|
||||
fmla v18.2s, v0.2s, v4.s[1]
|
||||
fmla v20.2s, v0.2s, v4.s[2]
|
||||
fmla v22.2s, v0.2s, v4.s[3]
|
||||
fmla v24.2s, v0.2s, v5.s[0]
|
||||
fmla v26.2s, v0.2s, v5.s[1]
|
||||
fmla v28.2s, v0.2s, v5.s[2]
|
||||
fmla v30.2s, v0.2s, v5.s[3]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x8
|
||||
|
|
@ -595,14 +595,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ldr s0, [pA]
|
||||
add pA, pA, #4
|
||||
|
||||
fmla s16, s0, v4.4s[0]
|
||||
fmla s18, s0, v4.4s[1]
|
||||
fmla s20, s0, v4.4s[2]
|
||||
fmla s22, s0, v4.4s[3]
|
||||
fmla s24, s0, v5.4s[0]
|
||||
fmla s26, s0, v5.4s[1]
|
||||
fmla s28, s0, v5.4s[2]
|
||||
fmla s30, s0, v5.4s[3]
|
||||
fmla s16, s0, v4.s[0]
|
||||
fmla s18, s0, v4.s[1]
|
||||
fmla s20, s0, v4.s[2]
|
||||
fmla s22, s0, v4.s[3]
|
||||
fmla s24, s0, v5.s[0]
|
||||
fmla s26, s0, v5.s[1]
|
||||
fmla s28, s0, v5.s[2]
|
||||
fmla s30, s0, v5.s[3]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x8
|
||||
|
|
@ -676,14 +676,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v1.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmul v16.4s, v0.4s, v8.2s[0]
|
||||
fmul v17.4s, v1.4s, v8.2s[0]
|
||||
fmul v20.4s, v0.4s, v8.2s[1]
|
||||
fmul v21.4s, v1.4s, v8.2s[1]
|
||||
fmul v24.4s, v0.4s, v9.2s[0]
|
||||
fmul v25.4s, v1.4s, v9.2s[0]
|
||||
fmul v28.4s, v0.4s, v9.2s[1]
|
||||
fmul v29.4s, v1.4s, v9.2s[1]
|
||||
fmul v16.4s, v0.4s, v8.s[0]
|
||||
fmul v17.4s, v1.4s, v8.s[0]
|
||||
fmul v20.4s, v0.4s, v8.s[1]
|
||||
fmul v21.4s, v1.4s, v8.s[1]
|
||||
fmul v24.4s, v0.4s, v9.s[0]
|
||||
fmul v25.4s, v1.4s, v9.s[0]
|
||||
fmul v28.4s, v0.4s, v9.s[1]
|
||||
fmul v29.4s, v1.4s, v9.s[1]
|
||||
|
||||
ld1 {v12.2s, v13.2s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -694,14 +694,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL8x4_M1
|
||||
fmla v16.4s, v0.4s, v8.2s[0]
|
||||
fmla v17.4s, v1.4s, v8.2s[0]
|
||||
fmla v20.4s, v0.4s, v8.2s[1]
|
||||
fmla v21.4s, v1.4s, v8.2s[1]
|
||||
fmla v24.4s, v0.4s, v9.2s[0]
|
||||
fmla v25.4s, v1.4s, v9.2s[0]
|
||||
fmla v28.4s, v0.4s, v9.2s[1]
|
||||
fmla v29.4s, v1.4s, v9.2s[1]
|
||||
fmla v16.4s, v0.4s, v8.s[0]
|
||||
fmla v17.4s, v1.4s, v8.s[0]
|
||||
fmla v20.4s, v0.4s, v8.s[1]
|
||||
fmla v21.4s, v1.4s, v8.s[1]
|
||||
fmla v24.4s, v0.4s, v9.s[0]
|
||||
fmla v25.4s, v1.4s, v9.s[0]
|
||||
fmla v28.4s, v0.4s, v9.s[1]
|
||||
fmla v29.4s, v1.4s, v9.s[1]
|
||||
|
||||
ld1 {v12.2s, v13.2s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -712,14 +712,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL8x4_M2
|
||||
fmla v16.4s, v4.4s, v12.2s[0]
|
||||
fmla v17.4s, v5.4s, v12.2s[0]
|
||||
fmla v20.4s, v4.4s, v12.2s[1]
|
||||
fmla v21.4s, v5.4s, v12.2s[1]
|
||||
fmla v24.4s, v4.4s, v13.2s[0]
|
||||
fmla v25.4s, v5.4s, v13.2s[0]
|
||||
fmla v28.4s, v4.4s, v13.2s[1]
|
||||
fmla v29.4s, v5.4s, v13.2s[1]
|
||||
fmla v16.4s, v4.4s, v12.s[0]
|
||||
fmla v17.4s, v5.4s, v12.s[0]
|
||||
fmla v20.4s, v4.4s, v12.s[1]
|
||||
fmla v21.4s, v5.4s, v12.s[1]
|
||||
fmla v24.4s, v4.4s, v13.s[0]
|
||||
fmla v25.4s, v5.4s, v13.s[0]
|
||||
fmla v28.4s, v4.4s, v13.s[1]
|
||||
fmla v29.4s, v5.4s, v13.s[1]
|
||||
|
||||
ld1 {v8.2s, v9.2s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -730,14 +730,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL8x4_E
|
||||
fmla v16.4s, v4.4s, v12.2s[0]
|
||||
fmla v17.4s, v5.4s, v12.2s[0]
|
||||
fmla v20.4s, v4.4s, v12.2s[1]
|
||||
fmla v21.4s, v5.4s, v12.2s[1]
|
||||
fmla v24.4s, v4.4s, v13.2s[0]
|
||||
fmla v25.4s, v5.4s, v13.2s[0]
|
||||
fmla v28.4s, v4.4s, v13.2s[1]
|
||||
fmla v29.4s, v5.4s, v13.2s[1]
|
||||
fmla v16.4s, v4.4s, v12.s[0]
|
||||
fmla v17.4s, v5.4s, v12.s[0]
|
||||
fmla v20.4s, v4.4s, v12.s[1]
|
||||
fmla v21.4s, v5.4s, v12.s[1]
|
||||
fmla v24.4s, v4.4s, v13.s[0]
|
||||
fmla v25.4s, v5.4s, v13.s[0]
|
||||
fmla v28.4s, v4.4s, v13.s[1]
|
||||
fmla v29.4s, v5.4s, v13.s[1]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_SUB
|
||||
|
|
@ -748,14 +748,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v1.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.4s, v0.4s, v8.2s[0]
|
||||
fmla v17.4s, v1.4s, v8.2s[0]
|
||||
fmla v20.4s, v0.4s, v8.2s[1]
|
||||
fmla v21.4s, v1.4s, v8.2s[1]
|
||||
fmla v24.4s, v0.4s, v9.2s[0]
|
||||
fmla v25.4s, v1.4s, v9.2s[0]
|
||||
fmla v28.4s, v0.4s, v9.2s[1]
|
||||
fmla v29.4s, v1.4s, v9.2s[1]
|
||||
fmla v16.4s, v0.4s, v8.s[0]
|
||||
fmla v17.4s, v1.4s, v8.s[0]
|
||||
fmla v20.4s, v0.4s, v8.s[1]
|
||||
fmla v21.4s, v1.4s, v8.s[1]
|
||||
fmla v24.4s, v0.4s, v9.s[0]
|
||||
fmla v25.4s, v1.4s, v9.s[0]
|
||||
fmla v28.4s, v0.4s, v9.s[1]
|
||||
fmla v29.4s, v1.4s, v9.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x4
|
||||
|
|
@ -808,17 +808,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s, v1.2s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmul v16.2s, v0.2s, v8.2s[0]
|
||||
fmul v29.2s, v1.2s, v9.2s[1]
|
||||
fmul v16.2s, v0.2s, v8.s[0]
|
||||
fmul v29.2s, v1.2s, v9.s[1]
|
||||
|
||||
fmul v20.2s, v0.2s, v8.2s[1]
|
||||
fmul v25.2s, v1.2s, v9.2s[0]
|
||||
fmul v20.2s, v0.2s, v8.s[1]
|
||||
fmul v25.2s, v1.2s, v9.s[0]
|
||||
|
||||
fmul v24.2s, v0.2s, v9.2s[0]
|
||||
fmul v21.2s, v1.2s, v8.2s[1]
|
||||
fmul v24.2s, v0.2s, v9.s[0]
|
||||
fmul v21.2s, v1.2s, v8.s[1]
|
||||
|
||||
fmul v28.2s, v0.2s, v9.2s[1]
|
||||
fmul v17.2s, v1.2s, v8.2s[0]
|
||||
fmul v28.2s, v0.2s, v9.s[1]
|
||||
fmul v17.2s, v1.2s, v8.s[0]
|
||||
|
||||
ld1 {v12.2s, v13.2s}, [pB]
|
||||
add pB, pB, #16
|
||||
|
|
@ -827,61 +827,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL4x4_M1
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v29.2s, v1.2s, v9.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v29.2s, v1.2s, v9.s[1]
|
||||
|
||||
ld1 {v12.2s, v13.2s}, [pB] // For next round
|
||||
add pB, pB, #16
|
||||
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v25.2s, v1.2s, v9.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
fmla v25.2s, v1.2s, v9.s[0]
|
||||
|
||||
ld1 {v4.2s, v5.2s}, [pA] // For next round
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v24.2s, v0.2s, v9.2s[0]
|
||||
fmla v21.2s, v1.2s, v8.2s[1]
|
||||
fmla v24.2s, v0.2s, v9.s[0]
|
||||
fmla v21.2s, v1.2s, v8.s[1]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
|
||||
fmla v28.2s, v0.2s, v9.2s[1]
|
||||
fmla v17.2s, v1.2s, v8.2s[0]
|
||||
fmla v28.2s, v0.2s, v9.s[1]
|
||||
fmla v17.2s, v1.2s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_M2
|
||||
fmla v16.2s, v4.2s, v12.2s[0]
|
||||
fmla v29.2s, v5.2s, v13.2s[1]
|
||||
fmla v16.2s, v4.2s, v12.s[0]
|
||||
fmla v29.2s, v5.2s, v13.s[1]
|
||||
|
||||
ld1 {v8.2s, v9.2s}, [pB] // For next round
|
||||
add pB, pB, #16
|
||||
|
||||
fmla v20.2s, v4.2s, v12.2s[1]
|
||||
fmla v25.2s, v5.2s, v13.2s[0]
|
||||
fmla v20.2s, v4.2s, v12.s[1]
|
||||
fmla v25.2s, v5.2s, v13.s[0]
|
||||
|
||||
ld1 {v0.2s, v1.2s}, [pA] // For next round
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v24.2s, v4.2s, v13.2s[0]
|
||||
fmla v21.2s, v5.2s, v12.2s[1]
|
||||
fmla v24.2s, v4.2s, v13.s[0]
|
||||
fmla v21.2s, v5.2s, v12.s[1]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
|
||||
fmla v28.2s, v4.2s, v13.2s[1]
|
||||
fmla v17.2s, v5.2s, v12.2s[0]
|
||||
fmla v28.2s, v4.2s, v13.s[1]
|
||||
fmla v17.2s, v5.2s, v12.s[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_E
|
||||
fmla v16.2s, v4.2s, v12.2s[0]
|
||||
fmla v29.2s, v5.2s, v13.2s[1]
|
||||
fmla v16.2s, v4.2s, v12.s[0]
|
||||
fmla v29.2s, v5.2s, v13.s[1]
|
||||
|
||||
fmla v20.2s, v4.2s, v12.2s[1]
|
||||
fmla v25.2s, v5.2s, v13.2s[0]
|
||||
fmla v20.2s, v4.2s, v12.s[1]
|
||||
fmla v25.2s, v5.2s, v13.s[0]
|
||||
|
||||
fmla v24.2s, v4.2s, v13.2s[0]
|
||||
fmla v21.2s, v5.2s, v12.2s[1]
|
||||
fmla v24.2s, v4.2s, v13.s[0]
|
||||
fmla v21.2s, v5.2s, v12.s[1]
|
||||
|
||||
fmla v28.2s, v4.2s, v13.2s[1]
|
||||
fmla v17.2s, v5.2s, v12.2s[0]
|
||||
fmla v28.2s, v4.2s, v13.s[1]
|
||||
fmla v17.2s, v5.2s, v12.s[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_SUB
|
||||
|
|
@ -890,17 +890,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s, v1.2s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v29.2s, v1.2s, v9.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v29.2s, v1.2s, v9.s[1]
|
||||
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v25.2s, v1.2s, v9.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
fmla v25.2s, v1.2s, v9.s[0]
|
||||
|
||||
fmla v24.2s, v0.2s, v9.2s[0]
|
||||
fmla v21.2s, v1.2s, v8.2s[1]
|
||||
fmla v24.2s, v0.2s, v9.s[0]
|
||||
fmla v21.2s, v1.2s, v8.s[1]
|
||||
|
||||
fmla v28.2s, v0.2s, v9.2s[1]
|
||||
fmla v17.2s, v1.2s, v8.2s[0]
|
||||
fmla v28.2s, v0.2s, v9.s[1]
|
||||
fmla v17.2s, v1.2s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
|
|
@ -945,10 +945,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s}, [pA]
|
||||
add pA, pA, #8
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v24.2s, v0.2s, v9.2s[0]
|
||||
fmla v28.2s, v0.2s, v9.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
fmla v24.2s, v0.2s, v9.s[0]
|
||||
fmla v28.2s, v0.2s, v9.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
|
|
@ -1028,11 +1028,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v1.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.4s, v0.4s, v8.2s[0]
|
||||
fmla v17.4s, v1.4s, v8.2s[0]
|
||||
fmla v16.4s, v0.4s, v8.s[0]
|
||||
fmla v17.4s, v1.4s, v8.s[0]
|
||||
|
||||
fmla v20.4s, v0.4s, v8.2s[1]
|
||||
fmla v21.4s, v1.4s, v8.2s[1]
|
||||
fmla v20.4s, v0.4s, v8.s[1]
|
||||
fmla v21.4s, v1.4s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x2
|
||||
|
|
@ -1068,10 +1068,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s, v1.2s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v17.2s, v1.2s, v8.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v21.2s, v1.2s, v8.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v17.2s, v1.2s, v8.s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
fmla v21.2s, v1.2s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
|
|
@ -1103,8 +1103,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s}, [pA]
|
||||
add pA, pA, #8
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v20.2s, v0.2s, v8.2s[1]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v20.2s, v0.2s, v8.s[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
|
|
@ -1133,7 +1133,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ldr s0 , [pA]
|
||||
add pA, pA, #4
|
||||
|
||||
fmla v16.2s, v8.2s, v0.2s[0]
|
||||
fmla v16.2s, v8.2s, v0.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
|
|
@ -1163,8 +1163,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v1.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
fmla v16.4s, v0.4s, v8.2s[0]
|
||||
fmla v17.4s, v1.4s, v8.2s[0]
|
||||
fmla v16.4s, v0.4s, v8.s[0]
|
||||
fmla v17.4s, v1.4s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x1
|
||||
|
|
@ -1190,8 +1190,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s, v1.2s}, [pA]
|
||||
add pA , pA, #16
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v17.2s, v1.2s, v8.2s[0]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
fmla v17.2s, v1.2s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x1
|
||||
|
|
@ -1216,7 +1216,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1 {v0.2s}, [pA]
|
||||
add pA , pA, #8
|
||||
|
||||
fmla v16.2s, v0.2s, v8.2s[0]
|
||||
fmla v16.2s, v0.2s, v8.s[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x1
|
||||
|
|
|
|||
|
|
@ -182,93 +182,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v16.2d, v0.2d, v8.2d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.2d[0]
|
||||
fmul v16.2d, v0.2d, v8.d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v17.16b, v17.16b, v17.16b
|
||||
fmls v17.2d, v0.2d, v9.2d[0]
|
||||
fmls v17.2d, v0.2d, v9.d[0]
|
||||
#else
|
||||
fmul v17.2d, v0.2d, v9.2d[0]
|
||||
fmul v17.2d, v0.2d, v9.d[0]
|
||||
#endif
|
||||
OP_ir v17.2d, v1.2d, v8.2d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
fmul v18.2d, v2.2d, v8.2d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.2d[0]
|
||||
fmul v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v19.16b, v19.16b, v19.16b
|
||||
fmls v19.2d, v2.2d, v9.2d[0]
|
||||
fmls v19.2d, v2.2d, v9.d[0]
|
||||
#else
|
||||
fmul v19.2d, v2.2d, v9.2d[0]
|
||||
fmul v19.2d, v2.2d, v9.d[0]
|
||||
#endif
|
||||
OP_ir v19.2d, v3.2d, v8.2d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
fmul v20.2d, v0.2d, v8.2d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.2d[1]
|
||||
fmul v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v21.16b, v21.16b, v21.16b
|
||||
fmls v21.2d, v0.2d, v9.2d[1]
|
||||
fmls v21.2d, v0.2d, v9.d[1]
|
||||
#else
|
||||
fmul v21.2d, v0.2d, v9.2d[1]
|
||||
fmul v21.2d, v0.2d, v9.d[1]
|
||||
#endif
|
||||
OP_ir v21.2d, v1.2d, v8.2d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
fmul v22.2d, v2.2d, v8.2d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.2d[1]
|
||||
fmul v22.2d, v2.2d, v8.d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v23.16b, v23.16b, v23.16b
|
||||
fmls v23.2d, v2.2d, v9.2d[1]
|
||||
fmls v23.2d, v2.2d, v9.d[1]
|
||||
#else
|
||||
fmul v23.2d, v2.2d, v9.2d[1]
|
||||
fmul v23.2d, v2.2d, v9.d[1]
|
||||
#endif
|
||||
OP_ir v23.2d, v3.2d, v8.2d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.d[1]
|
||||
|
||||
fmul v24.2d, v0.2d, v10.2d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.2d[0]
|
||||
fmul v24.2d, v0.2d, v10.d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fmls v25.2d, v0.2d, v11.2d[0]
|
||||
fmls v25.2d, v0.2d, v11.d[0]
|
||||
#else
|
||||
fmul v25.2d, v0.2d, v11.2d[0]
|
||||
fmul v25.2d, v0.2d, v11.d[0]
|
||||
#endif
|
||||
OP_ir v25.2d, v1.2d, v10.2d[0]
|
||||
OP_ir v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
fmul v26.2d, v2.2d, v10.2d[0]
|
||||
OP_ii v26.2d, v3.2d, v11.2d[0]
|
||||
fmul v26.2d, v2.2d, v10.d[0]
|
||||
OP_ii v26.2d, v3.2d, v11.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v27.16b, v27.16b, v27.16b
|
||||
fmls v27.2d, v2.2d, v11.2d[0]
|
||||
fmls v27.2d, v2.2d, v11.d[0]
|
||||
#else
|
||||
fmul v27.2d, v2.2d, v11.2d[0]
|
||||
fmul v27.2d, v2.2d, v11.d[0]
|
||||
#endif
|
||||
OP_ir v27.2d, v3.2d, v10.2d[0]
|
||||
OP_ir v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
fmul v28.2d, v0.2d, v10.2d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.2d[1]
|
||||
fmul v28.2d, v0.2d, v10.d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v29.16b, v29.16b, v29.16b
|
||||
fmls v29.2d, v0.2d, v11.2d[1]
|
||||
fmls v29.2d, v0.2d, v11.d[1]
|
||||
#else
|
||||
fmul v29.2d, v0.2d, v11.2d[1]
|
||||
fmul v29.2d, v0.2d, v11.d[1]
|
||||
#endif
|
||||
OP_ir v29.2d, v1.2d, v10.2d[1]
|
||||
OP_ir v29.2d, v1.2d, v10.d[1]
|
||||
|
||||
fmul v30.2d, v2.2d, v10.2d[1]
|
||||
OP_ii v30.2d, v3.2d, v11.2d[1]
|
||||
fmul v30.2d, v2.2d, v10.d[1]
|
||||
OP_ii v30.2d, v3.2d, v11.d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v31.16b, v31.16b, v31.16b
|
||||
fmls v31.2d, v2.2d, v11.2d[1]
|
||||
fmls v31.2d, v2.2d, v11.d[1]
|
||||
#else
|
||||
fmul v31.2d, v2.2d, v11.2d[1]
|
||||
fmul v31.2d, v2.2d, v11.d[1]
|
||||
#endif
|
||||
OP_ir v31.2d, v3.2d, v10.2d[1]
|
||||
OP_ir v31.2d, v3.2d, v10.d[1]
|
||||
|
||||
ld2 {v12.2d, v13.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
|
@ -281,161 +281,161 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL4x4_M1
|
||||
OP_rr v16.2d, v0.2d, v8.2d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.2d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.2d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.2d[0]
|
||||
OP_rr v16.2d, v0.2d, v8.d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
ld2 {v12.2d, v13.2d}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v18.2d, v2.2d, v8.2d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.2d[0]
|
||||
OP_ri v19.2d, v2.2d, v9.2d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.2d[0]
|
||||
OP_rr v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
OP_ri v19.2d, v2.2d, v9.d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
ld2 {v14.2d, v15.2d}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v20.2d, v0.2d, v8.2d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.2d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.2d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.2d[1]
|
||||
OP_rr v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
ld2 {v4.2d, v5.2d} , [pA] // For next round
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v22.2d, v2.2d, v8.2d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.2d[1]
|
||||
OP_ri v23.2d, v2.2d, v9.2d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.2d[1]
|
||||
OP_rr v22.2d, v2.2d, v8.d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.d[1]
|
||||
OP_ri v23.2d, v2.2d, v9.d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.d[1]
|
||||
|
||||
ld2 {v6.2d, v7.2d} , [pA] // For next round
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v24.2d, v0.2d, v10.2d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.2d[0]
|
||||
OP_ri v25.2d, v0.2d, v11.2d[0]
|
||||
OP_ir v25.2d, v1.2d, v10.2d[0]
|
||||
OP_rr v24.2d, v0.2d, v10.d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.d[0]
|
||||
OP_ri v25.2d, v0.2d, v11.d[0]
|
||||
OP_ir v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
|
||||
OP_rr v26.2d, v2.2d, v10.2d[0]
|
||||
OP_ii v26.2d, v3.2d, v11.2d[0]
|
||||
OP_ri v27.2d, v2.2d, v11.2d[0]
|
||||
OP_ir v27.2d, v3.2d, v10.2d[0]
|
||||
OP_rr v26.2d, v2.2d, v10.d[0]
|
||||
OP_ii v26.2d, v3.2d, v11.d[0]
|
||||
OP_ri v27.2d, v2.2d, v11.d[0]
|
||||
OP_ir v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
|
||||
OP_rr v28.2d, v0.2d, v10.2d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.2d[1]
|
||||
OP_ri v29.2d, v0.2d, v11.2d[1]
|
||||
OP_ir v29.2d, v1.2d, v10.2d[1]
|
||||
OP_rr v28.2d, v0.2d, v10.d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.d[1]
|
||||
OP_ri v29.2d, v0.2d, v11.d[1]
|
||||
OP_ir v29.2d, v1.2d, v10.d[1]
|
||||
|
||||
OP_rr v30.2d, v2.2d, v10.2d[1]
|
||||
OP_ii v30.2d, v3.2d, v11.2d[1]
|
||||
OP_ri v31.2d, v2.2d, v11.2d[1]
|
||||
OP_ir v31.2d, v3.2d, v10.2d[1]
|
||||
OP_rr v30.2d, v2.2d, v10.d[1]
|
||||
OP_ii v30.2d, v3.2d, v11.d[1]
|
||||
OP_ri v31.2d, v2.2d, v11.d[1]
|
||||
OP_ir v31.2d, v3.2d, v10.d[1]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_M2
|
||||
OP_rr v16.2d, v4.2d, v12.2d[0]
|
||||
OP_ii v16.2d, v5.2d, v13.2d[0]
|
||||
OP_ri v17.2d, v4.2d, v13.2d[0]
|
||||
OP_ir v17.2d, v5.2d, v12.2d[0]
|
||||
OP_rr v16.2d, v4.2d, v12.d[0]
|
||||
OP_ii v16.2d, v5.2d, v13.d[0]
|
||||
OP_ri v17.2d, v4.2d, v13.d[0]
|
||||
OP_ir v17.2d, v5.2d, v12.d[0]
|
||||
|
||||
ld2 {v8.2d, v9.2d}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v18.2d, v6.2d, v12.2d[0]
|
||||
OP_ii v18.2d, v7.2d, v13.2d[0]
|
||||
OP_ri v19.2d, v6.2d, v13.2d[0]
|
||||
OP_ir v19.2d, v7.2d, v12.2d[0]
|
||||
OP_rr v18.2d, v6.2d, v12.d[0]
|
||||
OP_ii v18.2d, v7.2d, v13.d[0]
|
||||
OP_ri v19.2d, v6.2d, v13.d[0]
|
||||
OP_ir v19.2d, v7.2d, v12.d[0]
|
||||
|
||||
ld2 {v10.2d, v11.2d}, [pB] // For next round
|
||||
add pB, pB, #32
|
||||
|
||||
OP_rr v20.2d, v4.2d, v12.2d[1]
|
||||
OP_ii v20.2d, v5.2d, v13.2d[1]
|
||||
OP_ri v21.2d, v4.2d, v13.2d[1]
|
||||
OP_ir v21.2d, v5.2d, v12.2d[1]
|
||||
OP_rr v20.2d, v4.2d, v12.d[1]
|
||||
OP_ii v20.2d, v5.2d, v13.d[1]
|
||||
OP_ri v21.2d, v4.2d, v13.d[1]
|
||||
OP_ir v21.2d, v5.2d, v12.d[1]
|
||||
|
||||
ld2 {v0.2d, v1.2d}, [pA] // For next round
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v22.2d, v6.2d, v12.2d[1]
|
||||
OP_ii v22.2d, v7.2d, v13.2d[1]
|
||||
OP_ri v23.2d, v6.2d, v13.2d[1]
|
||||
OP_ir v23.2d, v7.2d, v12.2d[1]
|
||||
OP_rr v22.2d, v6.2d, v12.d[1]
|
||||
OP_ii v22.2d, v7.2d, v13.d[1]
|
||||
OP_ri v23.2d, v6.2d, v13.d[1]
|
||||
OP_ir v23.2d, v7.2d, v12.d[1]
|
||||
|
||||
ld2 {v2.2d, v3.2d}, [pA] // For next round
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v24.2d, v4.2d, v14.2d[0]
|
||||
OP_ii v24.2d, v5.2d, v15.2d[0]
|
||||
OP_ri v25.2d, v4.2d, v15.2d[0]
|
||||
OP_ir v25.2d, v5.2d, v14.2d[0]
|
||||
OP_rr v24.2d, v4.2d, v14.d[0]
|
||||
OP_ii v24.2d, v5.2d, v15.d[0]
|
||||
OP_ri v25.2d, v4.2d, v15.d[0]
|
||||
OP_ir v25.2d, v5.2d, v14.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
|
||||
OP_rr v26.2d, v6.2d, v14.2d[0]
|
||||
OP_ii v26.2d, v7.2d, v15.2d[0]
|
||||
OP_ri v27.2d, v6.2d, v15.2d[0]
|
||||
OP_ir v27.2d, v7.2d, v14.2d[0]
|
||||
OP_rr v26.2d, v6.2d, v14.d[0]
|
||||
OP_ii v26.2d, v7.2d, v15.d[0]
|
||||
OP_ri v27.2d, v6.2d, v15.d[0]
|
||||
OP_ir v27.2d, v7.2d, v14.d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
|
||||
OP_rr v28.2d, v4.2d, v14.2d[1]
|
||||
OP_ii v28.2d, v5.2d, v15.2d[1]
|
||||
OP_ri v29.2d, v4.2d, v15.2d[1]
|
||||
OP_ir v29.2d, v5.2d, v14.2d[1]
|
||||
OP_rr v28.2d, v4.2d, v14.d[1]
|
||||
OP_ii v28.2d, v5.2d, v15.d[1]
|
||||
OP_ri v29.2d, v4.2d, v15.d[1]
|
||||
OP_ir v29.2d, v5.2d, v14.d[1]
|
||||
|
||||
OP_rr v30.2d, v6.2d, v14.2d[1]
|
||||
OP_ii v30.2d, v7.2d, v15.2d[1]
|
||||
OP_ri v31.2d, v6.2d, v15.2d[1]
|
||||
OP_ir v31.2d, v7.2d, v14.2d[1]
|
||||
OP_rr v30.2d, v6.2d, v14.d[1]
|
||||
OP_ii v30.2d, v7.2d, v15.d[1]
|
||||
OP_ri v31.2d, v6.2d, v15.d[1]
|
||||
OP_ir v31.2d, v7.2d, v14.d[1]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_E
|
||||
OP_rr v16.2d, v4.2d, v12.2d[0]
|
||||
OP_ii v16.2d, v5.2d, v13.2d[0]
|
||||
OP_ri v17.2d, v4.2d, v13.2d[0]
|
||||
OP_ir v17.2d, v5.2d, v12.2d[0]
|
||||
OP_rr v16.2d, v4.2d, v12.d[0]
|
||||
OP_ii v16.2d, v5.2d, v13.d[0]
|
||||
OP_ri v17.2d, v4.2d, v13.d[0]
|
||||
OP_ir v17.2d, v5.2d, v12.d[0]
|
||||
|
||||
OP_rr v18.2d, v6.2d, v12.2d[0]
|
||||
OP_ii v18.2d, v7.2d, v13.2d[0]
|
||||
OP_ri v19.2d, v6.2d, v13.2d[0]
|
||||
OP_ir v19.2d, v7.2d, v12.2d[0]
|
||||
OP_rr v18.2d, v6.2d, v12.d[0]
|
||||
OP_ii v18.2d, v7.2d, v13.d[0]
|
||||
OP_ri v19.2d, v6.2d, v13.d[0]
|
||||
OP_ir v19.2d, v7.2d, v12.d[0]
|
||||
|
||||
OP_rr v20.2d, v4.2d, v12.2d[1]
|
||||
OP_ii v20.2d, v5.2d, v13.2d[1]
|
||||
OP_ri v21.2d, v4.2d, v13.2d[1]
|
||||
OP_ir v21.2d, v5.2d, v12.2d[1]
|
||||
OP_rr v20.2d, v4.2d, v12.d[1]
|
||||
OP_ii v20.2d, v5.2d, v13.d[1]
|
||||
OP_ri v21.2d, v4.2d, v13.d[1]
|
||||
OP_ir v21.2d, v5.2d, v12.d[1]
|
||||
|
||||
OP_rr v22.2d, v6.2d, v12.2d[1]
|
||||
OP_ii v22.2d, v7.2d, v13.2d[1]
|
||||
OP_ri v23.2d, v6.2d, v13.2d[1]
|
||||
OP_ir v23.2d, v7.2d, v12.2d[1]
|
||||
OP_rr v22.2d, v6.2d, v12.d[1]
|
||||
OP_ii v22.2d, v7.2d, v13.d[1]
|
||||
OP_ri v23.2d, v6.2d, v13.d[1]
|
||||
OP_ir v23.2d, v7.2d, v12.d[1]
|
||||
|
||||
OP_rr v24.2d, v4.2d, v14.2d[0]
|
||||
OP_ii v24.2d, v5.2d, v15.2d[0]
|
||||
OP_ri v25.2d, v4.2d, v15.2d[0]
|
||||
OP_ir v25.2d, v5.2d, v14.2d[0]
|
||||
OP_rr v24.2d, v4.2d, v14.d[0]
|
||||
OP_ii v24.2d, v5.2d, v15.d[0]
|
||||
OP_ri v25.2d, v4.2d, v15.d[0]
|
||||
OP_ir v25.2d, v5.2d, v14.d[0]
|
||||
|
||||
OP_rr v26.2d, v6.2d, v14.2d[0]
|
||||
OP_ii v26.2d, v7.2d, v15.2d[0]
|
||||
OP_ri v27.2d, v6.2d, v15.2d[0]
|
||||
OP_ir v27.2d, v7.2d, v14.2d[0]
|
||||
OP_rr v26.2d, v6.2d, v14.d[0]
|
||||
OP_ii v26.2d, v7.2d, v15.d[0]
|
||||
OP_ri v27.2d, v6.2d, v15.d[0]
|
||||
OP_ir v27.2d, v7.2d, v14.d[0]
|
||||
|
||||
OP_rr v28.2d, v4.2d, v14.2d[1]
|
||||
OP_ii v28.2d, v5.2d, v15.2d[1]
|
||||
OP_ri v29.2d, v4.2d, v15.2d[1]
|
||||
OP_ir v29.2d, v5.2d, v14.2d[1]
|
||||
OP_rr v28.2d, v4.2d, v14.d[1]
|
||||
OP_ii v28.2d, v5.2d, v15.d[1]
|
||||
OP_ri v29.2d, v4.2d, v15.d[1]
|
||||
OP_ir v29.2d, v5.2d, v14.d[1]
|
||||
|
||||
OP_rr v30.2d, v6.2d, v14.2d[1]
|
||||
OP_ii v30.2d, v7.2d, v15.2d[1]
|
||||
OP_ri v31.2d, v6.2d, v15.2d[1]
|
||||
OP_ir v31.2d, v7.2d, v14.2d[1]
|
||||
OP_rr v30.2d, v6.2d, v14.d[1]
|
||||
OP_ii v30.2d, v7.2d, v15.d[1]
|
||||
OP_ri v31.2d, v6.2d, v15.d[1]
|
||||
OP_ir v31.2d, v7.2d, v14.d[1]
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_SUB
|
||||
|
|
@ -448,45 +448,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.2d, v0.2d, v8.2d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.2d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.2d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.2d[0]
|
||||
OP_rr v16.2d, v0.2d, v8.d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
OP_rr v18.2d, v2.2d, v8.2d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.2d[0]
|
||||
OP_ri v19.2d, v2.2d, v9.2d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.2d[0]
|
||||
OP_rr v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
OP_ri v19.2d, v2.2d, v9.d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
OP_rr v20.2d, v0.2d, v8.2d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.2d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.2d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.2d[1]
|
||||
OP_rr v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
OP_rr v22.2d, v2.2d, v8.2d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.2d[1]
|
||||
OP_ri v23.2d, v2.2d, v9.2d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.2d[1]
|
||||
OP_rr v22.2d, v2.2d, v8.d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.d[1]
|
||||
OP_ri v23.2d, v2.2d, v9.d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.d[1]
|
||||
|
||||
OP_rr v24.2d, v0.2d, v10.2d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.2d[0]
|
||||
OP_ri v25.2d, v0.2d, v11.2d[0]
|
||||
OP_ir v25.2d, v1.2d, v10.2d[0]
|
||||
OP_rr v24.2d, v0.2d, v10.d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.d[0]
|
||||
OP_ri v25.2d, v0.2d, v11.d[0]
|
||||
OP_ir v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
OP_rr v26.2d, v2.2d, v10.2d[0]
|
||||
OP_ii v26.2d, v3.2d, v11.2d[0]
|
||||
OP_ri v27.2d, v2.2d, v11.2d[0]
|
||||
OP_ir v27.2d, v3.2d, v10.2d[0]
|
||||
OP_rr v26.2d, v2.2d, v10.d[0]
|
||||
OP_ii v26.2d, v3.2d, v11.d[0]
|
||||
OP_ri v27.2d, v2.2d, v11.d[0]
|
||||
OP_ir v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
OP_rr v28.2d, v0.2d, v10.2d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.2d[1]
|
||||
OP_ri v29.2d, v0.2d, v11.2d[1]
|
||||
OP_ir v29.2d, v1.2d, v10.2d[1]
|
||||
OP_rr v28.2d, v0.2d, v10.d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.d[1]
|
||||
OP_ri v29.2d, v0.2d, v11.d[1]
|
||||
OP_ir v29.2d, v1.2d, v10.d[1]
|
||||
|
||||
OP_rr v30.2d, v2.2d, v10.2d[1]
|
||||
OP_ii v30.2d, v3.2d, v11.2d[1]
|
||||
OP_ri v31.2d, v2.2d, v11.2d[1]
|
||||
OP_ir v31.2d, v3.2d, v10.2d[1]
|
||||
OP_rr v30.2d, v2.2d, v10.d[1]
|
||||
OP_ii v30.2d, v3.2d, v11.d[1]
|
||||
OP_ri v31.2d, v2.2d, v11.d[1]
|
||||
OP_ir v31.2d, v3.2d, v10.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
|
|
@ -582,25 +582,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.2d, v0.2d, v8.2d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.2d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.2d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.2d[0]
|
||||
OP_rr v16.2d, v0.2d, v8.d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
OP_rr v20.2d, v0.2d, v8.2d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.2d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.2d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.2d[1]
|
||||
OP_rr v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
OP_rr v24.2d, v0.2d, v10.2d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.2d[0]
|
||||
OP_ri v25.2d, v0.2d, v11.2d[0]
|
||||
OP_ir v25.2d, v1.2d, v10.2d[0]
|
||||
OP_rr v24.2d, v0.2d, v10.d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.d[0]
|
||||
OP_ri v25.2d, v0.2d, v11.d[0]
|
||||
OP_ir v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
OP_rr v28.2d, v0.2d, v10.2d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.2d[1]
|
||||
OP_ri v29.2d, v0.2d, v11.2d[1]
|
||||
OP_ir v29.2d, v1.2d, v10.2d[1]
|
||||
OP_rr v28.2d, v0.2d, v10.d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.d[1]
|
||||
OP_ri v29.2d, v0.2d, v11.d[1]
|
||||
OP_ir v29.2d, v1.2d, v10.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
|
|
@ -669,25 +669,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.d, v1.d}[0], [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
OP_rr d16, d0, v8.2d[0]
|
||||
OP_ii d16, d1, v9.2d[0]
|
||||
OP_ri d17, d0, v9.2d[0]
|
||||
OP_ir d17, d1, v8.2d[0]
|
||||
OP_rr d16, d0, v8.d[0]
|
||||
OP_ii d16, d1, v9.d[0]
|
||||
OP_ri d17, d0, v9.d[0]
|
||||
OP_ir d17, d1, v8.d[0]
|
||||
|
||||
OP_rr d20, d0, v8.2d[1]
|
||||
OP_ii d20, d1, v9.2d[1]
|
||||
OP_ri d21, d0, v9.2d[1]
|
||||
OP_ir d21, d1, v8.2d[1]
|
||||
OP_rr d20, d0, v8.d[1]
|
||||
OP_ii d20, d1, v9.d[1]
|
||||
OP_ri d21, d0, v9.d[1]
|
||||
OP_ir d21, d1, v8.d[1]
|
||||
|
||||
OP_rr d24, d0, v10.2d[0]
|
||||
OP_ii d24, d1, v11.2d[0]
|
||||
OP_ri d25, d0, v11.2d[0]
|
||||
OP_ir d25, d1, v10.2d[0]
|
||||
OP_rr d24, d0, v10.d[0]
|
||||
OP_ii d24, d1, v11.d[0]
|
||||
OP_ri d25, d0, v11.d[0]
|
||||
OP_ir d25, d1, v10.d[0]
|
||||
|
||||
OP_rr d28, d0, v10.2d[1]
|
||||
OP_ii d28, d1, v11.2d[1]
|
||||
OP_ri d29, d0, v11.2d[1]
|
||||
OP_ir d29, d1, v10.2d[1]
|
||||
OP_rr d28, d0, v10.d[1]
|
||||
OP_ii d28, d1, v11.d[1]
|
||||
OP_ri d29, d0, v11.d[1]
|
||||
OP_ir d29, d1, v10.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x4
|
||||
|
|
@ -756,25 +756,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.2d, v0.2d, v8.2d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.2d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.2d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.2d[0]
|
||||
OP_rr v16.2d, v0.2d, v8.d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
OP_rr v18.2d, v2.2d, v8.2d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.2d[0]
|
||||
OP_ri v19.2d, v2.2d, v9.2d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.2d[0]
|
||||
OP_rr v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
OP_ri v19.2d, v2.2d, v9.d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
OP_rr v20.2d, v0.2d, v8.2d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.2d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.2d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.2d[1]
|
||||
OP_rr v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
OP_rr v22.2d, v2.2d, v8.2d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.2d[1]
|
||||
OP_ri v23.2d, v2.2d, v9.2d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.2d[1]
|
||||
OP_rr v22.2d, v2.2d, v8.d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.d[1]
|
||||
OP_ri v23.2d, v2.2d, v9.d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
|
|
@ -833,15 +833,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.2d, v0.2d, v8.2d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.2d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.2d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.2d[0]
|
||||
OP_rr v16.2d, v0.2d, v8.d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
OP_rr v20.2d, v0.2d, v8.2d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.2d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.2d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.2d[1]
|
||||
OP_rr v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
|
|
@ -886,15 +886,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.d, v1.d}[0], [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
OP_rr d16, d0, v8.2d[0]
|
||||
OP_ii d16, d1, v9.2d[0]
|
||||
OP_ri d17, d0, v9.2d[0]
|
||||
OP_ir d17, d1, v8.2d[0]
|
||||
OP_rr d16, d0, v8.d[0]
|
||||
OP_ii d16, d1, v9.d[0]
|
||||
OP_ri d17, d0, v9.d[0]
|
||||
OP_ir d17, d1, v8.d[0]
|
||||
|
||||
OP_rr d20, d0, v8.2d[1]
|
||||
OP_ii d20, d1, v9.2d[1]
|
||||
OP_ri d21, d0, v9.2d[1]
|
||||
OP_ir d21, d1, v8.2d[1]
|
||||
OP_rr d20, d0, v8.d[1]
|
||||
OP_ii d20, d1, v9.d[1]
|
||||
OP_ri d21, d0, v9.d[1]
|
||||
OP_ir d21, d1, v8.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
|
|
|
|||
|
|
@ -185,93 +185,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v16.2d, v0.2d, v8.2d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.2d[0]
|
||||
fmul v16.2d, v0.2d, v8.d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v17.16b, v17.16b, v17.16b
|
||||
fmls v17.2d, v0.2d, v9.2d[0]
|
||||
fmls v17.2d, v0.2d, v9.d[0]
|
||||
#else
|
||||
fmul v17.2d, v0.2d, v9.2d[0]
|
||||
fmul v17.2d, v0.2d, v9.d[0]
|
||||
#endif
|
||||
OP_ir v17.2d, v1.2d, v8.2d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
fmul v18.2d, v2.2d, v8.2d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.2d[0]
|
||||
fmul v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v19.16b, v19.16b, v19.16b
|
||||
fmls v19.2d, v2.2d, v9.2d[0]
|
||||
fmls v19.2d, v2.2d, v9.d[0]
|
||||
#else
|
||||
fmul v19.2d, v2.2d, v9.2d[0]
|
||||
fmul v19.2d, v2.2d, v9.d[0]
|
||||
#endif
|
||||
OP_ir v19.2d, v3.2d, v8.2d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
fmul v20.2d, v0.2d, v8.2d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.2d[1]
|
||||
fmul v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v21.16b, v21.16b, v21.16b
|
||||
fmls v21.2d, v0.2d, v9.2d[1]
|
||||
fmls v21.2d, v0.2d, v9.d[1]
|
||||
#else
|
||||
fmul v21.2d, v0.2d, v9.2d[1]
|
||||
fmul v21.2d, v0.2d, v9.d[1]
|
||||
#endif
|
||||
OP_ir v21.2d, v1.2d, v8.2d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
fmul v22.2d, v2.2d, v8.2d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.2d[1]
|
||||
fmul v22.2d, v2.2d, v8.d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v23.16b, v23.16b, v23.16b
|
||||
fmls v23.2d, v2.2d, v9.2d[1]
|
||||
fmls v23.2d, v2.2d, v9.d[1]
|
||||
#else
|
||||
fmul v23.2d, v2.2d, v9.2d[1]
|
||||
fmul v23.2d, v2.2d, v9.d[1]
|
||||
#endif
|
||||
OP_ir v23.2d, v3.2d, v8.2d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.d[1]
|
||||
|
||||
fmul v24.2d, v0.2d, v10.2d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.2d[0]
|
||||
fmul v24.2d, v0.2d, v10.d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fmls v25.2d, v0.2d, v11.2d[0]
|
||||
fmls v25.2d, v0.2d, v11.d[0]
|
||||
#else
|
||||
fmul v25.2d, v0.2d, v11.2d[0]
|
||||
fmul v25.2d, v0.2d, v11.d[0]
|
||||
#endif
|
||||
OP_ir v25.2d, v1.2d, v10.2d[0]
|
||||
OP_ir v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
fmul v26.2d, v2.2d, v10.2d[0]
|
||||
OP_ii v26.2d, v3.2d, v11.2d[0]
|
||||
fmul v26.2d, v2.2d, v10.d[0]
|
||||
OP_ii v26.2d, v3.2d, v11.d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v27.16b, v27.16b, v27.16b
|
||||
fmls v27.2d, v2.2d, v11.2d[0]
|
||||
fmls v27.2d, v2.2d, v11.d[0]
|
||||
#else
|
||||
fmul v27.2d, v2.2d, v11.2d[0]
|
||||
fmul v27.2d, v2.2d, v11.d[0]
|
||||
#endif
|
||||
OP_ir v27.2d, v3.2d, v10.2d[0]
|
||||
OP_ir v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
fmul v28.2d, v0.2d, v10.2d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.2d[1]
|
||||
fmul v28.2d, v0.2d, v10.d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v29.16b, v29.16b, v29.16b
|
||||
fmls v29.2d, v0.2d, v11.2d[1]
|
||||
fmls v29.2d, v0.2d, v11.d[1]
|
||||
#else
|
||||
fmul v29.2d, v0.2d, v11.2d[1]
|
||||
fmul v29.2d, v0.2d, v11.d[1]
|
||||
#endif
|
||||
OP_ir v29.2d, v1.2d, v10.2d[1]
|
||||
OP_ir v29.2d, v1.2d, v10.d[1]
|
||||
|
||||
fmul v30.2d, v2.2d, v10.2d[1]
|
||||
OP_ii v30.2d, v3.2d, v11.2d[1]
|
||||
fmul v30.2d, v2.2d, v10.d[1]
|
||||
OP_ii v30.2d, v3.2d, v11.d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
eor v31.16b, v31.16b, v31.16b
|
||||
fmls v31.2d, v2.2d, v11.2d[1]
|
||||
fmls v31.2d, v2.2d, v11.d[1]
|
||||
#else
|
||||
fmul v31.2d, v2.2d, v11.2d[1]
|
||||
fmul v31.2d, v2.2d, v11.d[1]
|
||||
#endif
|
||||
OP_ir v31.2d, v3.2d, v10.2d[1]
|
||||
OP_ir v31.2d, v3.2d, v10.d[1]
|
||||
|
||||
ld2 {v12.2d, v13.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
|
|
@ -284,161 +284,161 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
// KERNEL4x4_M1: one step of the software-pipelined 4x4 kernel.
//   Consumes : v0-v3 (loaded from pA earlier) and lanes 0/1 of v8-v11 (from pB).
//   Updates  : v16-v31 through the OP_rr/OP_ii/OP_ri/OP_ir macros.
//   Preloads : v4-v7 from pA and v12-v15 from pB for the next round
//              (KERNEL4x4_M2 / KERNEL4x4_E read those registers).
// NOTE(review): the OP_* macros are defined outside this excerpt; presumably
// they expand to fmla/fmls depending on the conjugation mode - confirm.
// The by-element operand is written vN.d[i]; the older vN.2d[i] spelling is
// not accepted by every assembler, so only the vN.d[i] form is kept.
.macro KERNEL4x4_M1
	OP_rr	v16.2d, v0.2d, v8.d[0]
	OP_ii	v16.2d, v1.2d, v9.d[0]
	OP_ri	v17.2d, v0.2d, v9.d[0]
	OP_ir	v17.2d, v1.2d, v8.d[0]

	ld2	{v12.2d, v13.2d}, [pB]		// For next round
	add	pB, pB, #32

	OP_rr	v18.2d, v2.2d, v8.d[0]
	OP_ii	v18.2d, v3.2d, v9.d[0]
	OP_ri	v19.2d, v2.2d, v9.d[0]
	OP_ir	v19.2d, v3.2d, v8.d[0]

	ld2	{v14.2d, v15.2d}, [pB]		// For next round
	add	pB, pB, #32

	OP_rr	v20.2d, v0.2d, v8.d[1]
	OP_ii	v20.2d, v1.2d, v9.d[1]
	OP_ri	v21.2d, v0.2d, v9.d[1]
	OP_ir	v21.2d, v1.2d, v8.d[1]

	ld2	{v4.2d, v5.2d} , [pA]		// For next round
	add	pA, pA, #32

	OP_rr	v22.2d, v2.2d, v8.d[1]
	OP_ii	v22.2d, v3.2d, v9.d[1]
	OP_ri	v23.2d, v2.2d, v9.d[1]
	OP_ir	v23.2d, v3.2d, v8.d[1]

	ld2	{v6.2d, v7.2d} , [pA]		// For next round
	add	pA, pA, #32

	OP_rr	v24.2d, v0.2d, v10.d[0]
	OP_ii	v24.2d, v1.2d, v11.d[0]
	OP_ri	v25.2d, v0.2d, v11.d[0]
	OP_ir	v25.2d, v1.2d, v10.d[0]

	prfm	PLDL1KEEP, [pA, #512]		// prefetch 512 bytes ahead of pA

	OP_rr	v26.2d, v2.2d, v10.d[0]
	OP_ii	v26.2d, v3.2d, v11.d[0]
	OP_ri	v27.2d, v2.2d, v11.d[0]
	OP_ir	v27.2d, v3.2d, v10.d[0]

	prfm	PLDL1KEEP, [pB, #512]		// prefetch 512 bytes ahead of pB

	OP_rr	v28.2d, v0.2d, v10.d[1]
	OP_ii	v28.2d, v1.2d, v11.d[1]
	OP_ri	v29.2d, v0.2d, v11.d[1]
	OP_ir	v29.2d, v1.2d, v10.d[1]

	OP_rr	v30.2d, v2.2d, v10.d[1]
	OP_ii	v30.2d, v3.2d, v11.d[1]
	OP_ri	v31.2d, v2.2d, v11.d[1]
	OP_ir	v31.2d, v3.2d, v10.d[1]
.endm
|
||||
|
||||
// KERNEL4x4_M2: counterpart of KERNEL4x4_M1 working on the alternate
// register set.
//   Consumes : v4-v7 (from pA) and lanes 0/1 of v12-v15 (from pB).
//   Updates  : v16-v31 through the OP_rr/OP_ii/OP_ri/OP_ir macros.
//   Preloads : v0-v3 from pA and v8-v11 from pB for the next round.
// NOTE(review): the OP_* macros are defined outside this excerpt; presumably
// they expand to fmla/fmls depending on the conjugation mode - confirm.
// Only the vN.d[i] by-element spelling is kept; the vN.2d[i] duplicates of
// each instruction are dropped.
.macro KERNEL4x4_M2
	OP_rr	v16.2d, v4.2d, v12.d[0]
	OP_ii	v16.2d, v5.2d, v13.d[0]
	OP_ri	v17.2d, v4.2d, v13.d[0]
	OP_ir	v17.2d, v5.2d, v12.d[0]

	ld2	{v8.2d, v9.2d}, [pB]		// For next round
	add	pB, pB, #32

	OP_rr	v18.2d, v6.2d, v12.d[0]
	OP_ii	v18.2d, v7.2d, v13.d[0]
	OP_ri	v19.2d, v6.2d, v13.d[0]
	OP_ir	v19.2d, v7.2d, v12.d[0]

	ld2	{v10.2d, v11.2d}, [pB]		// For next round
	add	pB, pB, #32

	OP_rr	v20.2d, v4.2d, v12.d[1]
	OP_ii	v20.2d, v5.2d, v13.d[1]
	OP_ri	v21.2d, v4.2d, v13.d[1]
	OP_ir	v21.2d, v5.2d, v12.d[1]

	ld2	{v0.2d, v1.2d}, [pA]		// For next round
	add	pA, pA, #32

	OP_rr	v22.2d, v6.2d, v12.d[1]
	OP_ii	v22.2d, v7.2d, v13.d[1]
	OP_ri	v23.2d, v6.2d, v13.d[1]
	OP_ir	v23.2d, v7.2d, v12.d[1]

	ld2	{v2.2d, v3.2d}, [pA]		// For next round
	add	pA, pA, #32

	OP_rr	v24.2d, v4.2d, v14.d[0]
	OP_ii	v24.2d, v5.2d, v15.d[0]
	OP_ri	v25.2d, v4.2d, v15.d[0]
	OP_ir	v25.2d, v5.2d, v14.d[0]

	prfm	PLDL1KEEP, [pA, #512]		// prefetch 512 bytes ahead of pA

	OP_rr	v26.2d, v6.2d, v14.d[0]
	OP_ii	v26.2d, v7.2d, v15.d[0]
	OP_ri	v27.2d, v6.2d, v15.d[0]
	OP_ir	v27.2d, v7.2d, v14.d[0]

	prfm	PLDL1KEEP, [pB, #512]		// prefetch 512 bytes ahead of pB

	OP_rr	v28.2d, v4.2d, v14.d[1]
	OP_ii	v28.2d, v5.2d, v15.d[1]
	OP_ri	v29.2d, v4.2d, v15.d[1]
	OP_ir	v29.2d, v5.2d, v14.d[1]

	OP_rr	v30.2d, v6.2d, v14.d[1]
	OP_ii	v30.2d, v7.2d, v15.d[1]
	OP_ri	v31.2d, v6.2d, v15.d[1]
	OP_ir	v31.2d, v7.2d, v14.d[1]
.endm
|
||||
|
||||
// KERNEL4x4_E: same arithmetic as KERNEL4x4_M2 (v4-v7 with lanes 0/1 of
// v12-v15, results in v16-v31) but without any loads or prefetches, so it
// does not advance pA or pB.
// NOTE(review): the OP_* macros are defined outside this excerpt; presumably
// they expand to fmla/fmls depending on the conjugation mode - confirm.
// Only the vN.d[i] by-element spelling is kept; the vN.2d[i] duplicates of
// each instruction are dropped.
.macro KERNEL4x4_E
	OP_rr	v16.2d, v4.2d, v12.d[0]
	OP_ii	v16.2d, v5.2d, v13.d[0]
	OP_ri	v17.2d, v4.2d, v13.d[0]
	OP_ir	v17.2d, v5.2d, v12.d[0]

	OP_rr	v18.2d, v6.2d, v12.d[0]
	OP_ii	v18.2d, v7.2d, v13.d[0]
	OP_ri	v19.2d, v6.2d, v13.d[0]
	OP_ir	v19.2d, v7.2d, v12.d[0]

	OP_rr	v20.2d, v4.2d, v12.d[1]
	OP_ii	v20.2d, v5.2d, v13.d[1]
	OP_ri	v21.2d, v4.2d, v13.d[1]
	OP_ir	v21.2d, v5.2d, v12.d[1]

	OP_rr	v22.2d, v6.2d, v12.d[1]
	OP_ii	v22.2d, v7.2d, v13.d[1]
	OP_ri	v23.2d, v6.2d, v13.d[1]
	OP_ir	v23.2d, v7.2d, v12.d[1]

	OP_rr	v24.2d, v4.2d, v14.d[0]
	OP_ii	v24.2d, v5.2d, v15.d[0]
	OP_ri	v25.2d, v4.2d, v15.d[0]
	OP_ir	v25.2d, v5.2d, v14.d[0]

	OP_rr	v26.2d, v6.2d, v14.d[0]
	OP_ii	v26.2d, v7.2d, v15.d[0]
	OP_ri	v27.2d, v6.2d, v15.d[0]
	OP_ir	v27.2d, v7.2d, v14.d[0]

	OP_rr	v28.2d, v4.2d, v14.d[1]
	OP_ii	v28.2d, v5.2d, v15.d[1]
	OP_ri	v29.2d, v4.2d, v15.d[1]
	OP_ir	v29.2d, v5.2d, v14.d[1]

	OP_rr	v30.2d, v6.2d, v14.d[1]
	OP_ii	v30.2d, v7.2d, v15.d[1]
	OP_ri	v31.2d, v6.2d, v15.d[1]
	OP_ir	v31.2d, v7.2d, v14.d[1]
.endm
|
||||
|
||||
.macro KERNEL4x4_SUB
|
||||
|
|
@ -451,45 +451,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.2d, v0.2d, v8.2d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.2d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.2d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.2d[0]
|
||||
OP_rr v16.2d, v0.2d, v8.d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
OP_rr v18.2d, v2.2d, v8.2d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.2d[0]
|
||||
OP_ri v19.2d, v2.2d, v9.2d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.2d[0]
|
||||
OP_rr v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
OP_ri v19.2d, v2.2d, v9.d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
OP_rr v20.2d, v0.2d, v8.2d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.2d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.2d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.2d[1]
|
||||
OP_rr v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
OP_rr v22.2d, v2.2d, v8.2d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.2d[1]
|
||||
OP_ri v23.2d, v2.2d, v9.2d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.2d[1]
|
||||
OP_rr v22.2d, v2.2d, v8.d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.d[1]
|
||||
OP_ri v23.2d, v2.2d, v9.d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.d[1]
|
||||
|
||||
OP_rr v24.2d, v0.2d, v10.2d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.2d[0]
|
||||
OP_ri v25.2d, v0.2d, v11.2d[0]
|
||||
OP_ir v25.2d, v1.2d, v10.2d[0]
|
||||
OP_rr v24.2d, v0.2d, v10.d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.d[0]
|
||||
OP_ri v25.2d, v0.2d, v11.d[0]
|
||||
OP_ir v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
OP_rr v26.2d, v2.2d, v10.2d[0]
|
||||
OP_ii v26.2d, v3.2d, v11.2d[0]
|
||||
OP_ri v27.2d, v2.2d, v11.2d[0]
|
||||
OP_ir v27.2d, v3.2d, v10.2d[0]
|
||||
OP_rr v26.2d, v2.2d, v10.d[0]
|
||||
OP_ii v26.2d, v3.2d, v11.d[0]
|
||||
OP_ri v27.2d, v2.2d, v11.d[0]
|
||||
OP_ir v27.2d, v3.2d, v10.d[0]
|
||||
|
||||
OP_rr v28.2d, v0.2d, v10.2d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.2d[1]
|
||||
OP_ri v29.2d, v0.2d, v11.2d[1]
|
||||
OP_ir v29.2d, v1.2d, v10.2d[1]
|
||||
OP_rr v28.2d, v0.2d, v10.d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.d[1]
|
||||
OP_ri v29.2d, v0.2d, v11.d[1]
|
||||
OP_ir v29.2d, v1.2d, v10.d[1]
|
||||
|
||||
OP_rr v30.2d, v2.2d, v10.2d[1]
|
||||
OP_ii v30.2d, v3.2d, v11.2d[1]
|
||||
OP_ri v31.2d, v2.2d, v11.2d[1]
|
||||
OP_ir v31.2d, v3.2d, v10.2d[1]
|
||||
OP_rr v30.2d, v2.2d, v10.d[1]
|
||||
OP_ii v30.2d, v3.2d, v11.d[1]
|
||||
OP_ri v31.2d, v2.2d, v11.d[1]
|
||||
OP_ir v31.2d, v3.2d, v10.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
|
|
@ -577,25 +577,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.2d, v0.2d, v8.2d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.2d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.2d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.2d[0]
|
||||
OP_rr v16.2d, v0.2d, v8.d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
OP_rr v20.2d, v0.2d, v8.2d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.2d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.2d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.2d[1]
|
||||
OP_rr v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
OP_rr v24.2d, v0.2d, v10.2d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.2d[0]
|
||||
OP_ri v25.2d, v0.2d, v11.2d[0]
|
||||
OP_ir v25.2d, v1.2d, v10.2d[0]
|
||||
OP_rr v24.2d, v0.2d, v10.d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.d[0]
|
||||
OP_ri v25.2d, v0.2d, v11.d[0]
|
||||
OP_ir v25.2d, v1.2d, v10.d[0]
|
||||
|
||||
OP_rr v28.2d, v0.2d, v10.2d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.2d[1]
|
||||
OP_ri v29.2d, v0.2d, v11.2d[1]
|
||||
OP_ir v29.2d, v1.2d, v10.2d[1]
|
||||
OP_rr v28.2d, v0.2d, v10.d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.d[1]
|
||||
OP_ri v29.2d, v0.2d, v11.d[1]
|
||||
OP_ir v29.2d, v1.2d, v10.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
|
|
@ -660,25 +660,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.d, v1.d}[0], [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
OP_rr d16, d0, v8.2d[0]
|
||||
OP_ii d16, d1, v9.2d[0]
|
||||
OP_ri d17, d0, v9.2d[0]
|
||||
OP_ir d17, d1, v8.2d[0]
|
||||
OP_rr d16, d0, v8.d[0]
|
||||
OP_ii d16, d1, v9.d[0]
|
||||
OP_ri d17, d0, v9.d[0]
|
||||
OP_ir d17, d1, v8.d[0]
|
||||
|
||||
OP_rr d20, d0, v8.2d[1]
|
||||
OP_ii d20, d1, v9.2d[1]
|
||||
OP_ri d21, d0, v9.2d[1]
|
||||
OP_ir d21, d1, v8.2d[1]
|
||||
OP_rr d20, d0, v8.d[1]
|
||||
OP_ii d20, d1, v9.d[1]
|
||||
OP_ri d21, d0, v9.d[1]
|
||||
OP_ir d21, d1, v8.d[1]
|
||||
|
||||
OP_rr d24, d0, v10.2d[0]
|
||||
OP_ii d24, d1, v11.2d[0]
|
||||
OP_ri d25, d0, v11.2d[0]
|
||||
OP_ir d25, d1, v10.2d[0]
|
||||
OP_rr d24, d0, v10.d[0]
|
||||
OP_ii d24, d1, v11.d[0]
|
||||
OP_ri d25, d0, v11.d[0]
|
||||
OP_ir d25, d1, v10.d[0]
|
||||
|
||||
OP_rr d28, d0, v10.2d[1]
|
||||
OP_ii d28, d1, v11.2d[1]
|
||||
OP_ri d29, d0, v11.2d[1]
|
||||
OP_ir d29, d1, v10.2d[1]
|
||||
OP_rr d28, d0, v10.d[1]
|
||||
OP_ii d28, d1, v11.d[1]
|
||||
OP_ri d29, d0, v11.d[1]
|
||||
OP_ir d29, d1, v10.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x4
|
||||
|
|
@ -743,25 +743,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v2.2d, v3.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.2d, v0.2d, v8.2d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.2d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.2d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.2d[0]
|
||||
OP_rr v16.2d, v0.2d, v8.d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
OP_rr v18.2d, v2.2d, v8.2d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.2d[0]
|
||||
OP_ri v19.2d, v2.2d, v9.2d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.2d[0]
|
||||
OP_rr v18.2d, v2.2d, v8.d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.d[0]
|
||||
OP_ri v19.2d, v2.2d, v9.d[0]
|
||||
OP_ir v19.2d, v3.2d, v8.d[0]
|
||||
|
||||
OP_rr v20.2d, v0.2d, v8.2d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.2d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.2d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.2d[1]
|
||||
OP_rr v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
|
||||
OP_rr v22.2d, v2.2d, v8.2d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.2d[1]
|
||||
OP_ri v23.2d, v2.2d, v9.2d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.2d[1]
|
||||
OP_rr v22.2d, v2.2d, v8.d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.d[1]
|
||||
OP_ri v23.2d, v2.2d, v9.d[1]
|
||||
OP_ir v23.2d, v3.2d, v8.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
|
|
@ -816,15 +816,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.2d, v1.2d}, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
OP_rr v16.2d, v0.2d, v8.2d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.2d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.2d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.2d[0]
|
||||
OP_rr v16.2d, v0.2d, v8.d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.d[0]
|
||||
OP_ri v17.2d, v0.2d, v9.d[0]
|
||||
OP_ir v17.2d, v1.2d, v8.d[0]
|
||||
|
||||
OP_rr v20.2d, v0.2d, v8.2d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.2d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.2d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.2d[1]
|
||||
OP_rr v20.2d, v0.2d, v8.d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.d[1]
|
||||
OP_ri v21.2d, v0.2d, v9.d[1]
|
||||
OP_ir v21.2d, v1.2d, v8.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
|
|
@ -867,15 +867,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v0.d, v1.d}[0], [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
OP_rr d16, d0, v8.2d[0]
|
||||
OP_ii d16, d1, v9.2d[0]
|
||||
OP_ri d17, d0, v9.2d[0]
|
||||
OP_ir d17, d1, v8.2d[0]
|
||||
OP_rr d16, d0, v8.d[0]
|
||||
OP_ii d16, d1, v9.d[0]
|
||||
OP_ri d17, d0, v9.d[0]
|
||||
OP_ir d17, d1, v8.d[0]
|
||||
|
||||
OP_rr d20, d0, v8.2d[1]
|
||||
OP_ii d20, d1, v9.2d[1]
|
||||
OP_ri d21, d0, v9.2d[1]
|
||||
OP_ir d21, d1, v8.2d[1]
|
||||
OP_rr d20, d0, v8.d[1]
|
||||
OP_ii d20, d1, v9.d[1]
|
||||
OP_ri d21, d0, v9.d[1]
|
||||
OP_ir d21, d1, v8.d[1]
|
||||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
|
|
|
|||
|
|
@ -3,14 +3,18 @@
|
|||
#CGEMM_BETA = ../generic/zgemm_beta.c
|
||||
#ZGEMM_BETA = ../generic/zgemm_beta.c
|
||||
|
||||
# TRMM kernel sources, one per precision (S/D/C/Z). Each variable is assigned
# exactly once; the earlier gemm_kernel_power6.S and
# ../generic/ztrmmkernel_2x2.c values were overridden by the assignments
# below and have been removed as dead.
STRMMKERNEL = strmm_kernel_16x8_power8.S
DTRMMKERNEL = dtrmm_kernel_16x4_power8.S
CTRMMKERNEL = ctrmm_kernel_8x4_power8.S
ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S
|
||||
|
||||
# SGEMM kernel plus its copy routines and the object names they build to.
# The earlier gemm_kernel_power6.S / gemm_ncopy_4.c / gemm_tcopy_4.c values
# were overridden by the assignments below and have been removed as dead.
# NOTE(review): the 16/8 in the copy-routine names presumably has to match the
# 16x8 in the kernel name - confirm against the unroll settings before
# changing either.
SGEMMKERNEL = sgemm_kernel_16x8_power8.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
SGEMMOTCOPY = ../generic/gemm_tcopy_8.c
SGEMMINCOPYOBJ = sgemm_incopy.o
SGEMMITCOPYOBJ = sgemm_itcopy.o
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
||||
|
||||
|
|
@ -24,11 +28,15 @@ DGEMMITCOPYOBJ = dgemm_itcopy.o
|
|||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||
|
||||
# CGEMM kernel plus its copy routines and the object names they build to.
# The earlier ../generic/zgemmkernel_2x2.c / zgemm_ncopy_2.c / zgemm_tcopy_2.c
# values were overridden by the assignments below and have been removed as
# dead.
# NOTE(review): the 8/4 in the copy-routine names presumably has to match the
# 8x4 in the kernel name - confirm against the unroll settings before changing
# either.
CGEMMKERNEL = cgemm_kernel_8x4_power8.S
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
CGEMMINCOPYOBJ = cgemm_incopy.o
CGEMMITCOPYOBJ = cgemm_itcopy.o
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
|
|
@ -97,56 +105,56 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
|||
#ISMINKERNEL = ../arm/imin.c
|
||||
#IDMINKERNEL = ../arm/imin.c
|
||||
#
|
||||
# Level-1 and GEMV kernel selection.
# Active assignments name sources without a ../ prefix. Commented-out entries
# are kept only where no active assignment of the same variable exists; they
# record the ../arm/ candidate that is currently not selected.
# NOTE(review): what a commented-out variable falls back to is decided
# outside this fragment - confirm before relying on it.
SASUMKERNEL = sasum.c
DASUMKERNEL = dasum.c
CASUMKERNEL = casum.c
ZASUMKERNEL = zasum.c
#
#SAXPYKERNEL = ../arm/axpy.c
DAXPYKERNEL = daxpy.c
#CAXPYKERNEL = ../arm/zaxpy.c
ZAXPYKERNEL = zaxpy.c
#
SCOPYKERNEL = scopy.c
DCOPYKERNEL = dcopy.c
CCOPYKERNEL = ccopy.c
ZCOPYKERNEL = zcopy.c
#
SDOTKERNEL = sdot.c
DDOTKERNEL = ddot.c
#CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = zdot.c
#
#SNRM2KERNEL = ../arm/nrm2.c
#DNRM2KERNEL = ../arm/nrm2.c
#CNRM2KERNEL = ../arm/znrm2.c
#ZNRM2KERNEL = ../arm/znrm2.c
#
SROTKERNEL = srot.c
DROTKERNEL = drot.c
#CROTKERNEL = ../arm/zrot.c
#ZROTKERNEL = ../arm/zrot.c
#
SSCALKERNEL = sscal.c
DSCALKERNEL = dscal.c
#CSCALKERNEL = ../arm/zscal.c
ZSCALKERNEL = zscal.c
#
SSWAPKERNEL = sswap.c
DSWAPKERNEL = dswap.c
CSWAPKERNEL = cswap.c
ZSWAPKERNEL = zswap.c
#

#SGEMVNKERNEL = ../arm/gemv_n.c
DGEMVNKERNEL = dgemv_n.c
#CGEMVNKERNEL = ../arm/zgemv_n.c
#ZGEMVNKERNEL = ../arm/zgemv_n.c
#
#SGEMVTKERNEL = ../arm/gemv_t.c
#DGEMVTKERNEL = ../arm/gemv_t.c
#CGEMVTKERNEL = ../arm/zgemv_t.c
#ZGEMVTKERNEL = ../arm/zgemv_t.c
#ZGEMVTKERNEL = zgemv_t_4.c
|
||||
|
||||
|
||||
#SSYMV_U_KERNEL = ../generic/symv_k.c
|
||||
|
|
|
|||
|
|
@ -0,0 +1,151 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/28 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
|
||||
#define ABS fabs
|
||||
|
||||
#else
|
||||
|
||||
#define ABS fabsf
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(POWER8)
|
||||
#include "casum_microk_power8.c"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_16
|
||||
|
||||
/*
 * Portable fallback used when no HAVE_KERNEL_16 micro-kernel was included.
 *
 * Walks the buffer at x1 in steps of 8 FLOATs while advancing the element
 * counter by 4 per step (each counted element occupies two FLOATs), adding
 * ABS() of every value into four independent partial sums.
 *
 * n    - element count to process; the loop runs while the counter is < n,
 *        so the caller is expected to pass a multiple of 4 (CNAME passes a
 *        multiple of 16).
 * x1   - first FLOAT to read; 2*n FLOATs are consumed when n is a multiple of 4.
 * svec - four-entry output: svec[0] receives the total, svec[1..3] are zeroed.
 */
static void casum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
{
	FLOAT partial[4] = { 0.0, 0.0, 0.0, 0.0 };
	const FLOAT *src = x1;
	BLASLONG done;
	int lane;

	for (done = 0; done < n; done += 4)
	{
		/* lane k accumulates src[k] first, then src[k+4] */
		for (lane = 0; lane < 4; lane++)
		{
			partial[lane] += ABS(src[lane]);
			partial[lane] += ABS(src[lane + 4]);
		}

		src += 8;
	}

	svec[0] = partial[0] + partial[1] + partial[2] + partial[3];
	svec[1] = 0.0;
	svec[2] = 0.0;
	svec[3] = 0.0;
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * Returns the sum of ABS() over both FLOATs of each of the n elements of x.
 * Every element occupies two consecutive FLOATs (presumably the real and
 * imaginary parts of a complex value); successive elements are 2*inc_x
 * FLOATs apart.
 *
 * n     - number of elements; n <= 0 yields 0.0.
 * x     - input buffer.
 * inc_x - element stride; inc_x <= 0 yields 0.0.
 *
 * For inc_x == 1 the largest multiple of 16 elements is handed to
 * casum_kernel_16() and the remaining elements are summed in a scalar loop.
 * Any other stride uses the scalar loop for everything.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	BLASLONG i=0;
	BLASLONG ip=0;
	FLOAT sumf = 0.0;
	/* the kernel writes four partial results here; kept 16-byte aligned */
	FLOAT svec[4] __attribute__ ((aligned (16)));
	BLASLONG n1;
	BLASLONG inc_x2;

	if (n <= 0 || inc_x <= 0) return(sumf);

	if ( inc_x == 1 )
	{

		/* round n down to a multiple of 16 for the blocked kernel */
		n1 = n & -16;
		if ( n1 > 0 )
		{

			casum_kernel_16(n1, x, svec);
			sumf = svec[0] + svec[1]+svec[2]+svec[3];
			i=n1;
			/* two FLOATs per element already consumed */
			ip = 2 * n1;
		}

		/* scalar tail: elements n1 .. n-1 */
		while(i < n)
		{
			sumf += ABS(x[ip]) + ABS(x[ip+1]);
			ip += 2;
			i++;
		}

	}
	else
	{
		inc_x2 = 2 * inc_x;

		while(i < n)
		{
			sumf += ABS(x[ip]) + ABS(x[ip+1]);
			ip += inc_x2;
			i++;
		}

	}
	return(sumf);
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,177 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/28 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_16 1
|
||||
static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline));

/*
 * VSX micro-kernel: accumulates absolute values of the words read from x.
 *
 * Each pass loads 8 vectors (128 bytes) from x, takes xvabssp of every
 * vector and adds the results into 8 accumulators (vs32-vs39).  The count
 * is decremented by 16 per 128 bytes.  At the end the 8 accumulators are
 * folded into vs32 and that single 16-byte vector is stored at svec, so
 * svec receives four partial sums that the caller still has to add up.
 *
 * The first 128 bytes are loaded before the count is tested, so at least
 * one full batch is always read.
 * NOTE(review): n is presumably always a positive multiple of 16 - confirm
 * against the caller.
 *
 * Operand constraints:
 *   - the counter (%0) and the x pointer (%2) are modified by the template
 *     and are therefore read-write outputs, not inputs;
 *   - every operand that appears in the RA slot of lxvw4x/addi/dcbt uses
 *     "b" so that r0 (which those instructions read as literal 0) can never
 *     be allocated for it;
 *   - all VSX registers written by the template are listed as clobbers.
 * Operand numbering is unchanged, so the template itself is untouched.
 */
static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec)
{

	BLASLONG i = n;
	BLASLONG o16 = 16;
	BLASLONG o32 = 32;
	BLASLONG o48 = 48;
	BLASLONG o64 = 64;
	BLASLONG o80 = 80;
	BLASLONG o96 = 96;
	BLASLONG o112 = 112;
	FLOAT *x1=x;
	BLASLONG pre = 384;

	__asm__ __volatile__
	(

	"dcbt %2 , %4 \n\t"

	"xxlxor 32,32,32 \n\t"
	"xxlxor 33,33,33 \n\t"
	"xxlxor 34,34,34 \n\t"
	"xxlxor 35,35,35 \n\t"
	"xxlxor 36,36,36 \n\t"
	"xxlxor 37,37,37 \n\t"
	"xxlxor 38,38,38 \n\t"
	"xxlxor 39,39,39 \n\t"

	"lxvw4x 40, 0, %2 \n\t"
	"lxvw4x 41, %5, %2 \n\t"
	"lxvw4x 42, %6, %2 \n\t"
	"lxvw4x 43, %7, %2 \n\t"
	"lxvw4x 44, %8, %2 \n\t"
	"lxvw4x 45, %9, %2 \n\t"
	"lxvw4x 46, %10, %2 \n\t"
	"lxvw4x 47, %11, %2 \n\t"

	"addi %2, %2, 128 \n\t"

	"addic. %0 , %0 , -16 \n\t"
	"ble 2f \n\t"

	".align 5 \n\t"
	"1: \n\t"

	"dcbt %2 , %4 \n\t"

	"xvabssp 48, 40 \n\t"
	"xvabssp 49, 41 \n\t"
	"xvabssp 50, 42 \n\t"
	"xvabssp 51, 43 \n\t"

	"lxvw4x 40, 0, %2 \n\t"
	"lxvw4x 41, %5, %2 \n\t"

	"xvabssp 52, 44 \n\t"
	"xvabssp 53, 45 \n\t"

	"lxvw4x 42, %6, %2 \n\t"
	"lxvw4x 43, %7, %2 \n\t"

	"xvabssp 54, 46 \n\t"
	"xvabssp 55, 47 \n\t"

	"lxvw4x 44, %8, %2 \n\t"
	"lxvw4x 45, %9, %2 \n\t"

	"xvaddsp 32, 32, 48 \n\t"
	"xvaddsp 33, 33, 49 \n\t"

	"lxvw4x 46, %10, %2 \n\t"
	"lxvw4x 47, %11, %2 \n\t"

	"xvaddsp 34, 34, 50 \n\t"
	"xvaddsp 35, 35, 51 \n\t"
	"addi %2, %2, 128 \n\t"
	"xvaddsp 36, 36, 52 \n\t"
	"xvaddsp 37, 37, 53 \n\t"
	"addic. %0 , %0 , -16 \n\t"
	"xvaddsp 38, 38, 54 \n\t"
	"xvaddsp 39, 39, 55 \n\t"

	"bgt 1b \n\t"

	"2: \n\t"

	/* drain: process the batch that is still held in vs40-vs47 */
	"xvabssp 48, 40 \n\t"
	"xvabssp 49, 41 \n\t"
	"xvabssp 50, 42 \n\t"
	"xvabssp 51, 43 \n\t"
	"xvabssp 52, 44 \n\t"
	"xvabssp 53, 45 \n\t"
	"xvabssp 54, 46 \n\t"
	"xvabssp 55, 47 \n\t"

	"xvaddsp 32, 32, 48 \n\t"
	"xvaddsp 33, 33, 49 \n\t"
	"xvaddsp 34, 34, 50 \n\t"
	"xvaddsp 35, 35, 51 \n\t"
	"xvaddsp 36, 36, 52 \n\t"
	"xvaddsp 37, 37, 53 \n\t"
	"xvaddsp 38, 38, 54 \n\t"
	"xvaddsp 39, 39, 55 \n\t"

	/* fold the 8 accumulators into vs32 */
	"xvaddsp 32, 32, 33 \n\t"
	"xvaddsp 34, 34, 35 \n\t"
	"xvaddsp 36, 36, 37 \n\t"
	"xvaddsp 38, 38, 39 \n\t"

	"xvaddsp 32, 32, 34 \n\t"
	"xvaddsp 36, 36, 38 \n\t"

	"xvaddsp 32, 32, 36 \n\t"

	"stxvw4x 32, 0, %3 \n\t"

	:
	  "+r" (i),	// 0  loop counter, decremented by addic.
	  "+r" (n),	// 1  not referenced by the template; kept so operand numbers stay stable
	  "+b" (x1)	// 2  advanced by addi, used as RA in dcbt
	:
	  "r" (svec),	// 3
	  "r" (pre),	// 4
	  "b" (o16),	// 5
	  "b" (o32),	// 6
	  "b" (o48),	// 7
	  "b" (o64),	// 8
	  "b" (o80),	// 9
	  "b" (o96),	// 10
	  "b" (o112)	// 11
	: "cr0", "memory",
	  "vs32", "vs33", "vs34", "vs35", "vs36", "vs37", "vs38", "vs39",
	  "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47",
	  "vs48", "vs49", "vs50", "vs51", "vs52", "vs53", "vs54", "vs55"
	);

}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,140 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(POWER8)
|
||||
#include "ccopy_microk_power8.c"
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_32
|
||||
|
||||
/*
 * Portable fallback copy kernel (used when no HAVE_KERNEL_32 micro-kernel
 * was included).  Copies x to y in chunks of 8 FLOAT values; the element
 * counter advances by 4 per chunk, i.e. one counted element is 2 FLOATs.
 * Each chunk is read completely before any of it is written.
 */
static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
	FLOAT chunk[8];
	FLOAT *src = x;
	FLOAT *dst = y;
	BLASLONG done;
	int k;

	for ( done = 0; done < n; done += 4 )
	{
		/* load the whole chunk first ... */
		for ( k = 0; k < 8; k++ )
			chunk[k] = src[k];

		/* ... then store it */
		for ( k = 0; k < 8; k++ )
			dst[k] = chunk[k];

		src += 8;
		dst += 8;
	}
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/*
 * Copy n elements from x to y, where every element occupies two
 * consecutive FLOAT values (indices advance by 2*inc per element).
 *
 *   n      number of elements; n <= 0 copies nothing
 *   x      source vector
 *   inc_x  element stride of x
 *   y      destination vector
 *   inc_y  element stride of y
 *
 * Always returns 0.
 *
 * With unit strides the largest multiple of 32 elements is handed to
 * ccopy_kernel_32 and only the remainder is copied by the scalar loop.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	BLASLONG i=0;
	BLASLONG ix=0,iy=0;

	if ( n <= 0 ) return(0);

	if ( (inc_x == 1) && (inc_y == 1 ))
	{

		/* n rounded down to a multiple of 32 */
		BLASLONG n1 = n & -32;
		if ( n1 > 0 )
		{
			ccopy_kernel_32(n1, x, y);
			i=n1;
			ix=n1*2;
			iy=n1*2;
		}

		/* scalar tail: remaining n - n1 elements */
		while(i < n)
		{
			/* source is indexed with ix (was x[iy]; same value here,
			   but the wrong variable for a read from x) */
			y[iy]   = x[ix] ;
			y[iy+1] = x[ix+1] ;
			ix+=2;
			iy+=2;
			i++ ;

		}

	}
	else
	{

		/* two FLOATs per element, so the index step is twice the stride */
		BLASLONG inc_x2 = 2 * inc_x;
		BLASLONG inc_y2 = 2 * inc_y;

		while(i < n)
		{
			y[iy]   = x[ix] ;
			y[iy+1] = x[ix+1] ;
			ix += inc_x2 ;
			iy += inc_y2 ;
			i++ ;

		}

	}
	return(0);

}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,174 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_32 1
|
||||
|
||||
static void ccopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));

/*
 * VSX micro-kernel: copies x to y in batches of 16 vectors (256 bytes),
 * decrementing the count by 32 per batch.
 *
 * The loop is software-pipelined: the first 256 bytes are loaded before the
 * count is tested, each loop pass stores the batch already held in
 * vs40-vs47 / vs50-vs57 while loading the next one, and the code after
 * label 2 stores the final batch.  At least one full batch is therefore
 * always read and written.
 * NOTE(review): the visible caller passes n & -32 and only when that is
 * > 0, which matches this requirement.
 *
 * Operand constraints:
 *   - the counter (%0) and both pointers (%1, %2) are modified by the
 *     template and are therefore read-write outputs, not inputs;
 *   - every operand that appears in the RA slot of lxvw4x/stxvw4x/addi uses
 *     "b" so that r0 (which those instructions read as literal 0) can never
 *     be allocated for it;
 *   - all VSX registers written by the template are listed as clobbers.
 * Operand numbering is unchanged, so the template itself is untouched.
 */
static void ccopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
{

	BLASLONG i = n;
	BLASLONG o16 = 16;
	BLASLONG o32 = 32;
	BLASLONG o48 = 48;
	BLASLONG o64 = 64;
	BLASLONG o80 = 80;
	BLASLONG o96 = 96;
	BLASLONG o112 = 112;
	FLOAT *x1=x;
	FLOAT *y1=y;
	BLASLONG pre = 384;	/* operand 4, not referenced by the template */
	BLASLONG alpha=0;	/* operand 3, not referenced by the template */

	__asm__ __volatile__
	(

	"lxvw4x 40, 0, %2 \n\t"
	"lxvw4x 41, %5, %2 \n\t"
	"lxvw4x 42, %6, %2 \n\t"
	"lxvw4x 43, %7, %2 \n\t"
	"lxvw4x 44, %8, %2 \n\t"
	"lxvw4x 45, %9, %2 \n\t"
	"lxvw4x 46, %10, %2 \n\t"
	"lxvw4x 47, %11, %2 \n\t"

	"addi %2, %2, 128 \n\t"

	"lxvw4x 50, 0, %2 \n\t"
	"lxvw4x 51, %5, %2 \n\t"
	"lxvw4x 52, %6, %2 \n\t"
	"lxvw4x 53, %7, %2 \n\t"
	"lxvw4x 54, %8, %2 \n\t"
	"lxvw4x 55, %9, %2 \n\t"
	"lxvw4x 56, %10, %2 \n\t"
	"lxvw4x 57, %11, %2 \n\t"

	"addi %2, %2, 128 \n\t"

	"addic. %0 , %0 , -32 \n\t"
	"ble 2f \n\t"

	".align 5 \n\t"
	"1: \n\t"

	"stxvw4x 40, 0, %1 \n\t"
	"stxvw4x 41, %5, %1 \n\t"
	"lxvw4x 40, 0, %2 \n\t"
	"lxvw4x 41, %5, %2 \n\t"
	"stxvw4x 42, %6, %1 \n\t"
	"stxvw4x 43, %7, %1 \n\t"
	"lxvw4x 42, %6, %2 \n\t"
	"lxvw4x 43, %7, %2 \n\t"
	"stxvw4x 44, %8, %1 \n\t"
	"stxvw4x 45, %9, %1 \n\t"
	"lxvw4x 44, %8, %2 \n\t"
	"lxvw4x 45, %9, %2 \n\t"
	"stxvw4x 46, %10, %1 \n\t"
	"stxvw4x 47, %11, %1 \n\t"
	"lxvw4x 46, %10, %2 \n\t"
	"lxvw4x 47, %11, %2 \n\t"

	"addi %1, %1, 128 \n\t"
	"addi %2, %2, 128 \n\t"

	"stxvw4x 50, 0, %1 \n\t"
	"stxvw4x 51, %5, %1 \n\t"
	"lxvw4x 50, 0, %2 \n\t"
	"lxvw4x 51, %5, %2 \n\t"
	"stxvw4x 52, %6, %1 \n\t"
	"stxvw4x 53, %7, %1 \n\t"
	"lxvw4x 52, %6, %2 \n\t"
	"lxvw4x 53, %7, %2 \n\t"
	"stxvw4x 54, %8, %1 \n\t"
	"stxvw4x 55, %9, %1 \n\t"
	"lxvw4x 54, %8, %2 \n\t"
	"lxvw4x 55, %9, %2 \n\t"
	"stxvw4x 56, %10, %1 \n\t"
	"stxvw4x 57, %11, %1 \n\t"
	"lxvw4x 56, %10, %2 \n\t"
	"lxvw4x 57, %11, %2 \n\t"

	"addi %1, %1, 128 \n\t"
	"addi %2, %2, 128 \n\t"

	"addic. %0 , %0 , -32 \n\t"
	"bgt 1b \n\t"

	"2: \n\t"

	/* drain: store the batch that is still held in registers */
	"stxvw4x 40, 0, %1 \n\t"
	"stxvw4x 41, %5, %1 \n\t"
	"stxvw4x 42, %6, %1 \n\t"
	"stxvw4x 43, %7, %1 \n\t"
	"stxvw4x 44, %8, %1 \n\t"
	"stxvw4x 45, %9, %1 \n\t"
	"stxvw4x 46, %10, %1 \n\t"
	"stxvw4x 47, %11, %1 \n\t"

	"addi %1, %1, 128 \n\t"

	"stxvw4x 50, 0, %1 \n\t"
	"stxvw4x 51, %5, %1 \n\t"
	"stxvw4x 52, %6, %1 \n\t"
	"stxvw4x 53, %7, %1 \n\t"
	"stxvw4x 54, %8, %1 \n\t"
	"stxvw4x 55, %9, %1 \n\t"
	"stxvw4x 56, %10, %1 \n\t"
	"stxvw4x 57, %11, %1 \n\t"

	:
	  "+r" (i),	// 0  loop counter, decremented by addic.
	  "+b" (y1),	// 1  destination pointer, advanced by addi
	  "+b" (x1)	// 2  source pointer, advanced by addi
	:
	  "r" (alpha),	// 3
	  "r" (pre),	// 4
	  "b" (o16),	// 5
	  "b" (o32),	// 6
	  "b" (o48),	// 7
	  "b" (o64),	// 8
	  "b" (o80),	// 9
	  "b" (o96),	// 10
	  "b" (o112)	// 11
	: "cr0", "memory",
	  "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47",
	  "vs50", "vs51", "vs52", "vs53", "vs54", "vs55", "vs56", "vs57"
	);

}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,407 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/04/04 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
/* Preamble of the cgemm 8x4 kernel source: pulls in the common headers and
   defines the stack layout plus symbolic names for the registers used by
   the code below. */
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "def_vsx.h"
|
||||
|
||||
/* LOAD: word load on 32-bit builds, doubleword load on 64-bit builds. */
#ifndef __64BIT__
|
||||
#define LOAD lwz
|
||||
#else
|
||||
#define LOAD ld
|
||||
#endif
|
||||
|
||||
/* Stack frame. The prologue subtracts STACKSIZE from SP four times, so the
   frame is 4*STACKSIZE bytes. ALPHA_R_SP / ALPHA_I_SP / FZERO are slots
   addressed relative to the new SP. */
#ifdef __64BIT__
|
||||
#define STACKSIZE 32000
|
||||
#define ALPHA_R_SP 296(SP)
|
||||
#define ALPHA_I_SP 304(SP)
|
||||
#define FZERO 312(SP)
|
||||
#else
|
||||
#define STACKSIZE 256
|
||||
#define ALPHA_R_SP 224(SP)
|
||||
#define ALPHA_I_SP 232(SP)
|
||||
#define FZERO 240(SP)
|
||||
#endif
|
||||
|
||||
/* Incoming arguments that are in r3-r5 on every platform. */
#define M r3
|
||||
#define N r4
|
||||
#define K r5
|
||||
|
||||
/* Platform-dependent registers for A, B, C, LDC and OFFSET. Where LDC or
   OFFSET are not passed in the register named here, the prologue loads them
   from the caller's frame. */
#ifdef linux
|
||||
#ifndef __64BIT__
|
||||
#define A r6
|
||||
#define B r7
|
||||
#define C r8
|
||||
#define LDC r9
|
||||
#define OFFSET r10
|
||||
#else
|
||||
#define A r8
|
||||
#define B r9
|
||||
#define C r10
|
||||
#define LDC r6
|
||||
#define OFFSET r7
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(_AIX) || defined(__APPLE__)
|
||||
#if !defined(__64BIT__) && defined(DOUBLE)
|
||||
#define A r10
|
||||
#define B r6
|
||||
#define C r7
|
||||
#define LDC r8
|
||||
#define OFFSET r9
|
||||
#else
|
||||
#define A r8
|
||||
#define B r9
|
||||
#define C r10
|
||||
#define LDC r6
|
||||
#define OFFSET r7
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* o0: literal 0 used as the index operand of indexed loads/stores. */
#define o0 0
|
||||
|
||||
/* VSX registers that hold alpha: alpha_dr/alpha_di receive the scalar real
   and imaginary parts, alpha_sr/alpha_si the 16-byte vectors built from
   them further down. */
#define alpha_dr vs28
|
||||
#define alpha_di vs29
|
||||
#define alpha_sr vs30
|
||||
#define alpha_si vs31
|
||||
|
||||
/* FRAMEPOINTER keeps the caller's SP so stack-passed arguments can be read
   after SP has been lowered. */
#define FRAMEPOINTER r12
|
||||
|
||||
/* General-purpose register aliases. The oN names are loaded with the
   constant N by the setup code; PRE is loaded with 384. The remaining names
   are presumably used by the included macro/logic files - not visible
   here. */
#define BBUFFER r14
|
||||
#define L r15
|
||||
#define o12 r16
|
||||
#define o4 r17
|
||||
#define T2 r19
|
||||
#define BBO r20
|
||||
#define o8 r21
|
||||
#define I r22
|
||||
#define J r23
|
||||
#define AO r24
|
||||
#define BO r25
|
||||
#define CO r26
|
||||
#define o16 r27
|
||||
#define o32 r28
|
||||
#define o48 r29
|
||||
|
||||
#define PRE r30
|
||||
#define T1 r31
|
||||
|
||||
#ifndef NEEDPARAM
|
||||
|
||||
/* Function entry: set up the frame, save registers, fetch the arguments
   that were passed on the caller's stack. */
PROLOGUE
|
||||
PROFCODE
|
||||
|
||||
/* Remember the caller's SP, then lower SP by 4*STACKSIZE in four steps. */
mr FRAMEPOINTER, SP
|
||||
addi SP, SP, -STACKSIZE
|
||||
addi SP, SP, -STACKSIZE
|
||||
addi SP, SP, -STACKSIZE
|
||||
addi SP, SP, -STACKSIZE
|
||||
li r0, 0
|
||||
|
||||
/* Save f14-f31 at offsets 0..136 of the new frame (restored at L999). */
stfd f14, 0(SP)
|
||||
stfd f15, 8(SP)
|
||||
stfd f16, 16(SP)
|
||||
stfd f17, 24(SP)
|
||||
|
||||
stfd f18, 32(SP)
|
||||
stfd f19, 40(SP)
|
||||
stfd f20, 48(SP)
|
||||
stfd f21, 56(SP)
|
||||
|
||||
stfd f22, 64(SP)
|
||||
stfd f23, 72(SP)
|
||||
stfd f24, 80(SP)
|
||||
stfd f25, 88(SP)
|
||||
|
||||
stfd f26, 96(SP)
|
||||
stfd f27, 104(SP)
|
||||
stfd f28, 112(SP)
|
||||
stfd f29, 120(SP)
|
||||
|
||||
stfd f30, 128(SP)
|
||||
stfd f31, 136(SP)
|
||||
|
||||
/* Save r14-r31 starting at offset 144: 8-byte slots on 64-bit builds,
   4-byte slots otherwise (restored at L999). */
#ifdef __64BIT__
|
||||
std r31, 144(SP)
|
||||
std r30, 152(SP)
|
||||
std r29, 160(SP)
|
||||
std r28, 168(SP)
|
||||
std r27, 176(SP)
|
||||
std r26, 184(SP)
|
||||
std r25, 192(SP)
|
||||
std r24, 200(SP)
|
||||
std r23, 208(SP)
|
||||
std r22, 216(SP)
|
||||
std r21, 224(SP)
|
||||
std r20, 232(SP)
|
||||
std r19, 240(SP)
|
||||
std r18, 248(SP)
|
||||
std r17, 256(SP)
|
||||
std r16, 264(SP)
|
||||
std r15, 272(SP)
|
||||
std r14, 280(SP)
|
||||
#else
|
||||
stw r31, 144(SP)
|
||||
stw r30, 148(SP)
|
||||
stw r29, 152(SP)
|
||||
stw r28, 156(SP)
|
||||
stw r27, 160(SP)
|
||||
stw r26, 164(SP)
|
||||
stw r25, 168(SP)
|
||||
stw r24, 172(SP)
|
||||
stw r23, 176(SP)
|
||||
stw r22, 180(SP)
|
||||
stw r21, 184(SP)
|
||||
stw r20, 188(SP)
|
||||
stw r19, 192(SP)
|
||||
stw r18, 196(SP)
|
||||
stw r17, 200(SP)
|
||||
stw r16, 204(SP)
|
||||
stw r15, 208(SP)
|
||||
stw r14, 212(SP)
|
||||
#endif
|
||||
|
||||
/* Spill f1 and f2 as single-precision values into the alpha slots.
   NOTE(review): f1/f2 presumably carry alpha_r/alpha_i on entry - confirm
   against the calling convention. */
stfs f1, ALPHA_R_SP
|
||||
stfs f2, ALPHA_I_SP
|
||||
// stw r0, FZERO
|
||||
|
||||
/* Arguments not passed in registers are read from the caller's frame via
   FRAMEPOINTER; which ones depends on platform and word size. */
#ifdef linux
|
||||
#ifdef __64BIT__
|
||||
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(_AIX) || defined(__APPLE__)
|
||||
#ifdef __64BIT__
|
||||
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
|
||||
#else
|
||||
#ifdef DOUBLE
|
||||
lwz B, FRAMESLOT(0) + 0(FRAMEPOINTER)
|
||||
lwz C, FRAMESLOT(1) + 0(FRAMEPOINTER)
|
||||
lwz LDC, FRAMESLOT(2) + 0(FRAMEPOINTER)
|
||||
#else
|
||||
lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* TRMM builds only: fetch OFFSET from the caller's frame.
   NOTE(review): KK is not defined in the visible part of this file; it must
   come from elsewhere when TRMMKERNEL is set. */
#ifdef TRMMKERNEL
|
||||
#if defined(linux) && defined(__64BIT__)
|
||||
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
|
||||
#endif
|
||||
|
||||
#if defined(_AIX) || defined(__APPLE__)
|
||||
#ifdef __64BIT__
|
||||
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
|
||||
#else
|
||||
#ifdef DOUBLE
|
||||
lwz OFFSET, FRAMESLOT(3) + 0(FRAMEPOINTER)
|
||||
#else
|
||||
lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
neg KK, OFFSET
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* Kernel macros used by the logic file included at the end of this
   section. */
#include "cgemm_macros_8x4_power8.S"
|
||||
|
||||
/* Leave early when M, N or K is <= 0.
   NOTE(review): L999_H1 is not defined in the visible part of this file;
   presumably it is provided by cgemm_logic_8x4_power8.S.
   NOTE(review): cmpwi is a word compare - confirm this is intended on
   64-bit builds. */
cmpwi cr0, M, 0
|
||||
ble L999_H1
|
||||
cmpwi cr0, N, 0
|
||||
ble L999_H1
|
||||
cmpwi cr0, K, 0
|
||||
ble L999_H1
|
||||
|
||||
/* Scale LDC by 2^ZBASE_SHIFT (presumably elements -> bytes; ZBASE_SHIFT is
   defined elsewhere) and load the constant offset registers. */
slwi LDC, LDC, ZBASE_SHIFT
|
||||
li PRE, 384
|
||||
li o4 , 4
|
||||
li o8 , 8
|
||||
li o12 , 12
|
||||
li o16 , 16
|
||||
li o32 , 32
|
||||
li o48 , 48
|
||||
|
||||
/* BBUFFER = (SP + 512 + 4096) & -4096: a 4096-byte aligned address inside
   the frame allocated by the prologue. */
addi BBUFFER, SP, 512+4096
|
||||
li T1, -4096
|
||||
and BBUFFER, BBUFFER, T1
|
||||
|
||||
|
||||
/* T1 = address of the ALPHA_R_SP slot (296 / 224 match the defines at the
   top of the file); the ALPHA_I_SP slot is 8 bytes above it. */
#ifdef __64BIT__
|
||||
addi T1 , SP, 296
|
||||
#else
|
||||
addi T1 , SP, 224
|
||||
#endif
|
||||
|
||||
/* Store vs1/vs2 as single-precision scalars into the two alpha slots and
   reload them into alpha_dr/alpha_di.
   NOTE(review): vs1/vs2 presumably alias f1/f2 (alpha_r/alpha_i) - confirm. */
stxsspx vs1, 0, T1
|
||||
lxsspx alpha_dr, 0, T1
|
||||
stxsspx vs2, o8 , T1
|
||||
lxsspx alpha_di, o8, T1
|
||||
addi T1, SP, 360
|
||||
li T2, 0
|
||||
|
||||
/* Build alpha_sr: write three zero words at SP+360..371 and alpha_dr as a
   single at offset 12, then load the 16 bytes as one vector. */
stw T2, 0(T1)
|
||||
stw T2, 4(T1)
|
||||
stw T2, 8(T1)
|
||||
stxsspx alpha_dr, o12, T1
|
||||
lxvw4x alpha_sr, o0 , T1
|
||||
addi T1, T1, 16
|
||||
|
||||
/* Same construction for alpha_si in the next 16 bytes, using alpha_di. */
stw T2, 0(T1)
|
||||
stw T2, 4(T1)
|
||||
stw T2, 8(T1)
|
||||
stxsspx alpha_di, o12, T1
|
||||
lxvw4x alpha_si, o0 , T1
|
||||
|
||||
.align 5
|
||||
|
||||
/* Main loop nest; its contents are not visible in this file. */
#include "cgemm_logic_8x4_power8.S"
|
||||
|
||||
/* Common exit: set the return value in r3 to 0, restore every register the
   prologue saved, release the frame and return. */
L999:
|
||||
addi r3, 0, 0
|
||||
|
||||
/* Restore f14-f31 from the same offsets (0..136) used by the prologue. */
lfd f14, 0(SP)
|
||||
lfd f15, 8(SP)
|
||||
lfd f16, 16(SP)
|
||||
lfd f17, 24(SP)
|
||||
|
||||
lfd f18, 32(SP)
|
||||
lfd f19, 40(SP)
|
||||
lfd f20, 48(SP)
|
||||
lfd f21, 56(SP)
|
||||
|
||||
lfd f22, 64(SP)
|
||||
lfd f23, 72(SP)
|
||||
lfd f24, 80(SP)
|
||||
lfd f25, 88(SP)
|
||||
|
||||
lfd f26, 96(SP)
|
||||
lfd f27, 104(SP)
|
||||
lfd f28, 112(SP)
|
||||
lfd f29, 120(SP)
|
||||
|
||||
lfd f30, 128(SP)
|
||||
lfd f31, 136(SP)
|
||||
|
||||
/* Restore r14-r31; slot width matches the prologue (8 bytes on 64-bit
   builds, 4 bytes otherwise). */
#ifdef __64BIT__
|
||||
ld r31, 144(SP)
|
||||
ld r30, 152(SP)
|
||||
ld r29, 160(SP)
|
||||
ld r28, 168(SP)
|
||||
ld r27, 176(SP)
|
||||
ld r26, 184(SP)
|
||||
ld r25, 192(SP)
|
||||
ld r24, 200(SP)
|
||||
ld r23, 208(SP)
|
||||
ld r22, 216(SP)
|
||||
ld r21, 224(SP)
|
||||
ld r20, 232(SP)
|
||||
ld r19, 240(SP)
|
||||
ld r18, 248(SP)
|
||||
ld r17, 256(SP)
|
||||
ld r16, 264(SP)
|
||||
ld r15, 272(SP)
|
||||
ld r14, 280(SP)
|
||||
#else
|
||||
lwz r31, 144(SP)
|
||||
lwz r30, 148(SP)
|
||||
lwz r29, 152(SP)
|
||||
lwz r28, 156(SP)
|
||||
lwz r27, 160(SP)
|
||||
lwz r26, 164(SP)
|
||||
lwz r25, 168(SP)
|
||||
lwz r24, 172(SP)
|
||||
lwz r23, 176(SP)
|
||||
lwz r22, 180(SP)
|
||||
lwz r21, 184(SP)
|
||||
lwz r20, 188(SP)
|
||||
lwz r19, 192(SP)
|
||||
lwz r18, 196(SP)
|
||||
lwz r17, 200(SP)
|
||||
lwz r16, 204(SP)
|
||||
lwz r15, 208(SP)
|
||||
lwz r14, 212(SP)
|
||||
#endif
|
||||
|
||||
/* Undo the four SP decrements made by the prologue. */
addi SP, SP, STACKSIZE
|
||||
addi SP, SP, STACKSIZE
|
||||
addi SP, SP, STACKSIZE
|
||||
addi SP, SP, STACKSIZE
|
||||
|
||||
blr
|
||||
|
||||
EPILOGUE
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,175 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/27 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(POWER8)
|
||||
#include "cswap_microk_power8.c"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_32
|
||||
|
||||
/*
 * Portable fallback swap kernel (used when no HAVE_KERNEL_32 micro-kernel
 * was included).  Exchanges x and y in chunks of 8 FLOAT values; the
 * element counter advances by 4 per chunk, i.e. one counted element is
 * 2 FLOATs.  Both chunks are read completely before either is written,
 * and y is written before x.
 */
static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
	FLOAT from_x[8];
	FLOAT from_y[8];
	FLOAT *xp = x;
	FLOAT *yp = y;
	BLASLONG done;
	int k;

	for ( done = 0; done < n; done += 4 )
	{
		/* read both chunks ... */
		for ( k = 0; k < 8; k++ )
			from_x[k] = xp[k];
		for ( k = 0; k < 8; k++ )
			from_y[k] = yp[k];

		/* ... then write them back crosswise */
		for ( k = 0; k < 8; k++ )
			yp[k] = from_x[k];
		for ( k = 0; k < 8; k++ )
			xp[k] = from_y[k];

		xp += 8;
		yp += 8;
	}
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/*
 * Exchange n elements between x and y, where every element occupies two
 * consecutive FLOAT values (indices advance by 2*inc per element).
 *
 *   n             number of elements; n <= 0 does nothing
 *   x, inc_x      first vector and its element stride
 *   y, inc_y      second vector and its element stride
 *   dummy*        not read by this routine
 *
 * Always returns 0.
 *
 * With unit strides the largest multiple of 32 elements is handed to
 * cswap_kernel_32; whatever is left (or everything, for other strides) is
 * swapped one element at a time.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG done = 0;
	BLASLONG xpos = 0, ypos = 0;
	BLASLONG xstep, ystep;
	FLOAT re, im;

	if ( n <= 0 )
		return(0);

	/* two FLOATs per element, so the index step is twice the stride
	   (equals 2 in the unit-stride case) */
	xstep = 2 * inc_x;
	ystep = 2 * inc_y;

	if ( inc_x == 1 && inc_y == 1 )
	{
		/* n rounded down to a multiple of 32 goes through the kernel */
		BLASLONG bulk = n & -32;

		if ( bulk > 0 )
		{
			cswap_kernel_32(bulk, x, y);
			done = bulk;
			xpos = 2 * bulk;
			ypos = 2 * bulk;
		}
	}

	/* element-wise swap of everything not handled above */
	for ( ; done < n; done++ )
	{
		re = x[xpos];
		im = x[xpos+1];
		x[xpos]   = y[ypos];
		x[xpos+1] = y[ypos+1];
		y[ypos]   = re;
		y[ypos+1] = im;

		xpos += xstep;
		ypos += ystep;
	}

	return(0);
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,180 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/27 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_32 1
|
||||
|
||||
static void cswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
|
||||
|
||||
static void cswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
|
||||
|
||||
BLASLONG i = n;
|
||||
BLASLONG o16 = 16;
|
||||
BLASLONG o32 = 32;
|
||||
BLASLONG o48 = 48;
|
||||
BLASLONG o64 = 64;
|
||||
BLASLONG o80 = 80;
|
||||
BLASLONG o96 = 96;
|
||||
BLASLONG o112 = 112;
|
||||
FLOAT *x1=x;
|
||||
FLOAT *y1=y;
|
||||
FLOAT *x2=x+1;
|
||||
FLOAT *y2=y+1;
|
||||
BLASLONG pre = 384;
|
||||
BLASLONG alpha=0;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
|
||||
"addi %3, %3, -4 \n\t"
|
||||
"addi %4, %4, -4 \n\t"
|
||||
|
||||
".align 5 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"lxvw4x 32, 0, %2 \n\t"
|
||||
"lxvw4x 33, %5, %2 \n\t"
|
||||
"lxvw4x 34, %6, %2 \n\t"
|
||||
"lxvw4x 35, %7, %2 \n\t"
|
||||
"lxvw4x 36, %8, %2 \n\t"
|
||||
"lxvw4x 37, %9, %2 \n\t"
|
||||
"lxvw4x 38, %10, %2 \n\t"
|
||||
"lxvw4x 39, %11, %2 \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
"lxvw4x 40, 0, %2 \n\t"
|
||||
"lxvw4x 41, %5, %2 \n\t"
|
||||
"lxvw4x 42, %6, %2 \n\t"
|
||||
"lxvw4x 43, %7, %2 \n\t"
|
||||
"lxvw4x 44, %8, %2 \n\t"
|
||||
"lxvw4x 45, %9, %2 \n\t"
|
||||
"lxvw4x 46, %10, %2 \n\t"
|
||||
"lxvw4x 47, %11, %2 \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
"lxvw4x 48, 0, %1 \n\t"
|
||||
"lxvw4x 49, %5, %1 \n\t"
|
||||
"lxvw4x 50, %6, %1 \n\t"
|
||||
"lxvw4x 51, %7, %1 \n\t"
|
||||
"lxvw4x 52, %8, %1 \n\t"
|
||||
"lxvw4x 53, %9, %1 \n\t"
|
||||
"lxvw4x 54, %10, %1 \n\t"
|
||||
"lxvw4x 55, %11, %1 \n\t"
|
||||
|
||||
"addi %1, %1, 128 \n\t"
|
||||
|
||||
"lxvw4x 56, 0, %1 \n\t"
|
||||
"lxvw4x 57, %5, %1 \n\t"
|
||||
"lxvw4x 58, %6, %1 \n\t"
|
||||
"lxvw4x 59, %7, %1 \n\t"
|
||||
"lxvw4x 60, %8, %1 \n\t"
|
||||
"lxvw4x 61, %9, %1 \n\t"
|
||||
"lxvw4x 62, %10, %1 \n\t"
|
||||
"lxvw4x 63, %11, %1 \n\t"
|
||||
|
||||
"addi %1, %1, 128 \n\t"
|
||||
|
||||
"stxvw4x 32, 0, %3 \n\t"
|
||||
"stxvw4x 33, %5, %3 \n\t"
|
||||
"stxvw4x 34, %6, %3 \n\t"
|
||||
"stxvw4x 35, %7, %3 \n\t"
|
||||
"stxvw4x 36, %8, %3 \n\t"
|
||||
"stxvw4x 37, %9, %3 \n\t"
|
||||
"stxvw4x 38, %10, %3 \n\t"
|
||||
"stxvw4x 39, %11, %3 \n\t"
|
||||
|
||||
"addi %3, %3, 128 \n\t"
|
||||
|
||||
"stxvw4x 40, 0, %3 \n\t"
|
||||
"stxvw4x 41, %5, %3 \n\t"
|
||||
"stxvw4x 42, %6, %3 \n\t"
|
||||
"stxvw4x 43, %7, %3 \n\t"
|
||||
"stxvw4x 44, %8, %3 \n\t"
|
||||
"stxvw4x 45, %9, %3 \n\t"
|
||||
"stxvw4x 46, %10, %3 \n\t"
|
||||
"stxvw4x 47, %11, %3 \n\t"
|
||||
|
||||
"addi %3, %3, 128 \n\t"
|
||||
|
||||
"stxvw4x 48, 0, %4 \n\t"
|
||||
"stxvw4x 49, %5, %4 \n\t"
|
||||
"stxvw4x 50, %6, %4 \n\t"
|
||||
"stxvw4x 51, %7, %4 \n\t"
|
||||
"stxvw4x 52, %8, %4 \n\t"
|
||||
"stxvw4x 53, %9, %4 \n\t"
|
||||
"stxvw4x 54, %10, %4 \n\t"
|
||||
"stxvw4x 55, %11, %4 \n\t"
|
||||
|
||||
"addi %4, %4, 128 \n\t"
|
||||
|
||||
"stxvw4x 56, 0, %4 \n\t"
|
||||
"stxvw4x 57, %5, %4 \n\t"
|
||||
"stxvw4x 58, %6, %4 \n\t"
|
||||
"stxvw4x 59, %7, %4 \n\t"
|
||||
"stxvw4x 60, %8, %4 \n\t"
|
||||
"stxvw4x 61, %9, %4 \n\t"
|
||||
"stxvw4x 62, %10, %4 \n\t"
|
||||
"stxvw4x 63, %11, %4 \n\t"
|
||||
|
||||
"addi %4, %4, 128 \n\t"
|
||||
|
||||
"addic. %0 , %0 , -32 \n\t"
|
||||
"bgt 1b \n\t"
|
||||
|
||||
"2: \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (y1), // 1
|
||||
"r" (x1), // 2
|
||||
"r" (y2), // 3
|
||||
"r" (x2), // 4
|
||||
"r" (o16), // 5
|
||||
"r" (o32), // 6
|
||||
"r" (o48), // 7
|
||||
"r" (o64), // 8
|
||||
"r" (o80), // 9
|
||||
"r" (o96), // 10
|
||||
"r" (o112) // 11
|
||||
: "cr0", "%0", "%2" , "%1", "%3", "%4", "memory"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,399 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/04/04 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
/* ASSEMBLER is defined before common.h is included (presumably selecting its
   assembler-side definitions); def_vsx.h presumably supplies the vsNN names
   used below -- TODO confirm. */
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "def_vsx.h"
|
||||
|
||||
/* LOAD: word load on 32-bit builds, doubleword load on 64-bit builds. */
#ifndef __64BIT__
|
||||
#define LOAD lwz
|
||||
#else
|
||||
#define LOAD ld
|
||||
#endif
|
||||
|
||||
/* Stack frame size and the SP-relative slots into which the prologue spills
   alpha (real part, imaginary part).  The prologue saves registers at
   offsets 0..296 (64-bit) or 0..216 (32-bit) of this frame. */
#ifdef __64BIT__
|
||||
#define STACKSIZE 400
|
||||
#define ALPHA_R_SP 304(SP)
|
||||
#define ALPHA_I_SP 312(SP)
|
||||
#else
|
||||
#define STACKSIZE 256
|
||||
#define ALPHA_R_SP 224(SP)
|
||||
#define ALPHA_I_SP 232(SP)
|
||||
#define FZERO 240(SP)
|
||||
#endif
|
||||
|
||||
/* Register aliases for the dimensions M, N, K; each is checked against zero
   before any work is done. */
#define M r3
|
||||
#define N r4
|
||||
#define K r5
|
||||
|
||||
/* Linux: registers of the remaining arguments.  In the 64-bit layout LDC
   (and, for TRMMKERNEL, OFFSET) are reloaded from the caller's frame by the
   prologue. */
#ifdef linux
|
||||
#ifndef __64BIT__
|
||||
#define A r6
|
||||
#define B r7
|
||||
#define C r8
|
||||
#define LDC r9
|
||||
#define OFFSET r10
|
||||
#else
|
||||
#define A r8
|
||||
#define B r9
|
||||
#define C r10
|
||||
#define LDC r6
|
||||
#define OFFSET r7
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* AIX / Darwin: same idea with a different register assignment when building
   32-bit with DOUBLE defined. */
#if defined(_AIX) || defined(__APPLE__)
|
||||
#if !defined(__64BIT__) && defined(DOUBLE)
|
||||
#define A r10
|
||||
#define B r6
|
||||
#define C r7
|
||||
#define LDC r8
|
||||
#define OFFSET r9
|
||||
#else
|
||||
#define A r8
|
||||
#define B r9
|
||||
#define C r10
|
||||
#define LDC r6
|
||||
#define OFFSET r7
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* o0: literal 0 used as the index operand of indexed vector loads. */
#define o0 0
|
||||
|
||||
/* VSX registers that hold alpha: alpha_dr/alpha_di are filled by scalar
   loads, alpha_sr/alpha_si by 4-word vector loads. */
#define alpha_dr vs28
|
||||
#define alpha_di vs29
|
||||
#define alpha_sr vs30
|
||||
#define alpha_si vs31
|
||||
|
||||
/* General-purpose register aliases.  o4/o8/o12/o16/o32/o48 are loaded with
   the constants 4..48 and serve as byte offsets; T1/T2 are scratch; PRE is
   loaded with 384.  The remaining names are not used in this file and are
   presumably consumed by the included macro/logic files -- TODO confirm. */
#define o12 r12
|
||||
#define KKK r13
|
||||
#define K1 r14
|
||||
#define L r15
|
||||
#define o16 r16
|
||||
#define NOTUSED r17
|
||||
#define T2 r19
|
||||
#define KK r20
|
||||
#define o8 r21
|
||||
#define I r22
|
||||
#define J r23
|
||||
#define AO r24
|
||||
#define BO r25
|
||||
#define CO r26
|
||||
#define o4 r27
|
||||
#define o32 r28
|
||||
#define o48 r29
|
||||
|
||||
#define PRE r30
|
||||
#define T1 r31
|
||||
|
||||
/* Kernel entry point.  Sets up a stack frame, saves f14-f31 and the GPRs
   listed below, prepares alpha and the constant offset registers, runs the
   included compute logic, then restores everything and returns 0 in r3. */
#ifndef NEEDPARAM
|
||||
|
||||
PROLOGUE
|
||||
PROFCODE
|
||||
|
||||
/* Allocate the frame. */
addi SP, SP, -STACKSIZE
|
||||
/* r0 = 0; its only consumer in this file is the commented-out FZERO store. */
li r0, 0
|
||||
|
||||
/* Save f14-f31 at SP+0 .. SP+136. */
stfd f14, 0(SP)
|
||||
stfd f15, 8(SP)
|
||||
stfd f16, 16(SP)
|
||||
stfd f17, 24(SP)
|
||||
|
||||
stfd f18, 32(SP)
|
||||
stfd f19, 40(SP)
|
||||
stfd f20, 48(SP)
|
||||
stfd f21, 56(SP)
|
||||
|
||||
stfd f22, 64(SP)
|
||||
stfd f23, 72(SP)
|
||||
stfd f24, 80(SP)
|
||||
stfd f25, 88(SP)
|
||||
|
||||
stfd f26, 96(SP)
|
||||
stfd f27, 104(SP)
|
||||
stfd f28, 112(SP)
|
||||
stfd f29, 120(SP)
|
||||
|
||||
stfd f30, 128(SP)
|
||||
stfd f31, 136(SP)
|
||||
|
||||
/* Save general-purpose registers: r31..r12 as doublewords on 64-bit builds,
   r31..r13 as words on 32-bit builds. */
#ifdef __64BIT__
|
||||
std r31, 144(SP)
|
||||
std r30, 152(SP)
|
||||
std r29, 160(SP)
|
||||
std r28, 168(SP)
|
||||
std r27, 176(SP)
|
||||
std r26, 184(SP)
|
||||
std r25, 192(SP)
|
||||
std r24, 200(SP)
|
||||
std r23, 208(SP)
|
||||
std r22, 216(SP)
|
||||
std r21, 224(SP)
|
||||
std r20, 232(SP)
|
||||
std r19, 240(SP)
|
||||
std r18, 248(SP)
|
||||
std r17, 256(SP)
|
||||
std r16, 264(SP)
|
||||
std r15, 272(SP)
|
||||
std r14, 280(SP)
|
||||
std r13, 288(SP)
|
||||
std r12, 296(SP)
|
||||
#else
|
||||
stw r31, 144(SP)
|
||||
stw r30, 148(SP)
|
||||
stw r29, 152(SP)
|
||||
stw r28, 156(SP)
|
||||
stw r27, 160(SP)
|
||||
stw r26, 164(SP)
|
||||
stw r25, 168(SP)
|
||||
stw r24, 172(SP)
|
||||
stw r23, 176(SP)
|
||||
stw r22, 180(SP)
|
||||
stw r21, 184(SP)
|
||||
stw r20, 188(SP)
|
||||
stw r19, 192(SP)
|
||||
stw r18, 196(SP)
|
||||
stw r17, 200(SP)
|
||||
stw r16, 204(SP)
|
||||
stw r15, 208(SP)
|
||||
stw r14, 212(SP)
|
||||
stw r13, 216(SP)
|
||||
#endif
|
||||
|
||||
/* Spill f1 and f2 as single-precision values into the alpha slots
   (real part, imaginary part according to the slot names). */
stfs f1, ALPHA_R_SP
|
||||
stfs f2, ALPHA_I_SP
|
||||
// stw r0, FZERO
|
||||
|
||||
/* Fetch LDC (and, in one layout, B and C) from the caller's frame, which
   sits STACKSIZE bytes above the new SP. */
#ifdef linux
|
||||
#ifdef __64BIT__
|
||||
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(_AIX) || defined(__APPLE__)
|
||||
#ifdef __64BIT__
|
||||
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
#else
|
||||
#ifdef DOUBLE
|
||||
lwz B, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
lwz C, FRAMESLOT(1) + STACKSIZE(SP)
|
||||
lwz LDC, FRAMESLOT(2) + STACKSIZE(SP)
|
||||
#else
|
||||
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* TRMM builds additionally fetch OFFSET from the caller's frame; when LEFT
   is not defined KK starts as -OFFSET. */
#ifdef TRMMKERNEL
|
||||
#if defined(linux) && defined(__64BIT__)
|
||||
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
|
||||
#endif
|
||||
|
||||
#if defined(_AIX) || defined(__APPLE__)
|
||||
#ifdef __64BIT__
|
||||
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
|
||||
#else
|
||||
#ifdef DOUBLE
|
||||
lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP)
|
||||
#else
|
||||
lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
neg KK, OFFSET
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* Macro definitions used by the compute logic included further down. */
#include "ctrmm_macros_8x4_power8.S"
|
||||
|
||||
/* Nothing to do when M, N or K is <= 0.  L999_H1 is not defined in this file
   (presumably it comes from the included logic file -- TODO confirm). */
cmpwi cr0, M, 0
|
||||
ble L999_H1
|
||||
cmpwi cr0, N, 0
|
||||
ble L999_H1
|
||||
cmpwi cr0, K, 0
|
||||
ble L999_H1
|
||||
|
||||
/* Scale LDC by 2^ZBASE_SHIFT (presumably elements -> bytes) and load the
   constant registers: PRE = 384 and the byte offsets 4, 8, 12, 16, 32, 48. */
slwi LDC, LDC, ZBASE_SHIFT
|
||||
li PRE, 384
|
||||
li o4 , 4
|
||||
li o8 , 8
|
||||
li o12 , 12
|
||||
li o16 , 16
|
||||
li o32 , 32
|
||||
li o48 , 48
|
||||
|
||||
|
||||
/* T1 = address of the alpha spill slots (same offsets as ALPHA_R_SP). */
#ifdef __64BIT__
|
||||
addi T1, SP, 304
|
||||
#else
|
||||
addi T1, SP, 224
|
||||
#endif
|
||||
|
||||
/* Reload alpha as scalars: real part at T1+0, imaginary part at T1+8. */
lxsspx alpha_dr, 0, T1
|
||||
lxsspx alpha_di, o8, T1
|
||||
/* Build the vector forms through a 32-byte scratch area at SP+360: each
   16-byte half is filled with three zero words followed by the alpha
   component in the last word, then loaded with lxvw4x.
   NOTE(review): SP+360..SP+391 lies inside the 400-byte 64-bit frame but
   beyond the 256-byte 32-bit frame -- confirm this file is only built for
   64-bit targets. */
addi T1, SP, 360
|
||||
li T2, 0
|
||||
|
||||
stw T2, 0(T1)
|
||||
stw T2, 4(T1)
|
||||
stw T2, 8(T1)
|
||||
stxsspx alpha_dr, o12, T1
|
||||
lxvw4x alpha_sr, o0 , T1
|
||||
addi T1, T1, 16
|
||||
|
||||
stw T2, 0(T1)
|
||||
stw T2, 4(T1)
|
||||
stw T2, 8(T1)
|
||||
stxsspx alpha_di, o12, T1
|
||||
lxvw4x alpha_si, o0 , T1
|
||||
|
||||
.align 5
|
||||
|
||||
/* The actual loop nest lives in the included logic file. */
#include "ctrmm_logic_8x4_power8.S"
|
||||
|
||||
/* Common exit: return value 0, then undo the prologue in the same layout. */
L999:
|
||||
addi r3, 0, 0
|
||||
|
||||
lfd f14, 0(SP)
|
||||
lfd f15, 8(SP)
|
||||
lfd f16, 16(SP)
|
||||
lfd f17, 24(SP)
|
||||
|
||||
lfd f18, 32(SP)
|
||||
lfd f19, 40(SP)
|
||||
lfd f20, 48(SP)
|
||||
lfd f21, 56(SP)
|
||||
|
||||
lfd f22, 64(SP)
|
||||
lfd f23, 72(SP)
|
||||
lfd f24, 80(SP)
|
||||
lfd f25, 88(SP)
|
||||
|
||||
lfd f26, 96(SP)
|
||||
lfd f27, 104(SP)
|
||||
lfd f28, 112(SP)
|
||||
lfd f29, 120(SP)
|
||||
|
||||
lfd f30, 128(SP)
|
||||
lfd f31, 136(SP)
|
||||
|
||||
#ifdef __64BIT__
|
||||
ld r31, 144(SP)
|
||||
ld r30, 152(SP)
|
||||
ld r29, 160(SP)
|
||||
ld r28, 168(SP)
|
||||
ld r27, 176(SP)
|
||||
ld r26, 184(SP)
|
||||
ld r25, 192(SP)
|
||||
ld r24, 200(SP)
|
||||
ld r23, 208(SP)
|
||||
ld r22, 216(SP)
|
||||
ld r21, 224(SP)
|
||||
ld r20, 232(SP)
|
||||
ld r19, 240(SP)
|
||||
ld r18, 248(SP)
|
||||
ld r17, 256(SP)
|
||||
ld r16, 264(SP)
|
||||
ld r15, 272(SP)
|
||||
ld r14, 280(SP)
|
||||
ld r13, 288(SP)
|
||||
ld r12, 296(SP)
|
||||
#else
|
||||
lwz r31, 144(SP)
|
||||
lwz r30, 148(SP)
|
||||
lwz r29, 152(SP)
|
||||
lwz r28, 156(SP)
|
||||
lwz r27, 160(SP)
|
||||
lwz r26, 164(SP)
|
||||
lwz r25, 168(SP)
|
||||
lwz r24, 172(SP)
|
||||
lwz r23, 176(SP)
|
||||
lwz r22, 180(SP)
|
||||
lwz r21, 184(SP)
|
||||
lwz r20, 188(SP)
|
||||
lwz r19, 192(SP)
|
||||
lwz r18, 196(SP)
|
||||
lwz r17, 200(SP)
|
||||
lwz r16, 204(SP)
|
||||
lwz r15, 208(SP)
|
||||
lwz r14, 212(SP)
|
||||
lwz r13, 216(SP)
|
||||
#endif
|
||||
|
||||
/* Release the frame and return. */
addi SP, SP, STACKSIZE
|
||||
|
||||
blr
|
||||
|
||||
EPILOGUE
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,144 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/28 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
|
||||
#define ABS fabs
|
||||
|
||||
#else
|
||||
|
||||
#define ABS fabsf
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(POWER8)
|
||||
#include "dasum_microk_power8.c"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_16
|
||||
|
||||
/*
 * Portable fallback for the absolute-value sum kernel.
 *
 * Walks x1 in groups of eight elements, spreading the absolute values over
 * four running totals (elements k and k+4 of every group go to total k).
 * The loop condition is checked once per group of eight, so n is expected
 * to be a multiple of 8.
 *
 * n    : number of elements to process.
 * x1   : input vector (unit stride).
 * svec : output; svec[0] receives the combined total, svec[1] is set to 0.
 */
static void dasum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
{
	FLOAT acc[4] = { 0.0, 0.0, 0.0, 0.0 };
	FLOAT *grp = x1;
	BLASLONG done;
	BLASLONG lane;

	for ( done = 0; done < n; done += 8, grp += 8 )
	{
		/* first half of the group: elements 0..3 */
		for ( lane = 0; lane < 4; lane++ )
			acc[lane] += ABS(grp[lane]);

		/* second half of the group: elements 4..7 */
		for ( lane = 0; lane < 4; lane++ )
			acc[lane] += ABS(grp[lane + 4]);
	}

	svec[0] = acc[0] + acc[1] + acc[2] + acc[3];
	svec[1] = 0.0;
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * Sum of absolute values of the n elements of x taken with stride inc_x.
 *
 * Returns 0 when n <= 0 or inc_x <= 0.  For unit stride the leading
 * multiple-of-16 part is handed to dasum_kernel_16, whose two partial sums
 * are added together; the remaining elements are accumulated one by one.
 * For any other stride a plain scalar loop is used.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	FLOAT partial[2] __attribute__ ((aligned (16)));
	FLOAT total = 0.0;
	BLASLONG pos = 0;
	BLASLONG bulk;
	BLASLONG span;

	if (n <= 0 || inc_x <= 0) return(total);

	if ( inc_x != 1 )
	{
		/* strided access: visit indices 0, inc_x, 2*inc_x, ... < n*inc_x */
		span = n * inc_x;
		for ( pos = 0; pos < span; pos += inc_x )
			total += ABS(x[pos]);

		return(total);
	}

	/* unit stride: vector kernel on the largest multiple of 16 ... */
	bulk = n & -16;
	if ( bulk > 0 )
	{
		dasum_kernel_16(bulk, x, partial);
		total = partial[0] + partial[1];
		pos = bulk;
	}

	/* ... then the scalar tail */
	for ( ; pos < n; pos++ )
		total += ABS(x[pos]);

	return(total);
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,177 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/28 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_16 1
|
||||
static void dasum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline));
|
||||
|
||||
static void dasum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec)
|
||||
{
|
||||
|
||||
|
||||
BLASLONG i = n;
|
||||
BLASLONG o16 = 16;
|
||||
BLASLONG o32 = 32;
|
||||
BLASLONG o48 = 48;
|
||||
BLASLONG o64 = 64;
|
||||
BLASLONG o80 = 80;
|
||||
BLASLONG o96 = 96;
|
||||
BLASLONG o112 = 112;
|
||||
FLOAT *x1=x;
|
||||
BLASLONG pre = 384;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
|
||||
"dcbt %2 , %4 \n\t"
|
||||
|
||||
"xxlxor 32,32,32 \n\t"
|
||||
"xxlxor 33,33,33 \n\t"
|
||||
"xxlxor 34,34,34 \n\t"
|
||||
"xxlxor 35,35,35 \n\t"
|
||||
"xxlxor 36,36,36 \n\t"
|
||||
"xxlxor 37,37,37 \n\t"
|
||||
"xxlxor 38,38,38 \n\t"
|
||||
"xxlxor 39,39,39 \n\t"
|
||||
|
||||
"lxvd2x 40, 0, %2 \n\t"
|
||||
"lxvd2x 41, %5, %2 \n\t"
|
||||
"lxvd2x 42, %6, %2 \n\t"
|
||||
"lxvd2x 43, %7, %2 \n\t"
|
||||
"lxvd2x 44, %8, %2 \n\t"
|
||||
"lxvd2x 45, %9, %2 \n\t"
|
||||
"lxvd2x 46, %10, %2 \n\t"
|
||||
"lxvd2x 47, %11, %2 \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
"addic. %0 , %0 , -16 \n\t"
|
||||
"ble 2f \n\t"
|
||||
|
||||
".align 5 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"dcbt %2 , %4 \n\t"
|
||||
|
||||
"xvabsdp 48, 40 \n\t"
|
||||
"xvabsdp 49, 41 \n\t"
|
||||
"xvabsdp 50, 42 \n\t"
|
||||
"xvabsdp 51, 43 \n\t"
|
||||
|
||||
"lxvd2x 40, 0, %2 \n\t"
|
||||
"lxvd2x 41, %5, %2 \n\t"
|
||||
|
||||
"xvabsdp 52, 44 \n\t"
|
||||
"xvabsdp 53, 45 \n\t"
|
||||
|
||||
"lxvd2x 42, %6, %2 \n\t"
|
||||
"lxvd2x 43, %7, %2 \n\t"
|
||||
|
||||
"xvabsdp 54, 46 \n\t"
|
||||
"xvabsdp 55, 47 \n\t"
|
||||
|
||||
"lxvd2x 44, %8, %2 \n\t"
|
||||
"lxvd2x 45, %9, %2 \n\t"
|
||||
|
||||
"xvadddp 32, 32, 48 \n\t"
|
||||
"xvadddp 33, 33, 49 \n\t"
|
||||
|
||||
"lxvd2x 46, %10, %2 \n\t"
|
||||
"lxvd2x 47, %11, %2 \n\t"
|
||||
|
||||
"xvadddp 34, 34, 50 \n\t"
|
||||
"xvadddp 35, 35, 51 \n\t"
|
||||
"addi %2, %2, 128 \n\t"
|
||||
"xvadddp 36, 36, 52 \n\t"
|
||||
"xvadddp 37, 37, 53 \n\t"
|
||||
"addic. %0 , %0 , -16 \n\t"
|
||||
"xvadddp 38, 38, 54 \n\t"
|
||||
"xvadddp 39, 39, 55 \n\t"
|
||||
|
||||
"bgt 1b \n\t"
|
||||
|
||||
"2: \n\t"
|
||||
|
||||
|
||||
"xvabsdp 48, 40 \n\t"
|
||||
"xvabsdp 49, 41 \n\t"
|
||||
"xvabsdp 50, 42 \n\t"
|
||||
"xvabsdp 51, 43 \n\t"
|
||||
"xvabsdp 52, 44 \n\t"
|
||||
"xvabsdp 53, 45 \n\t"
|
||||
"xvabsdp 54, 46 \n\t"
|
||||
"xvabsdp 55, 47 \n\t"
|
||||
|
||||
"xvadddp 32, 32, 48 \n\t"
|
||||
"xvadddp 33, 33, 49 \n\t"
|
||||
"xvadddp 34, 34, 50 \n\t"
|
||||
"xvadddp 35, 35, 51 \n\t"
|
||||
"xvadddp 36, 36, 52 \n\t"
|
||||
"xvadddp 37, 37, 53 \n\t"
|
||||
"xvadddp 38, 38, 54 \n\t"
|
||||
"xvadddp 39, 39, 55 \n\t"
|
||||
|
||||
"xvadddp 32, 32, 33 \n\t"
|
||||
"xvadddp 34, 34, 35 \n\t"
|
||||
"xvadddp 36, 36, 37 \n\t"
|
||||
"xvadddp 38, 38, 39 \n\t"
|
||||
|
||||
"xvadddp 32, 32, 34 \n\t"
|
||||
"xvadddp 36, 36, 38 \n\t"
|
||||
|
||||
"xvadddp 32, 32, 36 \n\t"
|
||||
|
||||
|
||||
"stxvd2x 32, 0, %3 \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x1), // 2
|
||||
"r" (svec), // 3
|
||||
"r" (pre), // 4
|
||||
"r" (o16), // 5
|
||||
"r" (o32), // 6
|
||||
"r" (o48), // 7
|
||||
"r" (o64), // 8
|
||||
"r" (o80), // 9
|
||||
"r" (o96), // 10
|
||||
"r" (o112) // 11
|
||||
: "cr0", "%0", "%2", "memory"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,136 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/22 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(POWER8)
|
||||
#include "daxpy_microk_power8.c"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_8
|
||||
|
||||
/*
 * Portable fallback for the axpy kernel: y[k] += (*alpha) * x[k].
 *
 * Elements are handled in groups of eight and the loop condition is tested
 * once per group, so n is expected to be a multiple of 8.
 *
 * n     : number of elements to process.
 * x, y  : unit-stride input and in/out vectors.
 * alpha : pointer to the scale factor; only alpha[0] is read here.
 */
static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
	const FLOAT scale = *alpha;
	BLASLONG base;
	BLASLONG lane;

	for ( base = 0; base < n; base += 8 )
	{
		/* one full group of eight, in ascending element order */
		for ( lane = 0; lane < 8; lane++ )
			y[base + lane] += scale * x[base + lane];
	}
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * axpy driver: y := y + da * x for n elements with strides inc_x / inc_y.
 *
 * Unit strides: the leading multiple-of-16 part goes through daxpy_kernel_8
 * (which receives da replicated in a 4-element array), the tail is done with
 * a scalar loop.  Any other stride combination uses a scalar loop unrolled by
 * four, followed by a one-at-a-time tail.
 *
 * dummy0, dummy1, dummy and dummy2 are not used.  Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	FLOAT alpha_vec[4] = { da, da, da, da };
	BLASLONG done;
	BLASLONG xoff = 0;
	BLASLONG yoff = 0;
	BLASLONG unrolled;

	if ( n <= 0 ) return(0);

	if ( inc_x == 1 && inc_y == 1 )
	{
		BLASLONG bulk = n & -16;

		if ( bulk != 0 )
			daxpy_kernel_8(bulk, x, y, alpha_vec);

		/* scalar tail after the vectorised part */
		for ( done = bulk; done < n; done++ )
			y[done] += da * x[done];

		return(0);
	}

	/* general strides: four elements per trip, all products formed
	   before any element of y is updated */
	unrolled = n & -4;

	for ( done = 0; done < unrolled; done += 4 )
	{
		FLOAT p0 = da * x[xoff];
		FLOAT p1 = da * x[xoff + inc_x];
		FLOAT p2 = da * x[xoff + 2*inc_x];
		FLOAT p3 = da * x[xoff + 3*inc_x];

		y[yoff]           += p0;
		y[yoff + inc_y]   += p1;
		y[yoff + 2*inc_y] += p2;
		y[yoff + 3*inc_y] += p3;

		xoff += inc_x*4;
		yoff += inc_y*4;
	}

	/* leftover elements */
	for ( ; done < n; done++ )
	{
		y[yoff] += da * x[xoff];
		xoff += inc_x;
		yoff += inc_y;
	}

	return(0);
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,201 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/22 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
|
||||
#define HAVE_KERNEL_8 1
|
||||
static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
|
||||
|
||||
static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
|
||||
|
||||
BLASLONG i = n;
|
||||
BLASLONG o16 = 16;
|
||||
BLASLONG o32 = 32;
|
||||
BLASLONG o48 = 48;
|
||||
FLOAT *x1=x;
|
||||
FLOAT *y1=y;
|
||||
FLOAT *y2=y+1;
|
||||
BLASLONG pre = 384;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
|
||||
"lxsdx 33, %5, %4 \n\t"
|
||||
"xxspltd 32, 33, 0 \n\t"
|
||||
"addi %8, %8, -8 \n\t"
|
||||
|
||||
"dcbt %2, %9 \n\t"
|
||||
"dcbt %3, %9 \n\t"
|
||||
|
||||
"lxvd2x 40, 0, %2 \n\t"
|
||||
"lxvd2x 41, %5, %2 \n\t"
|
||||
"lxvd2x 42, %6, %2 \n\t"
|
||||
"lxvd2x 43, %7, %2 \n\t"
|
||||
|
||||
"lxvd2x 48, 0, %3 \n\t"
|
||||
"lxvd2x 49, %5, %3 \n\t"
|
||||
"lxvd2x 50, %6, %3 \n\t"
|
||||
"lxvd2x 51, %7, %3 \n\t"
|
||||
|
||||
"addi %2, %2, 64 \n\t"
|
||||
"addi %3, %3, 64 \n\t"
|
||||
|
||||
"lxvd2x 44, 0, %2 \n\t"
|
||||
"lxvd2x 45, %5, %2 \n\t"
|
||||
"lxvd2x 46, %6, %2 \n\t"
|
||||
"lxvd2x 47, %7, %2 \n\t"
|
||||
|
||||
"lxvd2x 52, 0, %3 \n\t"
|
||||
"lxvd2x 53, %5, %3 \n\t"
|
||||
"lxvd2x 54, %6, %3 \n\t"
|
||||
"lxvd2x 55, %7, %3 \n\t"
|
||||
|
||||
"addi %2, %2, 64 \n\t"
|
||||
"addi %3, %3, 64 \n\t"
|
||||
|
||||
"addic. %0 , %0 , -16 \n\t"
|
||||
"ble 2f \n\t"
|
||||
|
||||
".align 5 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"dcbt %2, %9 \n\t"
|
||||
"dcbt %3, %9 \n\t"
|
||||
|
||||
"xvmaddadp 48, 40, 32 \n\t"
|
||||
"xvmaddadp 49, 41, 32 \n\t"
|
||||
|
||||
"lxvd2x 40, 0, %2 \n\t"
|
||||
"lxvd2x 41, %5, %2 \n\t"
|
||||
|
||||
"stxvd2x 48, 0, %8 \n\t"
|
||||
"stxvd2x 49, %5, %8 \n\t"
|
||||
|
||||
"xvmaddadp 50, 42, 32 \n\t"
|
||||
"xvmaddadp 51, 43, 32 \n\t"
|
||||
|
||||
"lxvd2x 42, %6, %2 \n\t"
|
||||
"lxvd2x 43, %7, %2 \n\t"
|
||||
|
||||
"stxvd2x 50, %6, %8 \n\t"
|
||||
"stxvd2x 51, %7, %8 \n\t"
|
||||
|
||||
"lxvd2x 48, 0, %3 \n\t"
|
||||
"lxvd2x 49, %5, %3 \n\t"
|
||||
"lxvd2x 50, %6, %3 \n\t"
|
||||
"lxvd2x 51, %7, %3 \n\t"
|
||||
|
||||
"addi %2, %2, 64 \n\t"
|
||||
"addi %8, %8, 64 \n\t"
|
||||
|
||||
"xvmaddadp 52, 44, 32 \n\t"
|
||||
"addi %3, %3, 64 \n\t"
|
||||
"xvmaddadp 53, 45, 32 \n\t"
|
||||
|
||||
"lxvd2x 44, 0, %2 \n\t"
|
||||
"lxvd2x 45, %5, %2 \n\t"
|
||||
|
||||
"stxvd2x 52, 0, %8 \n\t"
|
||||
"stxvd2x 53, %5, %8 \n\t"
|
||||
|
||||
"xvmaddadp 54, 46, 32 \n\t"
|
||||
"xvmaddadp 55, 47, 32 \n\t"
|
||||
|
||||
"lxvd2x 46, %6, %2 \n\t"
|
||||
"lxvd2x 47, %7, %2 \n\t"
|
||||
|
||||
"stxvd2x 54, %6, %8 \n\t"
|
||||
"stxvd2x 55, %7, %8 \n\t"
|
||||
|
||||
"addi %2, %2, 64 \n\t"
|
||||
"addi %8, %8, 64 \n\t"
|
||||
|
||||
"lxvd2x 52, 0, %3 \n\t"
|
||||
"lxvd2x 53, %5, %3 \n\t"
|
||||
"lxvd2x 54, %6, %3 \n\t"
|
||||
"lxvd2x 55, %7, %3 \n\t"
|
||||
|
||||
"addi %3, %3, 64 \n\t"
|
||||
|
||||
|
||||
"addic. %0 , %0 , -16 \n\t"
|
||||
"bgt 1b \n\t"
|
||||
|
||||
"2: \n\t"
|
||||
|
||||
|
||||
"xvmaddadp 48, 40, 32 \n\t"
|
||||
"xvmaddadp 49, 41, 32 \n\t"
|
||||
"xvmaddadp 50, 42, 32 \n\t"
|
||||
"xvmaddadp 51, 43, 32 \n\t"
|
||||
|
||||
"xvmaddadp 52, 44, 32 \n\t"
|
||||
"xvmaddadp 53, 45, 32 \n\t"
|
||||
"xvmaddadp 54, 46, 32 \n\t"
|
||||
"xvmaddadp 55, 47, 32 \n\t"
|
||||
|
||||
"stxvd2x 48, 0, %8 \n\t"
|
||||
"stxvd2x 49, %5, %8 \n\t"
|
||||
"stxvd2x 50, %6, %8 \n\t"
|
||||
"stxvd2x 51, %7, %8 \n\t"
|
||||
|
||||
"addi %8, %8, 64 \n\t"
|
||||
|
||||
"stxvd2x 52, 0, %8 \n\t"
|
||||
"stxvd2x 53, %5, %8 \n\t"
|
||||
"stxvd2x 54, %6, %8 \n\t"
|
||||
"stxvd2x 55, %7, %8 \n\t"
|
||||
|
||||
"addi %8, %8, 64 \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x1), // 2
|
||||
"r" (y1), // 3
|
||||
"r" (alpha), // 4
|
||||
"r" (o16), // 5
|
||||
"r" (o32), // 6
|
||||
"r" (o48), // 7
|
||||
"r" (y2), // 8
|
||||
"r" (pre) // 9
|
||||
: "cr0", "%0", "%2" , "%3", "%8", "memory"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,131 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(POWER8)
|
||||
#include "dcopy_microk_power8.c"
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_32
|
||||
|
||||
/*
 * Portable fallback copy kernel: copies n contiguous elements from x to y
 * in groups of eight.  Each group is read completely into temporaries
 * before any element of the group is written.  The loop always moves a
 * whole group, so n is expected to be a multiple of 8 (the driver in this
 * file passes a multiple of 32).
 */
static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
	FLOAT *src = x;
	FLOAT *dst = y;
	BLASLONG done;

	for ( done = 0; done < n; done += 8 )
	{
		FLOAT group[8];
		int k;

		/* load the whole group first ... */
		for ( k = 0; k < 8; k++ )
			group[k] = src[k];

		/* ... then store it */
		for ( k = 0; k < 8; k++ )
			dst[k] = group[k];

		src += 8;
		dst += 8;
	}
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/*
 * Copy driver: y <- x for n elements with strides inc_x / inc_y.
 *
 * When both strides are 1 the largest leading multiple of 32 elements is
 * handed to dcopy_kernel_32 and the remainder is copied element by
 * element.  Any other stride combination uses a plain scalar loop whose
 * indices start at 0 and advance by inc_x / inc_y.
 *
 * Returns 0 in every case; n <= 0 copies nothing.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	BLASLONG k;
	BLASLONG bulk;
	BLASLONG sx = 0;
	BLASLONG sy = 0;

	if ( n <= 0 )
		return 0;

	if ( inc_x != 1 || inc_y != 1 )
	{
		/* general strided copy */
		for ( k = 0; k < n; k++ )
		{
			y[sy] = x[sx];
			sx += inc_x;
			sy += inc_y;
		}
		return 0;
	}

	/* contiguous case: kernel for the bulk, scalar loop for the tail */
	bulk = n & -32;
	if ( bulk > 0 )
		dcopy_kernel_32(bulk, x, y);

	for ( k = bulk; k < n; k++ )
		y[k] = x[k];

	return 0;
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,174 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_32 1
|
||||
|
||||
static void dcopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
|
||||
|
||||
static void dcopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
|
||||
|
||||
BLASLONG i = n;
|
||||
BLASLONG o16 = 16;
|
||||
BLASLONG o32 = 32;
|
||||
BLASLONG o48 = 48;
|
||||
BLASLONG o64 = 64;
|
||||
BLASLONG o80 = 80;
|
||||
BLASLONG o96 = 96;
|
||||
BLASLONG o112 = 112;
|
||||
FLOAT *x1=x;
|
||||
FLOAT *y1=y;
|
||||
BLASLONG pre = 384;
|
||||
BLASLONG alpha=0;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
|
||||
"lxvd2x 40, 0, %2 \n\t"
|
||||
"lxvd2x 41, %5, %2 \n\t"
|
||||
"lxvd2x 42, %6, %2 \n\t"
|
||||
"lxvd2x 43, %7, %2 \n\t"
|
||||
"lxvd2x 44, %8, %2 \n\t"
|
||||
"lxvd2x 45, %9, %2 \n\t"
|
||||
"lxvd2x 46, %10, %2 \n\t"
|
||||
"lxvd2x 47, %11, %2 \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
"lxvd2x 50, 0, %2 \n\t"
|
||||
"lxvd2x 51, %5, %2 \n\t"
|
||||
"lxvd2x 52, %6, %2 \n\t"
|
||||
"lxvd2x 53, %7, %2 \n\t"
|
||||
"lxvd2x 54, %8, %2 \n\t"
|
||||
"lxvd2x 55, %9, %2 \n\t"
|
||||
"lxvd2x 56, %10, %2 \n\t"
|
||||
"lxvd2x 57, %11, %2 \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
"addic. %0 , %0 , -32 \n\t"
|
||||
"ble 2f \n\t"
|
||||
|
||||
".align 5 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"stxvd2x 40, 0, %1 \n\t"
|
||||
"stxvd2x 41, %5, %1 \n\t"
|
||||
"lxvd2x 40, 0, %2 \n\t"
|
||||
"lxvd2x 41, %5, %2 \n\t"
|
||||
"stxvd2x 42, %6, %1 \n\t"
|
||||
"stxvd2x 43, %7, %1 \n\t"
|
||||
"lxvd2x 42, %6, %2 \n\t"
|
||||
"lxvd2x 43, %7, %2 \n\t"
|
||||
"stxvd2x 44, %8, %1 \n\t"
|
||||
"stxvd2x 45, %9, %1 \n\t"
|
||||
"lxvd2x 44, %8, %2 \n\t"
|
||||
"lxvd2x 45, %9, %2 \n\t"
|
||||
"stxvd2x 46, %10, %1 \n\t"
|
||||
"stxvd2x 47, %11, %1 \n\t"
|
||||
"lxvd2x 46, %10, %2 \n\t"
|
||||
"lxvd2x 47, %11, %2 \n\t"
|
||||
|
||||
|
||||
"addi %1, %1, 128 \n\t"
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
"stxvd2x 50, 0, %1 \n\t"
|
||||
"stxvd2x 51, %5, %1 \n\t"
|
||||
"lxvd2x 50, 0, %2 \n\t"
|
||||
"lxvd2x 51, %5, %2 \n\t"
|
||||
"stxvd2x 52, %6, %1 \n\t"
|
||||
"stxvd2x 53, %7, %1 \n\t"
|
||||
"lxvd2x 52, %6, %2 \n\t"
|
||||
"lxvd2x 53, %7, %2 \n\t"
|
||||
"stxvd2x 54, %8, %1 \n\t"
|
||||
"stxvd2x 55, %9, %1 \n\t"
|
||||
"lxvd2x 54, %8, %2 \n\t"
|
||||
"lxvd2x 55, %9, %2 \n\t"
|
||||
"stxvd2x 56, %10, %1 \n\t"
|
||||
"stxvd2x 57, %11, %1 \n\t"
|
||||
"lxvd2x 56, %10, %2 \n\t"
|
||||
"lxvd2x 57, %11, %2 \n\t"
|
||||
|
||||
"addi %1, %1, 128 \n\t"
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
"addic. %0 , %0 , -32 \n\t"
|
||||
"bgt 1b \n\t"
|
||||
|
||||
"2: \n\t"
|
||||
|
||||
"stxvd2x 40, 0, %1 \n\t"
|
||||
"stxvd2x 41, %5, %1 \n\t"
|
||||
"stxvd2x 42, %6, %1 \n\t"
|
||||
"stxvd2x 43, %7, %1 \n\t"
|
||||
"stxvd2x 44, %8, %1 \n\t"
|
||||
"stxvd2x 45, %9, %1 \n\t"
|
||||
"stxvd2x 46, %10, %1 \n\t"
|
||||
"stxvd2x 47, %11, %1 \n\t"
|
||||
|
||||
"addi %1, %1, 128 \n\t"
|
||||
|
||||
"stxvd2x 50, 0, %1 \n\t"
|
||||
"stxvd2x 51, %5, %1 \n\t"
|
||||
"stxvd2x 52, %6, %1 \n\t"
|
||||
"stxvd2x 53, %7, %1 \n\t"
|
||||
"stxvd2x 54, %8, %1 \n\t"
|
||||
"stxvd2x 55, %9, %1 \n\t"
|
||||
"stxvd2x 56, %10, %1 \n\t"
|
||||
"stxvd2x 57, %11, %1 \n\t"
|
||||
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (y1), // 1
|
||||
"r" (x1), // 2
|
||||
"r" (alpha), // 3
|
||||
"r" (pre), // 4
|
||||
"r" (o16), // 5
|
||||
"r" (o32), // 6
|
||||
"r" (o48), // 7
|
||||
"r" (o64), // 8
|
||||
"r" (o80), // 9
|
||||
"r" (o96), // 10
|
||||
"r" (o112) // 11
|
||||
: "cr0", "%0", "%2" , "%1", "memory"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,139 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/20 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(POWER8)
|
||||
#include "ddot_microk_power8.c"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_8
|
||||
|
||||
/*
 * Portable fallback dot-product kernel.
 *
 * Accumulates sum(y[k] * x[k]) over k = 0 .. n-1 and ADDS the result to *d.
 * Elements are consumed eight at a time, so n is expected to be a multiple
 * of 8 (the driver in this file passes a multiple of 16).
 */
static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
{
	BLASLONG base;
	FLOAT acc = 0.0;

	for ( base = 0; base < n; base += 8 )
	{
		const FLOAT *xs = x + base;
		const FLOAT *ys = y + base;

		/* same left-to-right summation of the eight products as before */
		acc += ys[0] * xs[0]
		     + ys[1] * xs[1]
		     + ys[2] * xs[2]
		     + ys[3] * xs[3]
		     + ys[4] * xs[4]
		     + ys[5] * xs[5]
		     + ys[6] * xs[6]
		     + ys[7] * xs[7] ;
	}

	*d += acc;
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * Dot-product driver: returns sum(y[k*inc_y] * x[k*inc_x]) for k = 0 .. n-1.
 *
 * Unit strides: the largest leading multiple of 16 elements goes through
 * ddot_kernel_8, the remainder is accumulated with a scalar loop.
 * Other strides: a four-way unrolled loop keeps two partial sums
 * (products 0 and 2 in one, 1 and 3 in the other); leftover elements are
 * added to the first partial sum and the two sums are combined at the end.
 *
 * Returns 0.0 when n <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	BLASLONG k;
	FLOAT dot = 0.0 ;

	if ( n <= 0 )
		return dot;

	if ( inc_x == 1 && inc_y == 1 )
	{
		const BLASLONG bulk = n & -16;

		if ( bulk )
			ddot_kernel_8(bulk, x, y, &dot);

		for ( k = bulk; k < n; k++ )
			dot += y[k] * x[k] ;

		return dot;
	}

	{
		FLOAT sum_a = 0.0;	/* products 0 and 2 of each group, plus the tail */
		FLOAT sum_b = 0.0;	/* products 1 and 3 of each group */
		BLASLONG px = 0;
		BLASLONG py = 0;
		const BLASLONG quads = n & -4;

		for ( k = 0; k < quads; k += 4 )
		{
			FLOAT p0 = y[py]         * x[px] ;
			FLOAT p1 = y[py+inc_y]   * x[px+inc_x] ;
			FLOAT p2 = y[py+2*inc_y] * x[px+2*inc_x] ;
			FLOAT p3 = y[py+3*inc_y] * x[px+3*inc_x] ;

			px += inc_x*4 ;
			py += inc_y*4 ;

			sum_a += p0+p2;
			sum_b += p1+p3;
		}

		for ( ; k < n; k++ )
		{
			sum_a += y[py] * x[px] ;
			px += inc_x ;
			py += inc_y ;
		}

		dot = sum_a + sum_b;
	}

	return dot;
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,178 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/20 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_8 1
|
||||
static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
|
||||
|
||||
static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||
{
|
||||
|
||||
|
||||
BLASLONG i = n;
|
||||
BLASLONG o16 = 16;
|
||||
BLASLONG o32 = 32;
|
||||
BLASLONG o48 = 48;
|
||||
BLASLONG o64 = 64;
|
||||
BLASLONG o80 = 80;
|
||||
BLASLONG o96 = 96;
|
||||
BLASLONG o112 = 112;
|
||||
FLOAT *x1=x;
|
||||
FLOAT *y1=y;
|
||||
BLASLONG pre = 384;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"xxlxor 32,32,32 \n\t"
|
||||
"xxlxor 33,33,33 \n\t"
|
||||
"xxlxor 34,34,34 \n\t"
|
||||
"xxlxor 35,35,35 \n\t"
|
||||
"xxlxor 36,36,36 \n\t"
|
||||
"xxlxor 37,37,37 \n\t"
|
||||
"xxlxor 38,38,38 \n\t"
|
||||
"xxlxor 39,39,39 \n\t"
|
||||
|
||||
"dcbt %2, %12 \n\t"
|
||||
"dcbt %3, %12 \n\t"
|
||||
|
||||
"lxvd2x 40, 0, %2 \n\t"
|
||||
"lxvd2x 48, 0, %3 \n\t"
|
||||
"lxvd2x 41, %5, %2 \n\t"
|
||||
"lxvd2x 49, %5, %3 \n\t"
|
||||
"lxvd2x 42, %6, %2 \n\t"
|
||||
"lxvd2x 50, %6, %3 \n\t"
|
||||
"lxvd2x 43, %7, %2 \n\t"
|
||||
"lxvd2x 51, %7, %3 \n\t"
|
||||
"lxvd2x 44, %8, %2 \n\t"
|
||||
"lxvd2x 52, %8, %3 \n\t"
|
||||
"lxvd2x 45, %9, %2 \n\t"
|
||||
"lxvd2x 53, %9, %3 \n\t"
|
||||
"lxvd2x 46, %10, %2 \n\t"
|
||||
"lxvd2x 54, %10, %3 \n\t"
|
||||
"lxvd2x 47, %11, %2 \n\t"
|
||||
"lxvd2x 55, %11, %3 \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
"addi %3, %3, 128 \n\t"
|
||||
|
||||
"addic. %0 , %0 , -16 \n\t"
|
||||
"ble 2f \n\t"
|
||||
|
||||
".align 5 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"dcbt %2, %12 \n\t"
|
||||
"dcbt %3, %12 \n\t"
|
||||
|
||||
"xvmaddadp 32, 40, 48 \n\t"
|
||||
"lxvd2x 40, 0, %2 \n\t"
|
||||
"lxvd2x 48, 0, %3 \n\t"
|
||||
"xvmaddadp 33, 41, 49 \n\t"
|
||||
"lxvd2x 41, %5, %2 \n\t"
|
||||
"lxvd2x 49, %5, %3 \n\t"
|
||||
"xvmaddadp 34, 42, 50 \n\t"
|
||||
"lxvd2x 42, %6, %2 \n\t"
|
||||
"lxvd2x 50, %6, %3 \n\t"
|
||||
"xvmaddadp 35, 43, 51 \n\t"
|
||||
"lxvd2x 43, %7, %2 \n\t"
|
||||
"lxvd2x 51, %7, %3 \n\t"
|
||||
"xvmaddadp 36, 44, 52 \n\t"
|
||||
"lxvd2x 44, %8, %2 \n\t"
|
||||
"lxvd2x 52, %8, %3 \n\t"
|
||||
"xvmaddadp 37, 45, 53 \n\t"
|
||||
"lxvd2x 45, %9, %2 \n\t"
|
||||
"lxvd2x 53, %9, %3 \n\t"
|
||||
"xvmaddadp 38, 46, 54 \n\t"
|
||||
"lxvd2x 46, %10, %2 \n\t"
|
||||
"lxvd2x 54, %10, %3 \n\t"
|
||||
"xvmaddadp 39, 47, 55 \n\t"
|
||||
|
||||
"lxvd2x 47, %11, %2 \n\t"
|
||||
"lxvd2x 55, %11, %3 \n\t"
|
||||
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
"addi %3, %3, 128 \n\t"
|
||||
|
||||
"addic. %0 , %0 , -16 \n\t"
|
||||
"bgt 1b \n\t"
|
||||
|
||||
"2: \n\t"
|
||||
|
||||
"xvmaddadp 32, 40, 48 \n\t"
|
||||
"xvmaddadp 33, 41, 49 \n\t"
|
||||
"xvmaddadp 34, 42, 50 \n\t"
|
||||
"xvmaddadp 35, 43, 51 \n\t"
|
||||
"xvmaddadp 36, 44, 52 \n\t"
|
||||
"xvmaddadp 37, 45, 53 \n\t"
|
||||
"xvmaddadp 38, 46, 54 \n\t"
|
||||
"xvmaddadp 39, 47, 55 \n\t"
|
||||
|
||||
"xvadddp 32, 32, 33 \n\t"
|
||||
"xvadddp 34, 34, 35 \n\t"
|
||||
"xvadddp 36, 36, 37 \n\t"
|
||||
"xvadddp 38, 38, 39 \n\t"
|
||||
|
||||
"xvadddp 32, 32, 34 \n\t"
|
||||
"xvadddp 36, 36, 38 \n\t"
|
||||
|
||||
"xvadddp 32, 32, 36 \n\t"
|
||||
|
||||
"xxswapd 33, 32 \n\t"
|
||||
|
||||
"xsadddp 32, 32, 33 \n\t"
|
||||
|
||||
"stxsdx 32, 0, %4 \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x1), // 2
|
||||
"r" (y1), // 3
|
||||
"r" (dot), // 4
|
||||
"r" (o16), // 5
|
||||
"r" (o32), // 6
|
||||
"r" (o48), // 7
|
||||
"r" (o64), // 8
|
||||
"r" (o80), // 9
|
||||
"r" (o96), // 10
|
||||
"r" (o112), // 11
|
||||
"r" (pre) // 12
|
||||
: "cr0", "%0", "%2" , "%3", "memory"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,426 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/30 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(POWER8)
|
||||
#include "dgemv_n_microk_power8.c"
|
||||
#endif
|
||||
|
||||
|
||||
#define NBMAX 4096
|
||||
|
||||
#ifndef HAVE_KERNEL_4x4
|
||||
|
||||
/*
 * Portable fallback: y[r] += sum over c = 0..3 of ap[c][r] * (xo[c] * *alpha)
 * for r = 0 .. n-1.
 *
 * ap holds four column pointers.  Rows are handled four at a time and
 * y[r .. r+3] are always written together, so n is expected to be a
 * multiple of 4.
 */
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
	FLOAT *col0 = ap[0];
	FLOAT *col1 = ap[1];
	FLOAT *col2 = ap[2];
	FLOAT *col3 = ap[3];
	FLOAT xs[4] __attribute__ ((aligned (16)));
	BLASLONG row;
	int k;

	/* scale the four x entries by alpha once, up front */
	for ( k = 0; k < 4; k++ )
		xs[k] = xo[k] * *alpha;

	for ( row = 0; row < n; row += 4 )
	{
		for ( k = 0; k < 4; k++ )
		{
			const BLASLONG r = row + k;

			y[r] += col0[r]*xs[0] + col1[r]*xs[1] + col2[r]*xs[2] + col3[r]*xs[3];
		}
	}
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_4x2
|
||||
|
||||
/*
 * Portable fallback: y[r] += ap[0][r] * (xo[0] * *alpha)
 *                          + ap[1][r] * (xo[1] * *alpha)   for r = 0 .. n-1.
 *
 * Only the first two column pointers of ap are used.  Rows are handled
 * four at a time, so n is expected to be a multiple of 4.
 */
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
	FLOAT *col0 = ap[0];
	FLOAT *col1 = ap[1];
	const FLOAT s0 = xo[0] * *alpha;
	const FLOAT s1 = xo[1] * *alpha;
	BLASLONG row;
	int k;

	for ( row = 0; row < n; row += 4 )
	{
		for ( k = 0; k < 4; k++ )
		{
			const BLASLONG r = row + k;

			y[r] += col0[r]*s0 + col1[r]*s1;
		}
	}
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_4x1
|
||||
|
||||
/*
 * Portable fallback: y[r] += ap[r] * (xo[0] * *alpha) for r = 0 .. n-1.
 *
 * Unlike the 4x4 / 4x2 kernels, ap is a single column pointer here.
 * Rows are handled four at a time, so n is expected to be a multiple of 4.
 */
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
	FLOAT *col = ap;
	const FLOAT s = xo[0] * *alpha;
	BLASLONG row;
	int k;

	for ( row = 0; row < n; row += 4 )
	{
		for ( k = 0; k < 4; k++ )
			y[row + k] += col[row + k]*s;
	}
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/*
 * Adds the n contiguous values at src onto a strided destination:
 * dest[k * inc_dest] += src[k] for k = 0 .. n-1.
 *
 * When inc_dest == 1 the function deliberately does nothing; the driver
 * in this file only calls it for non-unit strides.
 */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
	BLASLONG k;

	if ( inc_dest == 1 )
		return;

	for ( k = 0; k < n; k++ )
	{
		*dest += src[k];
		dest += inc_dest;
	}
}
|
||||
|
||||
/*
 * GEMV (non-transposed) driver: y += alpha * A * x.
 *
 *   m, n    - rows and columns of A; nothing is done if either is < 1.
 *   dummy1  - unused.
 *   alpha   - scalar multiplier.
 *   a, lda  - matrix stored column by column; column j starts at a + j*lda.
 *   x,inc_x - input vector of n elements and its stride.
 *   y,inc_y - vector of m elements that is updated, and its stride.
 *   buffer  - scratch space used as a contiguous accumulator when
 *             inc_y != 1; at most NBMAX elements of it are touched per
 *             row block.
 *
 * Rows are processed in blocks of at most NBMAX (a multiple of four rows
 * each) through the dgemv_kernel_4x{4,2,1} kernels; the final m % 4 rows
 * are handled by the scalar code at the end.  Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{

	BLASLONG i;
	FLOAT *a_ptr;
	FLOAT *x_ptr;
	FLOAT *y_ptr;
	BLASLONG n1;
	BLASLONG m1;
	BLASLONG m2;
	BLASLONG m3;
	BLASLONG n2;
	BLASLONG lda4 = lda << 2;
	FLOAT *ap[4] __attribute__ ((aligned (16)));
	FLOAT xbuffer[8] __attribute__ ((aligned (16)));
	FLOAT alpha_r[4] __attribute__ ((aligned (16)));
	FLOAT *ybuffer;

	/* the kernels take alpha through a pointer */
	alpha_r[0] = alpha;

	if ( m < 1 ) return(0);
	if ( n < 1 ) return(0);

	ybuffer = buffer;

	n1 = n >> 2 ;	/* number of full groups of four columns */
	n2 = n &  3 ;	/* leftover columns */

	m3 = m & 3  ;			/* rows left for the scalar tail */
	m1 = m & -4 ;			/* rows handled by the blocked kernels */
	m2 = (m & (NBMAX-1)) - m3 ;	/* size of the last, partial row block */

	y_ptr = y;

	BLASLONG NB = NBMAX;

	/* one pass per row block; the pass with NB < NBMAX is the last one */
	while ( NB == NBMAX )
	{

		m1 -= NB;
		if ( m1 < 0)
		{
			if ( m2 == 0 ) break;
			NB = m2;
		}

		a_ptr = a;
		x_ptr = x;

		ap[0] = a_ptr;
		ap[1] = a_ptr + lda;
		ap[2] = ap[1] + lda;
		ap[3] = ap[2] + lda;

		/* strided y: accumulate into the zeroed scratch buffer and scatter
		   afterwards; contiguous y: accumulate in place */
		if ( inc_y != 1 )
			memset(ybuffer,0,NB*sizeof(FLOAT));
		else
			ybuffer = y_ptr;

		if ( inc_x == 1 )
		{

			for( i = 0; i < n1 ; i++)
			{
				dgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,alpha_r);
				ap[0] += lda4;
				ap[1] += lda4;
				ap[2] += lda4;
				ap[3] += lda4;
				a_ptr += lda4;
				x_ptr += 4;
			}

			if ( n2 & 2 )
			{
				/* ap[0], ap[1] already point at the next two columns */
				dgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,alpha_r);
				a_ptr += lda*2;
				x_ptr += 2;
			}

			if ( n2 & 1 )
			{
				dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,alpha_r);
				a_ptr += lda;
				x_ptr += 1;
			}

		}
		else
		{

			/* strided x: gather four entries into xbuffer per kernel call */
			for( i = 0; i < n1 ; i++)
			{
				xbuffer[0] = x_ptr[0];
				x_ptr += inc_x;
				xbuffer[1] = x_ptr[0];
				x_ptr += inc_x;
				xbuffer[2] = x_ptr[0];
				x_ptr += inc_x;
				xbuffer[3] = x_ptr[0];
				x_ptr += inc_x;
				dgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha_r);
				ap[0] += lda4;
				ap[1] += lda4;
				ap[2] += lda4;
				ap[3] += lda4;
				a_ptr += lda4;
			}

			for( i = 0; i < n2 ; i++)
			{
				xbuffer[0] = x_ptr[0];
				x_ptr += inc_x;
				dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha_r);
				a_ptr += lda;
			}

		}

		/* advance to the next row block */
		a += NB;
		if ( inc_y != 1 )
		{
			add_y(NB,ybuffer,y_ptr,inc_y);
			y_ptr += NB * inc_y;
		}
		else
			y_ptr += NB ;

	}

	if ( m3 == 0 ) return(0);

	/* scalar tail: the last three rows */
	if ( m3 == 3 )
	{
		a_ptr = a;
		x_ptr = x;
		FLOAT temp0 = 0.0;
		FLOAT temp1 = 0.0;
		FLOAT temp2 = 0.0;
		if ( lda == 3 && inc_x ==1 )
		{
			/* columns are packed back to back: walk A linearly */
			for( i = 0; i < ( n & -4 ); i+=4 )
			{

				temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
				temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
				temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1];

				temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9]  * x_ptr[3];
				temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3];
				temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3];

				a_ptr += 12;
				x_ptr += 4;
			}

			for( ; i < n; i++ )
			{
				temp0 += a_ptr[0] * x_ptr[0];
				temp1 += a_ptr[1] * x_ptr[0];
				temp2 += a_ptr[2] * x_ptr[0];
				a_ptr += 3;
				x_ptr ++;
			}

		}
		else
		{

			for( i = 0; i < n; i++ )
			{
				temp0 += a_ptr[0] * x_ptr[0];
				temp1 += a_ptr[1] * x_ptr[0];
				temp2 += a_ptr[2] * x_ptr[0];
				a_ptr += lda;
				x_ptr += inc_x;
			}

		}
		y_ptr[0] += alpha * temp0;
		y_ptr += inc_y;
		y_ptr[0] += alpha * temp1;
		y_ptr += inc_y;
		y_ptr[0] += alpha * temp2;
		return(0);
	}

	/* scalar tail: the last two rows */
	if ( m3 == 2 )
	{
		a_ptr = a;
		x_ptr = x;
		FLOAT temp0 = 0.0;
		FLOAT temp1 = 0.0;
		if ( lda == 2 && inc_x ==1 )
		{

			for( i = 0; i < (n & -4) ; i+=4 )
			{
				temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
				temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
				temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
				temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
				a_ptr += 8;
				x_ptr += 4;
			}

			for( ; i < n; i++ )
			{
				temp0 += a_ptr[0] * x_ptr[0];
				temp1 += a_ptr[1] * x_ptr[0];
				a_ptr += 2;
				x_ptr ++;
			}

		}
		else
		{

			for( i = 0; i < n; i++ )
			{
				temp0 += a_ptr[0] * x_ptr[0];
				temp1 += a_ptr[1] * x_ptr[0];
				a_ptr += lda;
				x_ptr += inc_x;
			}

		}
		y_ptr[0] += alpha * temp0;
		y_ptr += inc_y;
		y_ptr[0] += alpha * temp1;
		return(0);
	}

	/* scalar tail: the last row */
	if ( m3 == 1 )
	{
		a_ptr = a;
		x_ptr = x;
		FLOAT temp = 0.0;
		if ( lda == 1 && inc_x ==1 )
		{

			for( i = 0; i < (n & -4); i+=4 )
			{
				temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];
			}

			for( ; i < n; i++ )
			{
				temp += a_ptr[i] * x_ptr[i];
			}

		}
		else
		{

			for( i = 0; i < n; i++ )
			{
				temp += a_ptr[0] * x_ptr[0];
				a_ptr += lda;
				x_ptr += inc_x;
			}

		}
		y_ptr[0] += alpha * temp;
		return(0);
	}

	return(0);
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,301 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/30 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_4x4 1
|
||||
|
||||
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
|
||||
|
||||
/*
 * POWER8 VSX micro-kernel: y[0..n) += alpha * (x0*a0 + x1*a1 + x2*a2 + x3*a3).
 *
 *   n     - number of y elements to update; consumed four at a time by the
 *           assembly (the counter is decremented by 4 per group)
 *   ap    - four column pointers a0..a3 (ap[0]..ap[3])
 *   xo    - the four x values matching those columns
 *   y     - vector updated in place
 *   alpha - pointer to the scalar; it is folded into x before the assembly
 *
 * Inline-asm contract:
 *   - The assembly advances the counter, y and the four column pointers, so
 *     those operands must be read-write.  Every operand is declared "+" in the
 *     original order so the %0..%10 numbering used by the instructions is
 *     unchanged; the offsets and prefetch distance are simply left untouched.
 *   - Operands that appear in an RA slot (addi, lxvd2x, stxvd2x, lxvdsx, dcbt)
 *     use the "b" constraint so they can never be allocated to r0, which those
 *     instructions read as the constant zero.
 *   - All VSX registers written by the assembly are listed as clobbers;
 *     vs52-vs55 are callee-saved and would otherwise be silently corrupted.
 *   - The previous clobber strings "%0", "%2", ... named hard registers
 *     r0, r2, ... rather than the operands and have been dropped.
 */
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
	BLASLONG i = n;
	BLASLONG o8 = 8;
	BLASLONG o16 = 16;
	BLASLONG o24 = 24;
	BLASLONG pre = 384;	// prefetch distance in bytes used by dcbt

	FLOAT *a0, *a1, *a2, *a3;
	FLOAT *y1 = y + 1;	// the +1 element is undone by "addi %2, %2, -8" below
	FLOAT x[4] __attribute__ ((aligned (16)));
	FLOAT *xp = x;		// an array is not an lvalue, so pass a pointer to the asm

	// Same +1 / -8 byte adjustment as y1 above.
	a0 = ap[0] + 1;
	a1 = ap[1] + 1;
	a2 = ap[2] + 1;
	a3 = ap[3] + 1;

	// Pre-scale x by alpha so the loop only needs multiply-adds.
	x[0] = xo[0] * *alpha;
	x[1] = xo[1] * *alpha;
	x[2] = xo[2] * *alpha;
	x[3] = xo[3] * *alpha;

	__asm__ __volatile__
	(
	"lxvdsx 32, 0 , %1 \n\t" // x0
	"lxvdsx 33,%3 , %1 \n\t" // x1
	"lxvdsx 34,%4 , %1 \n\t" // x2
	"lxvdsx 35,%5 , %1 \n\t" // x3
	"addi %2 , %2 , -8 \n\t"
	"addi %6 , %6 , -8 \n\t"
	"addi %7 , %7 , -8 \n\t"
	"addi %8 , %8 , -8 \n\t"
	"addi %9 , %9 , -8 \n\t"

	"lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1]
	"lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3]

	"lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1]
	"lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3]

	"lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1]
	"lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3]

	"lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1]
	"lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3]

	"addi %6, %6, 32 \n\t"
	"addi %7, %7, 32 \n\t"
	"addi %8, %8, 32 \n\t"
	"addi %9, %9, 32 \n\t"

	"addic. %0 , %0 , -4 \n\t"
	"ble 2f \n\t"

	".align 5 \n\t"
	"1: \n\t"

	// ---- unrolled group 1 (with prefetch) ----
	"dcbt %2, %10 \n\t"

	"lxvd2x 40, 0, %2 \n\t" // y0, y1
	"lxvd2x 41,%4, %2 \n\t" // y2, y3

	"dcbt %6, %10 \n\t"
	"dcbt %7, %10 \n\t"
	"dcbt %8, %10 \n\t"
	"dcbt %9, %10 \n\t"

	"xvmaddadp 40, 48, 32 \n\t"
	"xvmaddadp 41, 49, 32 \n\t"

	"lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1]
	"lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3]

	"xvmaddadp 40, 50, 33 \n\t"
	"addi %6, %6, 32 \n\t"
	"xvmaddadp 41, 51, 33 \n\t"

	"lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1]
	"lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3]

	"xvmaddadp 40, 52, 34 \n\t"
	"addi %7, %7, 32 \n\t"
	"xvmaddadp 41, 53, 34 \n\t"

	"lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1]
	"lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3]

	"xvmaddadp 40, 54, 35 \n\t"
	"addi %8, %8, 32 \n\t"
	"xvmaddadp 41, 55, 35 \n\t"

	"stxvd2x 40, 0, %2 \n\t" // y0, y1
	"stxvd2x 41,%4, %2 \n\t" // y2, y3

	"lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1]
	"lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3]

	"addi %9, %9, 32 \n\t"
	"addi %2, %2, 32 \n\t"

	"addic. %0 , %0 , -4 \n\t"
	"ble 2f \n\t"

	// ---- unrolled group 2 ----
	"lxvd2x 40, 0, %2 \n\t" // y0, y1
	"lxvd2x 41,%4, %2 \n\t" // y2, y3

	"xvmaddadp 40, 48, 32 \n\t"
	"xvmaddadp 41, 49, 32 \n\t"

	"lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1]
	"lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3]

	"xvmaddadp 40, 50, 33 \n\t"
	"addi %6, %6, 32 \n\t"
	"xvmaddadp 41, 51, 33 \n\t"

	"lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1]
	"lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3]

	"xvmaddadp 40, 52, 34 \n\t"
	"addi %7, %7, 32 \n\t"
	"xvmaddadp 41, 53, 34 \n\t"

	"lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1]
	"lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3]

	"xvmaddadp 40, 54, 35 \n\t"
	"addi %8, %8, 32 \n\t"
	"xvmaddadp 41, 55, 35 \n\t"

	"stxvd2x 40, 0, %2 \n\t" // y0, y1
	"stxvd2x 41,%4, %2 \n\t" // y2, y3

	"lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1]
	"lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3]

	"addi %9, %9, 32 \n\t"
	"addi %2, %2, 32 \n\t"

	"addic. %0 , %0 , -4 \n\t"
	"ble 2f \n\t"

	// ---- unrolled group 3 ----
	"lxvd2x 40, 0, %2 \n\t" // y0, y1
	"lxvd2x 41,%4, %2 \n\t" // y2, y3

	"xvmaddadp 40, 48, 32 \n\t"
	"xvmaddadp 41, 49, 32 \n\t"

	"lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1]
	"lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3]

	"xvmaddadp 40, 50, 33 \n\t"
	"addi %6, %6, 32 \n\t"
	"xvmaddadp 41, 51, 33 \n\t"

	"lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1]
	"lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3]

	"xvmaddadp 40, 52, 34 \n\t"
	"addi %7, %7, 32 \n\t"
	"xvmaddadp 41, 53, 34 \n\t"

	"lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1]
	"lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3]

	"xvmaddadp 40, 54, 35 \n\t"
	"addi %8, %8, 32 \n\t"
	"xvmaddadp 41, 55, 35 \n\t"

	"stxvd2x 40, 0, %2 \n\t" // y0, y1
	"stxvd2x 41,%4, %2 \n\t" // y2, y3

	"lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1]
	"lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3]

	"addi %9, %9, 32 \n\t"
	"addi %2, %2, 32 \n\t"

	"addic. %0 , %0 , -4 \n\t"
	"ble 2f \n\t"

	// ---- unrolled group 4 ----
	"lxvd2x 40, 0, %2 \n\t" // y0, y1
	"lxvd2x 41,%4, %2 \n\t" // y2, y3

	"xvmaddadp 40, 48, 32 \n\t"
	"xvmaddadp 41, 49, 32 \n\t"

	"lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1]
	"lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3]

	"xvmaddadp 40, 50, 33 \n\t"
	"addi %6, %6, 32 \n\t"
	"xvmaddadp 41, 51, 33 \n\t"

	"lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1]
	"lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3]

	"xvmaddadp 40, 52, 34 \n\t"
	"addi %7, %7, 32 \n\t"
	"xvmaddadp 41, 53, 34 \n\t"

	"lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1]
	"lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3]

	"xvmaddadp 40, 54, 35 \n\t"
	"addi %8, %8, 32 \n\t"
	"xvmaddadp 41, 55, 35 \n\t"

	"stxvd2x 40, 0, %2 \n\t" // y0, y1
	"stxvd2x 41,%4, %2 \n\t" // y2, y3

	"lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1]
	"lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3]

	"addi %9, %9, 32 \n\t"
	"addi %2, %2, 32 \n\t"

	"addic. %0 , %0 , -4 \n\t"
	"bgt 1b \n\t"

	// ---- final group: the a-values are already loaded, nothing more to fetch ----
	"2: \n\t"

	"lxvd2x 40, 0, %2 \n\t" // y0, y1
	"lxvd2x 41,%4, %2 \n\t" // y2, y3

	"xvmaddadp 40, 48, 32 \n\t"
	"xvmaddadp 41, 49, 32 \n\t"

	"xvmaddadp 40, 50, 33 \n\t"
	"xvmaddadp 41, 51, 33 \n\t"

	"xvmaddadp 40, 52, 34 \n\t"
	"xvmaddadp 41, 53, 34 \n\t"

	"xvmaddadp 40, 54, 35 \n\t"
	"xvmaddadp 41, 55, 35 \n\t"

	"stxvd2x 40, 0, %2 \n\t" // y0, y1
	"stxvd2x 41,%4, %2 \n\t" // y2, y3

	:
	  "+r" (i),	// 0
	  "+b" (xp),	// 1
	  "+b" (y1),	// 2
	  "+b" (o8),	// 3
	  "+b" (o16),	// 4
	  "+b" (o24),	// 5
	  "+b" (a0),	// 6
	  "+b" (a1),	// 7
	  "+b" (a2),	// 8
	  "+b" (a3),	// 9
	  "+b" (pre)	// 10
	:
	: "cr0", "memory",
	  "vs32", "vs33", "vs34", "vs35",
	  "vs40", "vs41",
	  "vs48", "vs49", "vs50", "vs51", "vs52", "vs53", "vs54", "vs55"
	);

}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,167 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/27 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#pragma GCC optimize "O1"
|
||||
|
||||
#if defined(POWER8)
|
||||
#include "drot_microk_power8.c"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_16
|
||||
|
||||
/*
 * Portable fallback for the rotation kernel (used when no HAVE_KERNEL_16
 * micro-kernel has been provided).
 *
 * For every element: x <- c*x + s*y and y <- c*y - s*x, using the values
 * x and y held before the update.  Elements are processed in groups of four
 * until at least n have been handled.  Only c[0] and s[0] are read.
 */
static void drot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{
	const FLOAT cosv = *c;
	const FLOAT sinv = *s;
	FLOAT xin[4], yin[4];
	BLASLONG done;
	int k;

	for (done = 0; done < n; done += 4, x += 4, y += 4)
	{
		// Snapshot the whole group first: all four pairs are read before
		// any of them is written back.
		for (k = 0; k < 4; k++)
		{
			xin[k] = x[k];
			yin[k] = y[k];
		}

		for (k = 0; k < 4; k++)
		{
			x[k] = cosv*xin[k] + sinv*yin[k];
			y[k] = cosv*yin[k] - sinv*xin[k];
		}
	}
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/*
 * Plane rotation driver: for i in [0, n)
 *     x[i*inc_x] <- c*x[i*inc_x] + s*y[i*inc_y]
 *     y[i*inc_y] <- c*y[i*inc_y] - s*x[i*inc_x]
 *
 * Returns 0 in all cases; n <= 0 is a no-op.
 *
 * When both strides are 1, the largest multiple of 16 elements is handed to
 * drot_kernel_16 and the remainder is processed by the scalar loop.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
	BLASLONG k = 0;
	BLASLONG xpos = 0, ypos = 0;
	BLASLONG blocked;
	FLOAT rotated;
	FLOAT cvec[4] __attribute__ ((aligned (16)));
	FLOAT svec[4] __attribute__ ((aligned (16)));
	int lane;

	if ( n <= 0 )
		return(0);

	// Strided case: plain scalar loop.
	if ( (inc_x != 1) || (inc_y != 1) )
	{
		for ( ; k < n; k++ )
		{
			rotated = c*x[xpos] + s*y[ypos] ;
			y[ypos] = c*y[ypos] - s*x[xpos] ;
			x[xpos] = rotated ;

			xpos += inc_x ;
			ypos += inc_y ;
		}
		return(0);
	}

	// Contiguous case: bulk part through the kernel.
	blocked = n & -16;
	if ( blocked > 0 )
	{
		// The kernel takes c and s by pointer; every lane of the 16-byte
		// aligned arrays holds the same value.
		for ( lane = 0; lane < 4; lane++ )
		{
			cvec[lane] = c;
			svec[lane] = s;
		}
		drot_kernel_16(blocked, x, y, cvec, svec);
		k = blocked;
	}

	// Remaining tail elements.
	for ( ; k < n; k++ )
	{
		rotated = c*x[k] + s*y[k] ;
		y[k] = c*y[k] - s*x[k] ;
		x[k] = rotated ;
	}

	return(0);
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,211 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/27 Werner Saar (wernsaar@googlemail.com)
|
||||
*
|
||||
* I don't use fused multiply-add ( precision problems with lapack )
|
||||
*
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_16 1
|
||||
|
||||
static void drot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) __attribute__ ((noinline));
|
||||
|
||||
/*
 * POWER8 VSX rotation micro-kernel:
 *     x <- c*x + s*y,   y <- c*y - s*x
 * Separate multiply and add/subtract instructions are used (no fused
 * multiply-add), as stated in the file header.
 *
 *   n - element count; the assembly consumes 8 elements (64 bytes) per pass
 *   x - first vector, updated in place
 *   y - second vector, updated in place
 *   c - cosine; read from byte offset 16 (c[2]), so the caller must pass an
 *       array with at least three valid elements
 *   s - sine; read from byte offset 16 (s[2]), same requirement
 *
 * Inline-asm contract:
 *   - The counter, both load pointers and both store pointers are advanced by
 *     the assembly and must be read-write.  Every operand is declared "+" in
 *     the original order so the %0..%9 numbering is unchanged.
 *   - Operands used as RA (addi, lxsdx, lxvd2x, stxvd2x) use the "b"
 *     constraint so they can never be r0, which is read as constant zero.
 *   - All VSX registers written are listed as clobbers; vs52-vs63 are
 *     callee-saved and were previously corrupted without notice.
 *   - The former clobber strings "%0", "%1", ... named hard registers, not
 *     operands, and have been dropped.
 */
static void drot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{

	BLASLONG i = n;
	BLASLONG o16 = 16;
	BLASLONG o32 = 32;
	BLASLONG o48 = 48;
	FLOAT *x1 = x;		// load pointer for x (runs one pass ahead of the stores)
	FLOAT *y1 = y;		// load pointer for y
	FLOAT *x2 = x + 1;	// store pointer for x; the +1 is undone by "addi %8, %8, -8"
	FLOAT *y2 = y + 1;	// store pointer for y; the +1 is undone by "addi %9, %9, -8"

	__asm__ __volatile__
	(

	"lxsdx 36 , %5, %3 \n\t" // load c
	"lxsdx 37 , %5, %4 \n\t" // load s
	"addi %8 , %8, -8 \n\t"
	"addi %9 , %9, -8 \n\t"

	"xxspltd 36 , 36, 0 \n\t"
	"xxspltd 37 , 37, 0 \n\t"

	"lxvd2x 32, 0, %1 \n\t" // load x
	"lxvd2x 33, %5, %1 \n\t"
	"lxvd2x 34, %6, %1 \n\t"
	"lxvd2x 35, %7, %1 \n\t"

	"lxvd2x 40, 0, %2 \n\t" // load y
	"lxvd2x 41, %5, %2 \n\t"
	"lxvd2x 42, %6, %2 \n\t"
	"lxvd2x 43, %7, %2 \n\t"

	"addi %1, %1, 64 \n\t"
	"addi %2, %2, 64 \n\t"

	"addic. %0 , %0 , -8 \n\t"
	"ble 2f \n\t"

	".align 5 \n\t"
	"1: \n\t"

	"xvmuldp 48, 32, 36 \n\t" // c * x
	"xvmuldp 49, 33, 36 \n\t"
	"xvmuldp 50, 34, 36 \n\t"
	"xvmuldp 51, 35, 36 \n\t"

	"xvmuldp 56, 40, 36 \n\t" // c * y
	"xvmuldp 57, 41, 36 \n\t"
	"xvmuldp 58, 42, 36 \n\t"
	"xvmuldp 59, 43, 36 \n\t"

	"xvmuldp 52, 32, 37 \n\t" // s * x
	"xvmuldp 53, 33, 37 \n\t"

	"lxvd2x 32, 0, %1 \n\t" // load x
	"lxvd2x 33, %5, %1 \n\t"

	"xvmuldp 54, 34, 37 \n\t"
	"xvmuldp 55, 35, 37 \n\t"

	"lxvd2x 34, %6, %1 \n\t"
	"lxvd2x 35, %7, %1 \n\t"

	"xvmuldp 60, 40, 37 \n\t" // s * y
	"xvmuldp 61, 41, 37 \n\t"

	"lxvd2x 40, 0, %2 \n\t" // load y
	"lxvd2x 41, %5, %2 \n\t"

	"xvmuldp 62, 42, 37 \n\t"
	"xvmuldp 63, 43, 37 \n\t"

	"lxvd2x 42, %6, %2 \n\t"
	"lxvd2x 43, %7, %2 \n\t"

	"xvadddp 48, 48 , 60 \n\t" // c * x + s * y
	"xvadddp 49, 49 , 61 \n\t" // c * x + s * y

	"addi %1, %1, 64 \n\t"
	"addi %2, %2, 64 \n\t"

	"xvadddp 50, 50 , 62 \n\t" // c * x + s * y
	"xvadddp 51, 51 , 63 \n\t" // c * x + s * y

	"xvsubdp 56, 56 , 52 \n\t" // c * y - s * x
	"xvsubdp 57, 57 , 53 \n\t" // c * y - s * x
	"xvsubdp 58, 58 , 54 \n\t" // c * y - s * x
	"xvsubdp 59, 59 , 55 \n\t" // c * y - s * x

	"stxvd2x 48, 0, %8 \n\t" // store x
	"stxvd2x 49, %5, %8 \n\t"
	"stxvd2x 50, %6, %8 \n\t"
	"stxvd2x 51, %7, %8 \n\t"

	"stxvd2x 56, 0, %9 \n\t" // store y
	"stxvd2x 57, %5, %9 \n\t"
	"stxvd2x 58, %6, %9 \n\t"
	"stxvd2x 59, %7, %9 \n\t"

	"addi %8, %8, 64 \n\t"
	"addi %9, %9, 64 \n\t"

	"addic. %0 , %0 , -8 \n\t"
	"bgt 1b \n\t"

	// ---- final pass: operands are already loaded, nothing more to fetch ----
	"2: \n\t"

	"xvmuldp 48, 32, 36 \n\t" // c * x
	"xvmuldp 49, 33, 36 \n\t"
	"xvmuldp 50, 34, 36 \n\t"
	"xvmuldp 51, 35, 36 \n\t"

	"xvmuldp 56, 40, 36 \n\t" // c * y
	"xvmuldp 57, 41, 36 \n\t"
	"xvmuldp 58, 42, 36 \n\t"
	"xvmuldp 59, 43, 36 \n\t"

	"xvmuldp 52, 32, 37 \n\t" // s * x
	"xvmuldp 53, 33, 37 \n\t"
	"xvmuldp 54, 34, 37 \n\t"
	"xvmuldp 55, 35, 37 \n\t"

	"xvmuldp 60, 40, 37 \n\t" // s * y
	"xvmuldp 61, 41, 37 \n\t"
	"xvmuldp 62, 42, 37 \n\t"
	"xvmuldp 63, 43, 37 \n\t"

	"xvadddp 48, 48 , 60 \n\t" // c * x + s * y
	"xvadddp 49, 49 , 61 \n\t" // c * x + s * y
	"xvadddp 50, 50 , 62 \n\t" // c * x + s * y
	"xvadddp 51, 51 , 63 \n\t" // c * x + s * y

	"xvsubdp 56, 56 , 52 \n\t" // c * y - s * x
	"xvsubdp 57, 57 , 53 \n\t" // c * y - s * x
	"xvsubdp 58, 58 , 54 \n\t" // c * y - s * x
	"xvsubdp 59, 59 , 55 \n\t" // c * y - s * x

	"stxvd2x 48, 0, %8 \n\t" // store x
	"stxvd2x 49, %5, %8 \n\t"
	"stxvd2x 50, %6, %8 \n\t"
	"stxvd2x 51, %7, %8 \n\t"

	"stxvd2x 56, 0, %9 \n\t" // store y
	"stxvd2x 57, %5, %9 \n\t"
	"stxvd2x 58, %6, %9 \n\t"
	"stxvd2x 59, %7, %9 \n\t"

	:
	  "+r" (i),	// 0
	  "+b" (x1),	// 1
	  "+b" (y1),	// 2
	  "+b" (c),	// 3
	  "+b" (s),	// 4
	  "+b" (o16),	// 5
	  "+b" (o32),	// 6
	  "+b" (o48),	// 7
	  "+b" (x2),	// 8
	  "+b" (y2)	// 9
	:
	: "cr0", "memory",
	  "vs32", "vs33", "vs34", "vs35", "vs36", "vs37",
	  "vs40", "vs41", "vs42", "vs43",
	  "vs48", "vs49", "vs50", "vs51", "vs52", "vs53", "vs54", "vs55",
	  "vs56", "vs57", "vs58", "vs59", "vs60", "vs61", "vs62", "vs63"
	);

}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,174 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(POWER8)
|
||||
#include "dscal_microk_power8.c"
|
||||
#endif
|
||||
|
||||
#if !defined(HAVE_KERNEL_8)
|
||||
|
||||
/*
 * Portable fallback: multiply x[] in place by *da.
 *
 * Elements are handled in groups of eight until at least n have been
 * processed, so n is expected to be a multiple of 8 (otherwise the last
 * group runs past n).  The scale factor is read once, before any store.
 */
static void dscal_kernel_8( BLASLONG n, FLOAT *da , FLOAT *x )
{
	const FLOAT factor = *da;
	BLASLONG base;
	int k;

	for ( base = 0; base < n; base += 8 )
	{
		for ( k = 0; k < 8; k++ )
			x[base + k] *= factor;
	}
}
|
||||
|
||||
/*
 * Portable fallback: overwrite x[] with 0.0.
 *
 * Elements are cleared in groups of eight until at least n have been
 * written, so n is expected to be a multiple of 8.  The da argument is not
 * read; it only keeps the signature parallel to dscal_kernel_8.
 */
static void dscal_kernel_8_zero( BLASLONG n, FLOAT *da , FLOAT *x )
{
	BLASLONG base;
	int k;

	for ( base = 0; base < n; base += 8 )
	{
		for ( k = 0; k < 8; k++ )
			x[base + k] = 0.0;
	}
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * Scale driver: x[i*inc_x] <- da * x[i*inc_x] for i in [0, n).
 *
 * Returns 0 in all cases.  Nothing is done when n <= 0 or inc_x <= 0.
 * When da compares equal to 0.0 the elements are set to 0.0 directly
 * instead of being multiplied.
 * dummy0, dummy1, y, inc_y, dummy and dummy2 are not used.
 *
 * For inc_x == 1 the largest multiple of 16 elements goes through
 * dscal_kernel_8 / dscal_kernel_8_zero; the remainder is handled by the
 * scalar loop.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG k;
	BLASLONG pos = 0;
	BLASLONG blocked;
	FLOAT splat[2];
	int zeroing;

	if ( n <= 0 || inc_x <= 0 )
		return(0);

	zeroing = ( da == 0.0 );

	if ( inc_x != 1 )
	{
		// Strided case: plain scalar loop.
		for ( k = 0; k < n; k++ )
		{
			if ( zeroing )
				x[pos] = 0.0;
			else
				x[pos] = da * x[pos] ;
			pos += inc_x ;
		}
		return 0;
	}

	// Contiguous case: bulk part through the kernel (blocked is 0 when n < 16).
	blocked = n & -16;
	if ( blocked > 0 )
	{
		// The kernels take the factor by pointer; both slots hold da.
		splat[0] = da;
		splat[1] = da;
		if ( zeroing )
			dscal_kernel_8_zero(blocked , splat , x);
		else
			dscal_kernel_8(blocked , splat , x);
	}

	// Remaining tail elements.
	for ( k = blocked; k < n; k++ )
	{
		if ( zeroing )
			x[k] = 0.0;
		else
			x[k] = da * x[k] ;
	}

	return 0;
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,219 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_8 1
|
||||
|
||||
static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline));
|
||||
|
||||
/*
 * POWER8 VSX scale micro-kernel: x[0..n) <- alpha[0] * x[0..n).
 *
 *   n     - element count; the assembly consumes 16 elements (128 bytes)
 *           per pass
 *   alpha - pointer to the scale factor; alpha[0] is loaded and splatted
 *   x     - vector scaled in place
 *
 * Inline-asm contract:
 *   - The counter, the load pointer and the store pointer are advanced by the
 *     assembly and must be read-write.  Every operand is declared "+" in the
 *     original order so the %0..%11 numbering is unchanged.
 *   - Operands used as RA (addi, dcbt, lxvd2x, stxvd2x) use the "b"
 *     constraint so they can never be r0, which is read as constant zero.
 *   - All VSX registers written are listed as clobbers; vs52-vs55 are
 *     callee-saved and were previously corrupted without notice.
 *   - The former clobber strings "%0", "%1", "%2" named hard registers, not
 *     operands, and have been dropped.
 */
static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
{

	BLASLONG i = n;
	BLASLONG o16 = 16;
	BLASLONG o32 = 32;
	BLASLONG o48 = 48;
	BLASLONG o64 = 64;
	BLASLONG o80 = 80;
	BLASLONG o96 = 96;
	BLASLONG o112 = 112;
	FLOAT *x1 = x;		// load pointer (runs one pass ahead of the stores)
	FLOAT *x2 = x + 1;	// store pointer; the +1 is undone by "addi %1, %1, -8"
	BLASLONG pre = 384;	// prefetch distance in bytes used by dcbt

	__asm__ __volatile__
	(

	"lxsdx 33, 0, %3 \n\t"
	"xxspltd 32, 33, 0 \n\t"
	"addi %1, %1, -8 \n\t"

	"dcbt %2, %4 \n\t"

	"lxvd2x 40, 0, %2 \n\t"
	"lxvd2x 41, %5, %2 \n\t"
	"lxvd2x 42, %6, %2 \n\t"
	"lxvd2x 43, %7, %2 \n\t"
	"lxvd2x 44, %8, %2 \n\t"
	"lxvd2x 45, %9, %2 \n\t"
	"lxvd2x 46, %10, %2 \n\t"
	"lxvd2x 47, %11, %2 \n\t"

	"addi %2, %2, 128 \n\t"

	"addic. %0 , %0 , -16 \n\t"
	"ble 2f \n\t"

	".align 5 \n\t"
	"1: \n\t"

	"dcbt %2, %4 \n\t"

	"xvmuldp 48, 40, 32 \n\t"
	"xvmuldp 49, 41, 32 \n\t"
	"lxvd2x 40, 0, %2 \n\t"
	"lxvd2x 41, %5, %2 \n\t"
	"xvmuldp 50, 42, 32 \n\t"
	"xvmuldp 51, 43, 32 \n\t"
	"lxvd2x 42, %6, %2 \n\t"
	"lxvd2x 43, %7, %2 \n\t"
	"xvmuldp 52, 44, 32 \n\t"
	"xvmuldp 53, 45, 32 \n\t"
	"lxvd2x 44, %8, %2 \n\t"
	"lxvd2x 45, %9, %2 \n\t"
	"xvmuldp 54, 46, 32 \n\t"
	"xvmuldp 55, 47, 32 \n\t"
	"lxvd2x 46, %10, %2 \n\t"
	"lxvd2x 47, %11, %2 \n\t"

	"stxvd2x 48, 0, %1 \n\t"
	"stxvd2x 49, %5, %1 \n\t"
	"stxvd2x 50, %6, %1 \n\t"
	"stxvd2x 51, %7, %1 \n\t"
	"stxvd2x 52, %8, %1 \n\t"
	"stxvd2x 53, %9, %1 \n\t"
	"stxvd2x 54, %10, %1 \n\t"
	"stxvd2x 55, %11, %1 \n\t"

	"addi %1, %1, 128 \n\t"
	"addi %2, %2, 128 \n\t"

	"addic. %0 , %0 , -16 \n\t"
	"bgt 1b \n\t"

	// ---- final pass: operands are already loaded, nothing more to fetch ----
	"2: \n\t"

	"xvmuldp 48, 40, 32 \n\t"
	"xvmuldp 49, 41, 32 \n\t"
	"xvmuldp 50, 42, 32 \n\t"
	"xvmuldp 51, 43, 32 \n\t"
	"xvmuldp 52, 44, 32 \n\t"
	"xvmuldp 53, 45, 32 \n\t"
	"xvmuldp 54, 46, 32 \n\t"
	"xvmuldp 55, 47, 32 \n\t"

	"stxvd2x 48, 0, %1 \n\t"
	"stxvd2x 49, %5, %1 \n\t"
	"stxvd2x 50, %6, %1 \n\t"
	"stxvd2x 51, %7, %1 \n\t"
	"stxvd2x 52, %8, %1 \n\t"
	"stxvd2x 53, %9, %1 \n\t"
	"stxvd2x 54, %10, %1 \n\t"
	"stxvd2x 55, %11, %1 \n\t"

	:
	  "+r" (i),	// 0
	  "+b" (x2),	// 1
	  "+b" (x1),	// 2
	  "+b" (alpha),	// 3
	  "+b" (pre),	// 4
	  "+b" (o16),	// 5
	  "+b" (o32),	// 6
	  "+b" (o48),	// 7
	  "+b" (o64),	// 8
	  "+b" (o80),	// 9
	  "+b" (o96),	// 10
	  "+b" (o112)	// 11
	:
	: "cr0", "memory",
	  "vs32", "vs33",
	  "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47",
	  "vs48", "vs49", "vs50", "vs51", "vs52", "vs53", "vs54", "vs55"
	);

}
|
||||
|
||||
|
||||
static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline));
|
||||
|
||||
/*
 * POWER8 VSX zero-fill micro-kernel: x[0..n) <- 0.0.
 *
 *   n     - element count; 16 elements (128 bytes) are written per pass and
 *           the loop body always runs at least once, so n must be a positive
 *           multiple of 16
 *   alpha - not read by the assembly; kept for signature parity with
 *           dscal_kernel_8
 *   x     - vector overwritten with zeros
 *
 * Inline-asm contract:
 *   - The counter and the store pointer are advanced by the assembly and must
 *     be read-write.  The operand list mirrors dscal_kernel_8 (operands 2, 3
 *     and 4 are not referenced here) and every operand is declared "+" in the
 *     original order so the %0..%11 numbering is unchanged.
 *   - Operands used as RA (addi, stxvd2x) use the "b" constraint so they can
 *     never be r0, which is read as constant zero.
 *   - vs32 is written and therefore listed as a clobber.
 *   - The former clobber strings "%0", "%1", "%2" named hard registers, not
 *     operands, and have been dropped.
 */
static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
{

	BLASLONG i = n;
	BLASLONG o16 = 16;
	BLASLONG o32 = 32;
	BLASLONG o48 = 48;
	BLASLONG o64 = 64;
	BLASLONG o80 = 80;
	BLASLONG o96 = 96;
	BLASLONG o112 = 112;
	FLOAT *x1 = x;
	FLOAT *x2 = x + 1;	// store pointer; the +1 is undone by "addi %1, %1, -8"
	BLASLONG pre = 384;

	__asm__ __volatile__
	(

	"xxlxor 32 , 32 , 32 \n\t"	// vs32 = 0
	"addi %1, %1, -8 \n\t"

	".align 5 \n\t"
	"1: \n\t"

	"stxvd2x 32, 0, %1 \n\t"
	"stxvd2x 32, %5, %1 \n\t"
	"stxvd2x 32, %6, %1 \n\t"
	"stxvd2x 32, %7, %1 \n\t"
	"stxvd2x 32, %8, %1 \n\t"
	"stxvd2x 32, %9, %1 \n\t"
	"stxvd2x 32, %10, %1 \n\t"
	"stxvd2x 32, %11, %1 \n\t"

	"addi %1, %1, 128 \n\t"

	"addic. %0 , %0 , -16 \n\t"
	"bgt 1b \n\t"

	"2: \n\t"

	:
	  "+r" (i),	// 0
	  "+b" (x2),	// 1
	  "+b" (x1),	// 2
	  "+b" (alpha),	// 3
	  "+b" (pre),	// 4
	  "+b" (o16),	// 5
	  "+b" (o32),	// 6
	  "+b" (o48),	// 7
	  "+b" (o64),	// 8
	  "+b" (o80),	// 9
	  "+b" (o96),	// 10
	  "+b" (o112)	// 11
	:
	: "cr0", "memory", "vs32"
	);

}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,154 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(POWER8)
|
||||
#include "dswap_microk_power8.c"
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_32
|
||||
|
||||
/*
 * Portable fallback: exchange the contents of x and y, eight elements per
 * pass.  Both groups of eight are read completely before anything is
 * written back; y is written first, then x.
 * The loop advances in steps of 8 until the position reaches n.
 */
static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
	FLOAT from_x[8];
	FLOAT from_y[8];
	BLASLONG pos;
	int k;

	for ( pos = 0; pos < n; pos += 8 )
	{
		FLOAT *xs = x + pos;
		FLOAT *ys = y + pos;

		for ( k = 0; k < 8; k++ )
			from_x[k] = xs[k];

		for ( k = 0; k < 8; k++ )
			from_y[k] = ys[k];

		for ( k = 0; k < 8; k++ )
			ys[k] = from_x[k];

		for ( k = 0; k < 8; k++ )
			xs[k] = from_y[k];
	}

}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/*
 * Swap driver: exchanges n elements of x (stride inc_x) with n elements of
 * y (stride inc_y).  Always returns 0.
 *
 * When both strides are 1, the largest multiple of 32 not exceeding n is
 * handed to dswap_kernel_32 and the remainder is swapped element by
 * element.  Any other stride combination uses the scalar loop only.
 * The dummy* parameters are not used here.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG k;
	BLASLONG xpos = 0;
	BLASLONG ypos = 0;
	FLOAT held;

	if ( n <= 0 ) return(0);

	if ( (inc_x != 1) || (inc_y != 1) )
	{
		/* strided path */
		for ( k = 0; k < n; k++ )
		{
			held    = y[ypos];
			y[ypos] = x[xpos];
			x[xpos] = held;
			xpos += inc_x;
			ypos += inc_y;
		}
		return(0);
	}

	/* contiguous path: bulk part through the kernel, tail by hand */
	k = n & -32;
	if ( k > 0 )
		dswap_kernel_32(k, x, y);

	for ( ; k < n; k++ )
	{
		held = y[k];
		y[k] = x[k];
		x[k] = held;
	}

	return(0);

}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,180 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_32 1
|
||||
|
||||
static void dswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));

/*
 * POWER8 VSX swap kernel.
 *
 * Each pass loads 256 bytes from x into vs32..vs47 and 256 bytes from y into
 * vs48..vs63, then stores the x data to y and the y data to x.  The counter
 * is decremented by 32 per pass, so n is expected to be a positive multiple
 * of 32.  The body runs before the counter is tested, so one full pass is
 * always executed.
 *
 * Separate read and write cursors are kept for each array because the loads
 * of a pass advance the read cursors before the stores start.
 *
 * Constraint notes:
 *  - the counter and the four cursors are modified by the asm and are
 *    therefore read-write ("+") operands;
 *  - every register used in an RA position (addi, indexed load/store) is in
 *    the "b" class, because r0 there means the constant zero;
 *  - vs32..vs63 are overwritten and listed as clobbered.
 */
static void dswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
{
	BLASLONG i = n;
	BLASLONG o16 = 16;
	BLASLONG o32 = 32;
	BLASLONG o48 = 48;
	BLASLONG o64 = 64;
	BLASLONG o80 = 80;
	BLASLONG o96 = 96;
	BLASLONG o112 = 112;
	FLOAT *x_rd = x;
	FLOAT *y_rd = y;
	FLOAT *x_wr = x;
	FLOAT *y_wr = y;

	__asm__ __volatile__
	(
	".align 5				\n\t"
	"1:					\n\t"

	/* first 128 bytes of x */
	"lxvd2x		32, 0, %[xr]		\n\t"
	"lxvd2x		33, %[o16], %[xr]	\n\t"
	"lxvd2x		34, %[o32], %[xr]	\n\t"
	"lxvd2x		35, %[o48], %[xr]	\n\t"
	"lxvd2x		36, %[o64], %[xr]	\n\t"
	"lxvd2x		37, %[o80], %[xr]	\n\t"
	"lxvd2x		38, %[o96], %[xr]	\n\t"
	"lxvd2x		39, %[o112], %[xr]	\n\t"

	"addi		%[xr], %[xr], 128	\n\t"

	/* second 128 bytes of x */
	"lxvd2x		40, 0, %[xr]		\n\t"
	"lxvd2x		41, %[o16], %[xr]	\n\t"
	"lxvd2x		42, %[o32], %[xr]	\n\t"
	"lxvd2x		43, %[o48], %[xr]	\n\t"
	"lxvd2x		44, %[o64], %[xr]	\n\t"
	"lxvd2x		45, %[o80], %[xr]	\n\t"
	"lxvd2x		46, %[o96], %[xr]	\n\t"
	"lxvd2x		47, %[o112], %[xr]	\n\t"

	"addi		%[xr], %[xr], 128	\n\t"

	/* first 128 bytes of y */
	"lxvd2x		48, 0, %[yr]		\n\t"
	"lxvd2x		49, %[o16], %[yr]	\n\t"
	"lxvd2x		50, %[o32], %[yr]	\n\t"
	"lxvd2x		51, %[o48], %[yr]	\n\t"
	"lxvd2x		52, %[o64], %[yr]	\n\t"
	"lxvd2x		53, %[o80], %[yr]	\n\t"
	"lxvd2x		54, %[o96], %[yr]	\n\t"
	"lxvd2x		55, %[o112], %[yr]	\n\t"

	"addi		%[yr], %[yr], 128	\n\t"

	/* second 128 bytes of y */
	"lxvd2x		56, 0, %[yr]		\n\t"
	"lxvd2x		57, %[o16], %[yr]	\n\t"
	"lxvd2x		58, %[o32], %[yr]	\n\t"
	"lxvd2x		59, %[o48], %[yr]	\n\t"
	"lxvd2x		60, %[o64], %[yr]	\n\t"
	"lxvd2x		61, %[o80], %[yr]	\n\t"
	"lxvd2x		62, %[o96], %[yr]	\n\t"
	"lxvd2x		63, %[o112], %[yr]	\n\t"

	"addi		%[yr], %[yr], 128	\n\t"

	/* x data -> y */
	"stxvd2x	32, 0, %[yw]		\n\t"
	"stxvd2x	33, %[o16], %[yw]	\n\t"
	"stxvd2x	34, %[o32], %[yw]	\n\t"
	"stxvd2x	35, %[o48], %[yw]	\n\t"
	"stxvd2x	36, %[o64], %[yw]	\n\t"
	"stxvd2x	37, %[o80], %[yw]	\n\t"
	"stxvd2x	38, %[o96], %[yw]	\n\t"
	"stxvd2x	39, %[o112], %[yw]	\n\t"

	"addi		%[yw], %[yw], 128	\n\t"

	"stxvd2x	40, 0, %[yw]		\n\t"
	"stxvd2x	41, %[o16], %[yw]	\n\t"
	"stxvd2x	42, %[o32], %[yw]	\n\t"
	"stxvd2x	43, %[o48], %[yw]	\n\t"
	"stxvd2x	44, %[o64], %[yw]	\n\t"
	"stxvd2x	45, %[o80], %[yw]	\n\t"
	"stxvd2x	46, %[o96], %[yw]	\n\t"
	"stxvd2x	47, %[o112], %[yw]	\n\t"

	"addi		%[yw], %[yw], 128	\n\t"

	/* y data -> x */
	"stxvd2x	48, 0, %[xw]		\n\t"
	"stxvd2x	49, %[o16], %[xw]	\n\t"
	"stxvd2x	50, %[o32], %[xw]	\n\t"
	"stxvd2x	51, %[o48], %[xw]	\n\t"
	"stxvd2x	52, %[o64], %[xw]	\n\t"
	"stxvd2x	53, %[o80], %[xw]	\n\t"
	"stxvd2x	54, %[o96], %[xw]	\n\t"
	"stxvd2x	55, %[o112], %[xw]	\n\t"

	"addi		%[xw], %[xw], 128	\n\t"

	"stxvd2x	56, 0, %[xw]		\n\t"
	"stxvd2x	57, %[o16], %[xw]	\n\t"
	"stxvd2x	58, %[o32], %[xw]	\n\t"
	"stxvd2x	59, %[o48], %[xw]	\n\t"
	"stxvd2x	60, %[o64], %[xw]	\n\t"
	"stxvd2x	61, %[o80], %[xw]	\n\t"
	"stxvd2x	62, %[o96], %[xw]	\n\t"
	"stxvd2x	63, %[o112], %[xw]	\n\t"

	"addi		%[xw], %[xw], 128	\n\t"

	"addic.		%[n], %[n], -32		\n\t"
	"bgt		1b			\n\t"

	:
	  [n]    "+r" (i),
	  [yr]   "+b" (y_rd),
	  [xr]   "+b" (x_rd),
	  [yw]   "+b" (y_wr),
	  [xw]   "+b" (x_wr)
	:
	  [o16]  "b" (o16),
	  [o32]  "b" (o32),
	  [o48]  "b" (o48),
	  [o64]  "b" (o64),
	  [o80]  "b" (o80),
	  [o96]  "b" (o96),
	  [o112] "b" (o112)
	: "cr0",
	  "vs32", "vs33", "vs34", "vs35", "vs36", "vs37", "vs38", "vs39",
	  "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47",
	  "vs48", "vs49", "vs50", "vs51", "vs52", "vs53", "vs54", "vs55",
	  "vs56", "vs57", "vs58", "vs59", "vs60", "vs61", "vs62", "vs63",
	  "memory"
	);

}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,146 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/28 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
|
||||
#define ABS fabs
|
||||
|
||||
#else
|
||||
|
||||
#define ABS fabsf
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(POWER8)
|
||||
#include "sasum_microk_power8.c"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_32
|
||||
|
||||
/*
 * Portable fallback: sum of absolute values of x1, eight elements per pass,
 * spread over four running partial sums (element j of a group feeds partial
 * sum j mod 4).  The combined total is written to svec[0]; svec[1..3] are
 * set to zero.
 * The loop advances in steps of 8 until the position reaches n.
 */
static void sasum_kernel_32(BLASLONG n, FLOAT *x1, FLOAT *svec)
{
	FLOAT part[4];
	FLOAT mag;
	FLOAT *src = x1;
	BLASLONG pos;
	int k;

	part[0] = 0.0;
	part[1] = 0.0;
	part[2] = 0.0;
	part[3] = 0.0;

	for ( pos = 0; pos < n; pos += 8 )
	{
		for ( k = 0; k < 8; k++ )
		{
			mag = ABS(src[k]);
			part[k & 3] += mag;
		}
		src += 8;
	}

	svec[0] = part[0]+part[1]+part[2]+part[3];
	svec[1] = 0.0;
	svec[2] = 0.0;
	svec[3] = 0.0;

}
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * Sum of absolute values of n elements of x taken with stride inc_x.
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 *
 * For inc_x == 1 the largest multiple of 32 not exceeding n is processed by
 * sasum_kernel_32, which leaves partial results in svec[0..3]; these are
 * added together and the remaining elements are accumulated one by one.
 * For any other stride a plain scalar loop is used.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	BLASLONG i=0;
	FLOAT sumf = 0.0;
	/* scratch for the kernel's four partial sums, 16-byte aligned */
	FLOAT svec[4] __attribute__ ((aligned (16)));
	BLASLONG n1;

	if (n <= 0 || inc_x <= 0) return(sumf);

	if ( inc_x == 1 )
	{

		n1 = n & -32;
		if ( n1 > 0 )
		{

			sasum_kernel_32(n1, x, svec);
			sumf = svec[0] + svec[1]+svec[2]+svec[3];
			i=n1;
		}

		/* tail that does not fill a block of 32 */
		while(i < n)
		{
			sumf += ABS(x[i]);
			i++;
		}

	}
	else
	{

		/* n becomes the exclusive upper bound of the strided index */
		n *= inc_x;
		while(i < n)
		{
			sumf += ABS(x[i]);
			i += inc_x;
		}

	}
	return(sumf);
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,177 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/28 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_32 1
|
||||
static void sasum_kernel_32( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline));

/*
 * POWER8 VSX kernel for the single-precision absolute-value sum.
 *
 * vs32..vs39 are eight vector accumulators, vs40..vs47 hold the block that
 * was loaded last and vs48..vs55 hold its absolute values.  The first block
 * of 128 bytes is loaded ahead of the loop; inside the loop the absolute
 * values of the previous block are accumulated while the next block is
 * loaded.  After the loop the last block is accumulated, the eight
 * accumulators are added into vs32 and that one vector is stored to svec
 * (16 bytes); the caller adds the lanes.
 *
 * The counter is decremented by 32 per block, so n is expected to be a
 * positive multiple of 32.
 *
 * Constraint notes:
 *  - the counter and the read pointer are modified by the asm and are
 *    read-write ("+") operands;
 *  - registers used in an RA position (addi, dcbt, indexed loads) are in the
 *    "b" class, because r0 there means the constant zero;
 *  - vs32..vs55 are overwritten and listed as clobbered.
 */
static void sasum_kernel_32( BLASLONG n, FLOAT *x, FLOAT *svec)
{
	BLASLONG i = n;
	BLASLONG o16 = 16;
	BLASLONG o32 = 32;
	BLASLONG o48 = 48;
	BLASLONG o64 = 64;
	BLASLONG o80 = 80;
	BLASLONG o96 = 96;
	BLASLONG o112 = 112;
	FLOAT *x1=x;
	BLASLONG pre = 384;	/* prefetch distance in bytes for dcbt */

	__asm__ __volatile__
	(
	"dcbt		%[x], %[pre]		\n\t"

	"xxlxor		32,32,32		\n\t"
	"xxlxor		33,33,33		\n\t"
	"xxlxor		34,34,34		\n\t"
	"xxlxor		35,35,35		\n\t"
	"xxlxor		36,36,36		\n\t"
	"xxlxor		37,37,37		\n\t"
	"xxlxor		38,38,38		\n\t"
	"xxlxor		39,39,39		\n\t"

	"lxvw4x		40, 0, %[x]		\n\t"
	"lxvw4x		41, %[o16], %[x]	\n\t"
	"lxvw4x		42, %[o32], %[x]	\n\t"
	"lxvw4x		43, %[o48], %[x]	\n\t"
	"lxvw4x		44, %[o64], %[x]	\n\t"
	"lxvw4x		45, %[o80], %[x]	\n\t"
	"lxvw4x		46, %[o96], %[x]	\n\t"
	"lxvw4x		47, %[o112], %[x]	\n\t"

	"addi		%[x], %[x], 128		\n\t"

	"addic.		%[n], %[n], -32		\n\t"
	"ble		2f			\n\t"

	".align 5				\n\t"
	"1:					\n\t"

	"dcbt		%[x], %[pre]		\n\t"

	"xvabssp	48, 40			\n\t"
	"xvabssp	49, 41			\n\t"
	"xvabssp	50, 42			\n\t"
	"xvabssp	51, 43			\n\t"

	"lxvw4x		40, 0, %[x]		\n\t"
	"lxvw4x		41, %[o16], %[x]	\n\t"

	"xvabssp	52, 44			\n\t"
	"xvabssp	53, 45			\n\t"

	"lxvw4x		42, %[o32], %[x]	\n\t"
	"lxvw4x		43, %[o48], %[x]	\n\t"

	"xvabssp	54, 46			\n\t"
	"xvabssp	55, 47			\n\t"

	"lxvw4x		44, %[o64], %[x]	\n\t"
	"lxvw4x		45, %[o80], %[x]	\n\t"

	"xvaddsp	32, 32, 48		\n\t"
	"xvaddsp	33, 33, 49		\n\t"

	"lxvw4x		46, %[o96], %[x]	\n\t"
	"lxvw4x		47, %[o112], %[x]	\n\t"

	"xvaddsp	34, 34, 50		\n\t"
	"xvaddsp	35, 35, 51		\n\t"
	"addi		%[x], %[x], 128		\n\t"
	"xvaddsp	36, 36, 52		\n\t"
	"xvaddsp	37, 37, 53		\n\t"
	"addic.		%[n], %[n], -32		\n\t"
	"xvaddsp	38, 38, 54		\n\t"
	"xvaddsp	39, 39, 55		\n\t"

	"bgt		1b			\n\t"

	"2:					\n\t"

	/* accumulate the block that is still sitting in vs40..vs47 */
	"xvabssp	48, 40			\n\t"
	"xvabssp	49, 41			\n\t"
	"xvabssp	50, 42			\n\t"
	"xvabssp	51, 43			\n\t"
	"xvabssp	52, 44			\n\t"
	"xvabssp	53, 45			\n\t"
	"xvabssp	54, 46			\n\t"
	"xvabssp	55, 47			\n\t"

	"xvaddsp	32, 32, 48		\n\t"
	"xvaddsp	33, 33, 49		\n\t"
	"xvaddsp	34, 34, 50		\n\t"
	"xvaddsp	35, 35, 51		\n\t"
	"xvaddsp	36, 36, 52		\n\t"
	"xvaddsp	37, 37, 53		\n\t"
	"xvaddsp	38, 38, 54		\n\t"
	"xvaddsp	39, 39, 55		\n\t"

	/* fold the eight accumulators into vs32 */
	"xvaddsp	32, 32, 33		\n\t"
	"xvaddsp	34, 34, 35		\n\t"
	"xvaddsp	36, 36, 37		\n\t"
	"xvaddsp	38, 38, 39		\n\t"

	"xvaddsp	32, 32, 34		\n\t"
	"xvaddsp	36, 36, 38		\n\t"

	"xvaddsp	32, 32, 36		\n\t"

	"stxvw4x	32, 0, %[svec]		\n\t"

	:
	  [n]    "+r" (i),
	  [x]    "+b" (x1)
	:
	  [svec] "r" (svec),
	  [pre]  "r" (pre),
	  [o16]  "b" (o16),
	  [o32]  "b" (o32),
	  [o48]  "b" (o48),
	  [o64]  "b" (o64),
	  [o80]  "b" (o80),
	  [o96]  "b" (o96),
	  [o112] "b" (o112)
	: "cr0",
	  "vs32", "vs33", "vs34", "vs35", "vs36", "vs37", "vs38", "vs39",
	  "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47",
	  "vs48", "vs49", "vs50", "vs51", "vs52", "vs53", "vs54", "vs55",
	  "memory"
	);

}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,131 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(POWER8)
|
||||
#include "scopy_microk_power8.c"
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_32
|
||||
|
||||
/*
 * Portable fallback: copy x to y, eight elements per pass.  A group of
 * eight is read completely before any of it is written.
 * The loop advances in steps of 8 until the position reaches n.
 */
static void scopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
	FLOAT staged[8];
	BLASLONG pos;
	int k;

	for ( pos = 0; pos < n; pos += 8 )
	{
		FLOAT *src = x + pos;
		FLOAT *dst = y + pos;

		for ( k = 0; k < 8; k++ )
			staged[k] = src[k];

		for ( k = 0; k < 8; k++ )
			dst[k] = staged[k];
	}

}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/*
 * Copy driver: copies n elements of x (stride inc_x) into y (stride inc_y).
 * Always returns 0.
 *
 * When both strides are 1, the largest multiple of 32 not exceeding n is
 * handed to scopy_kernel_32 and the remainder is copied element by element.
 * Any other stride combination uses the scalar loop only.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	BLASLONG k;
	BLASLONG xpos = 0;
	BLASLONG ypos = 0;

	if ( n <= 0 ) return(0);

	if ( (inc_x != 1) || (inc_y != 1) )
	{
		/* strided path */
		for ( k = 0; k < n; k++ )
		{
			y[ypos] = x[xpos];
			xpos += inc_x;
			ypos += inc_y;
		}
		return(0);
	}

	/* contiguous path: bulk part through the kernel, tail by hand */
	k = n & -32;
	if ( k > 0 )
		scopy_kernel_32(k, x, y);

	for ( ; k < n; k++ )
		y[k] = x[k];

	return(0);

}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,131 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_32 1
|
||||
|
||||
static void scopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));

/*
 * POWER8 VSX copy kernel.
 *
 * The first 128 bytes of x are loaded into vs40..vs47 ahead of the loop.
 * Each loop pass stores the block currently held to y while loading the
 * next block from x; the block still held when the loop ends is stored
 * after label 2.  The counter is decremented by 32 per block, so n is
 * expected to be a positive multiple of 32.
 *
 * Constraint notes:
 *  - the counter and both pointers are modified by the asm and are
 *    read-write ("+") operands;
 *  - registers used in an RA position (addi, indexed load/store) are in the
 *    "b" class, because r0 there means the constant zero;
 *  - vs40..vs47 are overwritten and listed as clobbered.
 */
static void scopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
{
	BLASLONG i = n;
	BLASLONG o16 = 16;
	BLASLONG o32 = 32;
	BLASLONG o48 = 48;
	BLASLONG o64 = 64;
	BLASLONG o80 = 80;
	BLASLONG o96 = 96;
	BLASLONG o112 = 112;
	FLOAT *x1=x;
	FLOAT *y1=y;

	__asm__ __volatile__
	(
	"lxvw4x		40, 0, %[x]		\n\t"
	"lxvw4x		41, %[o16], %[x]	\n\t"
	"lxvw4x		42, %[o32], %[x]	\n\t"
	"lxvw4x		43, %[o48], %[x]	\n\t"
	"lxvw4x		44, %[o64], %[x]	\n\t"
	"lxvw4x		45, %[o80], %[x]	\n\t"
	"lxvw4x		46, %[o96], %[x]	\n\t"
	"lxvw4x		47, %[o112], %[x]	\n\t"

	"addi		%[x], %[x], 128		\n\t"

	"addic.		%[n], %[n], -32		\n\t"
	"ble		2f			\n\t"

	".align 5				\n\t"
	"1:					\n\t"

	"stxvw4x	40, 0, %[y]		\n\t"
	"stxvw4x	41, %[o16], %[y]	\n\t"
	"lxvw4x		40, 0, %[x]		\n\t"
	"lxvw4x		41, %[o16], %[x]	\n\t"
	"stxvw4x	42, %[o32], %[y]	\n\t"
	"stxvw4x	43, %[o48], %[y]	\n\t"
	"lxvw4x		42, %[o32], %[x]	\n\t"
	"lxvw4x		43, %[o48], %[x]	\n\t"
	"stxvw4x	44, %[o64], %[y]	\n\t"
	"stxvw4x	45, %[o80], %[y]	\n\t"
	"lxvw4x		44, %[o64], %[x]	\n\t"
	"lxvw4x		45, %[o80], %[x]	\n\t"
	"stxvw4x	46, %[o96], %[y]	\n\t"
	"stxvw4x	47, %[o112], %[y]	\n\t"
	"lxvw4x		46, %[o96], %[x]	\n\t"
	"lxvw4x		47, %[o112], %[x]	\n\t"

	"addi		%[y], %[y], 128		\n\t"
	"addi		%[x], %[x], 128		\n\t"

	"addic.		%[n], %[n], -32		\n\t"
	"bgt		1b			\n\t"

	"2:					\n\t"

	/* flush the block that is still held in vs40..vs47 */
	"stxvw4x	40, 0, %[y]		\n\t"
	"stxvw4x	41, %[o16], %[y]	\n\t"
	"stxvw4x	42, %[o32], %[y]	\n\t"
	"stxvw4x	43, %[o48], %[y]	\n\t"
	"stxvw4x	44, %[o64], %[y]	\n\t"
	"stxvw4x	45, %[o80], %[y]	\n\t"
	"stxvw4x	46, %[o96], %[y]	\n\t"
	"stxvw4x	47, %[o112], %[y]	\n\t"

	:
	  [n]    "+r" (i),
	  [y]    "+b" (y1),
	  [x]    "+b" (x1)
	:
	  [o16]  "b" (o16),
	  [o32]  "b" (o32),
	  [o48]  "b" (o48),
	  [o64]  "b" (o64),
	  [o80]  "b" (o80),
	  [o96]  "b" (o96),
	  [o112] "b" (o112)
	: "cr0",
	  "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47",
	  "memory"
	);

}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,126 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/21 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(POWER8)
|
||||
#include "sdot_microk_power8.c"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_16
|
||||
|
||||
/*
 * Portable fallback: dot product of x and y, eight products per pass.
 * The eight products of a pass are added together left to right and that
 * sum is added to a local accumulator; the accumulator is added to *d at
 * the end (so *d must hold a defined value on entry).
 * The loop advances in steps of 8 until the position reaches n.
 */
static void sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
{
	FLOAT acc = 0.0;
	BLASLONG pos;

	for ( pos = 0; pos < n; pos += 8 )
	{
		FLOAT *xs = x + pos;
		FLOAT *ys = y + pos;

		acc += ys[0] * xs[0]
		     + ys[1] * xs[1]
		     + ys[2] * xs[2]
		     + ys[3] * xs[3]
		     + ys[4] * xs[4]
		     + ys[5] * xs[5]
		     + ys[6] * xs[6]
		     + ys[7] * xs[7] ;
	}

	*d += acc;

}
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * Dot product driver: returns the sum over n elements of x (stride inc_x)
 * times y (stride inc_y); 0.0 when n <= 0.
 *
 * When both strides are 1, the largest multiple of 32 not exceeding n is
 * handed to sdot_kernel_16 and the remainder is accumulated element by
 * element.  Otherwise the elements are processed two at a time, followed
 * by a single trailing element when n is odd.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	FLOAT acc = 0.0 ;
	BLASLONG k = 0;
	BLASLONG xpos = 0;
	BLASLONG ypos = 0;
	BLASLONG bulk;

	if ( n <= 0 ) return(acc);

	if ( (inc_x == 1) && (inc_y == 1) )
	{
		/* contiguous path */
		bulk = n & -32;

		if ( bulk != 0 )
			sdot_kernel_16(bulk, x, y , &acc );

		for ( k = bulk; k < n; k++ )
			acc += y[k] * x[k] ;

		return(acc);
	}

	/* strided path: pairs first, then the odd element if any */
	bulk = n & -2;

	for ( ; k < bulk; k += 2 )
	{
		acc += y[ypos] * x[xpos] + y[ypos+inc_y] * x[xpos+inc_x];
		xpos += inc_x*2 ;
		ypos += inc_y*2 ;
	}

	for ( ; k < n; k++ )
	{
		acc += y[ypos] * x[xpos] ;
		xpos += inc_x ;
		ypos += inc_y ;
	}

	return(acc);

}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,179 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/21 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_16 1
|
||||
static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));

/*
 * POWER8 VSX kernel for the unit-stride single-precision dot product.
 *
 * Stores sum(x[k] * y[k]), k = 0 .. n-1, into *dot (overwriting it).
 * n must be a positive multiple of 32: the first 32 elements are loaded
 * unconditionally and the counter is decremented by 32 per iteration.
 *
 * Eight vector accumulators (vs32-vs39) are kept in flight, reduced to one
 * vector at the end, spilled to tempdot[] and summed in C.
 *
 * Operand contract of the asm statement:
 *   - The element counter and both data pointers are advanced by the asm,
 *     so they are declared as read-write ("+") operands instead of inputs.
 *   - Every operand that appears in the RA slot of addi / dcbt / lxvw4x uses
 *     the "b" constraint, because r0 in that slot is read as the literal 0.
 *   - All VSX registers written by the asm (vs32-vs55) are listed as
 *     clobbers.  vs52-vs55 alias v20-v23, which are callee-saved, so the
 *     compiler must know about them to save and restore them.
 */
static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
{
	BLASLONG i = n;
	BLASLONG o16 = 16;
	BLASLONG o32 = 32;
	BLASLONG o48 = 48;
	BLASLONG o64 = 64;
	BLASLONG o80 = 80;
	BLASLONG o96 = 96;
	BLASLONG o112 = 112;
	FLOAT *x1 = x;
	FLOAT *y1 = y;
	BLASLONG pre = 384;	/* prefetch distance in bytes for dcbt */
	FLOAT tempdot[4];

	__asm__ __volatile__
	(
	/* clear the eight accumulators */
	"xxlxor		32, 32, 32		\n\t"
	"xxlxor		33, 33, 33		\n\t"
	"xxlxor		34, 34, 34		\n\t"
	"xxlxor		35, 35, 35		\n\t"
	"xxlxor		36, 36, 36		\n\t"
	"xxlxor		37, 37, 37		\n\t"
	"xxlxor		38, 38, 38		\n\t"
	"xxlxor		39, 39, 39		\n\t"

	"dcbt		%[xp], %[pre]		\n\t"
	"dcbt		%[yp], %[pre]		\n\t"

	/* preload the first 32 elements of x (vs40-47) and y (vs48-55) */
	"lxvw4x		40, 0, %[xp]		\n\t"
	"lxvw4x		48, 0, %[yp]		\n\t"
	"lxvw4x		41, %[o16], %[xp]	\n\t"
	"lxvw4x		49, %[o16], %[yp]	\n\t"
	"lxvw4x		42, %[o32], %[xp]	\n\t"
	"lxvw4x		50, %[o32], %[yp]	\n\t"
	"lxvw4x		43, %[o48], %[xp]	\n\t"
	"lxvw4x		51, %[o48], %[yp]	\n\t"
	"lxvw4x		44, %[o64], %[xp]	\n\t"
	"lxvw4x		52, %[o64], %[yp]	\n\t"
	"lxvw4x		45, %[o80], %[xp]	\n\t"
	"lxvw4x		53, %[o80], %[yp]	\n\t"
	"lxvw4x		46, %[o96], %[xp]	\n\t"
	"lxvw4x		54, %[o96], %[yp]	\n\t"
	"lxvw4x		47, %[o112], %[xp]	\n\t"
	"lxvw4x		55, %[o112], %[yp]	\n\t"

	"addi		%[xp], %[xp], 128	\n\t"
	"addi		%[yp], %[yp], 128	\n\t"

	"addic.		%[cnt], %[cnt], -32	\n\t"
	"ble		2f			\n\t"

	".align 5				\n\t"
	"1:					\n\t"

	"dcbt		%[xp], %[pre]		\n\t"
	"dcbt		%[yp], %[pre]		\n\t"

	/* multiply-add the previous block while loading the next one */
	"xvmaddasp	32, 40, 48		\n\t"
	"lxvw4x		40, 0, %[xp]		\n\t"
	"lxvw4x		48, 0, %[yp]		\n\t"
	"xvmaddasp	33, 41, 49		\n\t"
	"lxvw4x		41, %[o16], %[xp]	\n\t"
	"lxvw4x		49, %[o16], %[yp]	\n\t"
	"xvmaddasp	34, 42, 50		\n\t"
	"lxvw4x		42, %[o32], %[xp]	\n\t"
	"lxvw4x		50, %[o32], %[yp]	\n\t"
	"xvmaddasp	35, 43, 51		\n\t"
	"lxvw4x		43, %[o48], %[xp]	\n\t"
	"lxvw4x		51, %[o48], %[yp]	\n\t"
	"xvmaddasp	36, 44, 52		\n\t"
	"lxvw4x		44, %[o64], %[xp]	\n\t"
	"lxvw4x		52, %[o64], %[yp]	\n\t"
	"xvmaddasp	37, 45, 53		\n\t"
	"lxvw4x		45, %[o80], %[xp]	\n\t"
	"lxvw4x		53, %[o80], %[yp]	\n\t"
	"xvmaddasp	38, 46, 54		\n\t"
	"lxvw4x		46, %[o96], %[xp]	\n\t"
	"lxvw4x		54, %[o96], %[yp]	\n\t"
	"xvmaddasp	39, 47, 55		\n\t"

	"lxvw4x		47, %[o112], %[xp]	\n\t"
	"lxvw4x		55, %[o112], %[yp]	\n\t"

	"addi		%[xp], %[xp], 128	\n\t"
	"addi		%[yp], %[yp], 128	\n\t"

	"addic.		%[cnt], %[cnt], -32	\n\t"
	"bgt		1b			\n\t"

	"2:					\n\t"

	/* drain the last loaded block */
	"xvmaddasp	32, 40, 48		\n\t"
	"xvmaddasp	33, 41, 49		\n\t"
	"xvmaddasp	34, 42, 50		\n\t"
	"xvmaddasp	35, 43, 51		\n\t"
	"xvmaddasp	36, 44, 52		\n\t"
	"xvmaddasp	37, 45, 53		\n\t"
	"xvmaddasp	38, 46, 54		\n\t"
	"xvmaddasp	39, 47, 55		\n\t"

	/* tree-reduce eight accumulators into vs32 */
	"xvaddsp	32, 32 , 33		\n\t"
	"xvaddsp	34, 34 , 35		\n\t"
	"xvaddsp	36, 36 , 37		\n\t"
	"xvaddsp	38, 38 , 39		\n\t"

	"xvaddsp	32, 32 , 34		\n\t"
	"xvaddsp	36, 36 , 38		\n\t"

	"xvaddsp	32, 32 , 36		\n\t"

	"stxvw4x	32, 0 , %[res]		\n\t"

	:
	  [cnt]  "+r" (i),		// remaining element count
	  [xp]   "+b" (x1),		// running pointer into x
	  [yp]   "+b" (y1)		// running pointer into y
	:
	  [res]  "r" (tempdot),		// 16-byte spill area for the partial sums
	  [o16]  "b" (o16),
	  [o32]  "b" (o32),
	  [o48]  "b" (o48),
	  [o64]  "b" (o64),
	  [o80]  "b" (o80),
	  [o96]  "b" (o96),
	  [o112] "b" (o112),
	  [pre]  "r" (pre)
	: "cr0", "memory",
	  "vs32", "vs33", "vs34", "vs35", "vs36", "vs37", "vs38", "vs39",
	  "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47",
	  "vs48", "vs49", "vs50", "vs51", "vs52", "vs53", "vs54", "vs55"
	);

	*dot = tempdot[0] + tempdot[1] + tempdot[2] + tempdot[3];
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,371 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/04/02 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "def_vsx.h"
|
||||
|
||||
#ifndef __64BIT__
|
||||
#define LOAD lwz
|
||||
#else
|
||||
#define LOAD ld
|
||||
#endif
|
||||
|
||||
#ifdef __64BIT__
|
||||
#define STACKSIZE 32752
|
||||
#define ALPHA_SP 296(SP)
|
||||
#define FZERO 304(SP)
|
||||
#else
|
||||
#define STACKSIZE 240
|
||||
#define ALPHA_SP 224(SP)
|
||||
#define FZERO 232(SP)
|
||||
#endif
|
||||
|
||||
#define M r3
|
||||
#define N r4
|
||||
#define K r5
|
||||
|
||||
#ifdef linux
|
||||
#ifndef __64BIT__
|
||||
#define A r6
|
||||
#define B r7
|
||||
#define C r8
|
||||
#define LDC r9
|
||||
#define OFFSET r10
|
||||
#else
|
||||
#define A r7
|
||||
#define B r8
|
||||
#define C r9
|
||||
#define LDC r10
|
||||
#define OFFSET r6
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(_AIX) || defined(__APPLE__)
|
||||
#if !defined(__64BIT__) && defined(DOUBLE)
|
||||
#define A r8
|
||||
#define B r9
|
||||
#define C r10
|
||||
#define LDC r7
|
||||
#define OFFSET r6
|
||||
#else
|
||||
#define A r7
|
||||
#define B r8
|
||||
#define C r9
|
||||
#define LDC r10
|
||||
#define OFFSET r6
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define alpha_r vs30
|
||||
#define alpha_vr vs31
|
||||
|
||||
#define o0 0
|
||||
|
||||
#define FRAMEPOINTER r12
|
||||
|
||||
#define BBUFFER r14
|
||||
#define o4 r15
|
||||
#define o12 r16
|
||||
#define o8 r17
|
||||
#define L r18
|
||||
#define T1 r19
|
||||
#define KK r20
|
||||
#define BBO r21
|
||||
#define I r22
|
||||
#define J r23
|
||||
#define AO r24
|
||||
#define BO r25
|
||||
#define CO r26
|
||||
#define o16 r27
|
||||
#define o32 r28
|
||||
#define o48 r29
|
||||
|
||||
#define PRE r30
|
||||
#define T2 r31
|
||||
|
||||
#include "sgemm_macros_16x8_power8.S"
|
||||
|
||||
|
||||
#ifndef NEEDPARAM
|
||||
|
||||
// Kernel entry: allocate the local stack area and save the registers this
// kernel reuses as scratch (f14-f31 and r14-r31) so the epilogue at L999 can
// restore them.
PROLOGUE
|
||||
PROFCODE
|
||||
|
||||
// Keep the caller's stack pointer in FRAMEPOINTER (r12); stack-passed
// arguments are read relative to it further down.
mr FRAMEPOINTER, SP
|
||||
// Reserve 4 * STACKSIZE bytes in four steps.
// NOTE(review): presumably split because a single addi immediate cannot hold
// 4 * STACKSIZE (STACKSIZE is 32752 in the 64-bit build) -- confirm.
addi SP, SP, -STACKSIZE
|
||||
addi SP, SP, -STACKSIZE
|
||||
addi SP, SP, -STACKSIZE
|
||||
addi SP, SP, -STACKSIZE
|
||||
// r0 = 0; its only visible consumer is the commented-out FZERO store below.
li r0, 0
|
||||
|
||||
// Save f14-f31 at 0..136(SP).
stfd f14, 0(SP)
|
||||
stfd f15, 8(SP)
|
||||
stfd f16, 16(SP)
|
||||
stfd f17, 24(SP)
|
||||
|
||||
stfd f18, 32(SP)
|
||||
stfd f19, 40(SP)
|
||||
stfd f20, 48(SP)
|
||||
stfd f21, 56(SP)
|
||||
|
||||
stfd f22, 64(SP)
|
||||
stfd f23, 72(SP)
|
||||
stfd f24, 80(SP)
|
||||
stfd f25, 88(SP)
|
||||
|
||||
stfd f26, 96(SP)
|
||||
stfd f27, 104(SP)
|
||||
stfd f28, 112(SP)
|
||||
stfd f29, 120(SP)
|
||||
|
||||
stfd f30, 128(SP)
|
||||
stfd f31, 136(SP)
|
||||
|
||||
// Save r31..r14 starting at 144(SP): 8-byte slots in the 64-bit build,
// 4-byte slots otherwise.
#ifdef __64BIT__
|
||||
std r31, 144(SP)
|
||||
std r30, 152(SP)
|
||||
std r29, 160(SP)
|
||||
std r28, 168(SP)
|
||||
std r27, 176(SP)
|
||||
std r26, 184(SP)
|
||||
std r25, 192(SP)
|
||||
std r24, 200(SP)
|
||||
std r23, 208(SP)
|
||||
std r22, 216(SP)
|
||||
std r21, 224(SP)
|
||||
std r20, 232(SP)
|
||||
std r19, 240(SP)
|
||||
std r18, 248(SP)
|
||||
std r17, 256(SP)
|
||||
std r16, 264(SP)
|
||||
std r15, 272(SP)
|
||||
std r14, 280(SP)
|
||||
#else
|
||||
stw r31, 144(SP)
|
||||
stw r30, 148(SP)
|
||||
stw r29, 152(SP)
|
||||
stw r28, 156(SP)
|
||||
stw r27, 160(SP)
|
||||
stw r26, 164(SP)
|
||||
stw r25, 168(SP)
|
||||
stw r24, 172(SP)
|
||||
stw r23, 176(SP)
|
||||
stw r22, 180(SP)
|
||||
stw r21, 184(SP)
|
||||
stw r20, 188(SP)
|
||||
stw r19, 192(SP)
|
||||
stw r18, 196(SP)
|
||||
stw r17, 200(SP)
|
||||
stw r16, 204(SP)
|
||||
stw r15, 208(SP)
|
||||
stw r14, 212(SP)
|
||||
#endif
|
||||
|
||||
// Argument fix-up and constant setup executed once before the GEMM loops.
// stfd f1, ALPHA_SP
|
||||
// stw r0, FZERO
|
||||
|
||||
// On 32-bit AIX/Apple builds with DOUBLE defined, LDC is fetched from the
// caller's frame instead of arriving in a register.
#if defined(_AIX) || defined(__APPLE__)
|
||||
#if !defined(__64BIT__) && defined(DOUBLE)
|
||||
lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// LDC *= 4.
// NOTE(review): presumably converts the leading dimension of C from
// single-precision elements to bytes -- confirm against the macros file.
slwi LDC, LDC, 2
|
||||
|
||||
// TRMM builds additionally need OFFSET, which is loaded from the caller's
// frame; the slot depends on the ABI, word size and DOUBLE.
#if defined(TRMMKERNEL)
|
||||
#if defined(linux) && defined(__64BIT__)
|
||||
ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER)
|
||||
#endif
|
||||
|
||||
#if defined(_AIX) || defined(__APPLE__)
|
||||
#ifdef __64BIT__
|
||||
ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER)
|
||||
#else
|
||||
#ifdef DOUBLE
|
||||
lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
|
||||
#else
|
||||
lwz OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER)
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
// Nothing to compute when M, N or K is <= 0: branch to L999_H1.
// NOTE(review): L999_H1 is not defined in this file's visible part;
// presumably it lives in sgemm_logic_16x8_power8.S -- confirm.
cmpwi cr0, M, 0
|
||||
ble L999_H1
|
||||
cmpwi cr0, N, 0
|
||||
ble L999_H1
|
||||
cmpwi cr0, K, 0
|
||||
ble L999_H1
|
||||
|
||||
// Constants kept in registers: PRE = 256 and the byte offsets 4, 8, 12, 16,
// 32, 48 used as index registers.
li PRE, 256
|
||||
li o4 , 4
|
||||
li o8 , 8
|
||||
li o12, 12
|
||||
li o16, 16
|
||||
li o32, 32
|
||||
li o48, 48
|
||||
|
||||
// BBUFFER = (SP + 512 + 4096) & -4096, i.e. a 4096-byte aligned address
// inside the area reserved by the prologue.
addi BBUFFER, SP, 512+4096
|
||||
li T1, -4096
|
||||
and BBUFFER, BBUFFER, T1
|
||||
|
||||
// Write f1 as a single-precision value to four consecutive words at SP+300,
// then read it back as a scalar (alpha_r) and as a 4-wide vector (alpha_vr).
// NOTE(review): f1 is presumably the alpha argument -- confirm against the
// kernel's calling convention.
addi T1, SP, 300
|
||||
stxsspx f1, o0 , T1
|
||||
stxsspx f1, o4 , T1
|
||||
stxsspx f1, o8 , T1
|
||||
stxsspx f1, o12 , T1
|
||||
|
||||
lxsspx alpha_r, o0, T1
|
||||
lxvw4x alpha_vr, o0, T1
|
||||
|
||||
|
||||
|
||||
#include "sgemm_logic_16x8_power8.S"
|
||||
|
||||
// Common exit: set the return value to 0, restore every register saved by
// the prologue from the same stack slots, release the stack area and return.
L999:
|
||||
// r3 = 0 (return value)
addi r3, 0, 0
|
||||
|
||||
// Restore f14-f31 from 0..136(SP).
lfd f14, 0(SP)
|
||||
lfd f15, 8(SP)
|
||||
lfd f16, 16(SP)
|
||||
lfd f17, 24(SP)
|
||||
|
||||
lfd f18, 32(SP)
|
||||
lfd f19, 40(SP)
|
||||
lfd f20, 48(SP)
|
||||
lfd f21, 56(SP)
|
||||
|
||||
lfd f22, 64(SP)
|
||||
lfd f23, 72(SP)
|
||||
lfd f24, 80(SP)
|
||||
lfd f25, 88(SP)
|
||||
|
||||
lfd f26, 96(SP)
|
||||
lfd f27, 104(SP)
|
||||
lfd f28, 112(SP)
|
||||
lfd f29, 120(SP)
|
||||
|
||||
lfd f30, 128(SP)
|
||||
lfd f31, 136(SP)
|
||||
|
||||
// Restore r31..r14; slot sizes mirror the prologue (8 bytes in the 64-bit
// build, 4 bytes otherwise).
#ifdef __64BIT__
|
||||
ld r31, 144(SP)
|
||||
ld r30, 152(SP)
|
||||
ld r29, 160(SP)
|
||||
ld r28, 168(SP)
|
||||
ld r27, 176(SP)
|
||||
ld r26, 184(SP)
|
||||
ld r25, 192(SP)
|
||||
ld r24, 200(SP)
|
||||
ld r23, 208(SP)
|
||||
ld r22, 216(SP)
|
||||
ld r21, 224(SP)
|
||||
ld r20, 232(SP)
|
||||
ld r19, 240(SP)
|
||||
ld r18, 248(SP)
|
||||
ld r17, 256(SP)
|
||||
ld r16, 264(SP)
|
||||
ld r15, 272(SP)
|
||||
ld r14, 280(SP)
|
||||
#else
|
||||
lwz r31, 144(SP)
|
||||
lwz r30, 148(SP)
|
||||
lwz r29, 152(SP)
|
||||
lwz r28, 156(SP)
|
||||
lwz r27, 160(SP)
|
||||
lwz r26, 164(SP)
|
||||
lwz r25, 168(SP)
|
||||
lwz r24, 172(SP)
|
||||
lwz r23, 176(SP)
|
||||
lwz r22, 180(SP)
|
||||
lwz r21, 184(SP)
|
||||
lwz r20, 188(SP)
|
||||
lwz r19, 192(SP)
|
||||
lwz r18, 196(SP)
|
||||
lwz r17, 200(SP)
|
||||
lwz r16, 204(SP)
|
||||
lwz r15, 208(SP)
|
||||
lwz r14, 212(SP)
|
||||
#endif
|
||||
|
||||
// Undo the four-step stack allocation made in the prologue.
addi SP, SP, STACKSIZE
|
||||
addi SP, SP, STACKSIZE
|
||||
addi SP, SP, STACKSIZE
|
||||
addi SP, SP, STACKSIZE
|
||||
|
||||
blr
|
||||
|
||||
EPILOGUE
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,167 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/26 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#pragma GCC optimize "O1"
|
||||
|
||||
#if defined(POWER8)
|
||||
#include "srot_microk_power8.c"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_16
|
||||
|
||||
/*
 * Portable C fallback for the unit-stride plane-rotation kernel.
 *
 * For k = 0 .. n-1 replaces (x[k], y[k]) by
 *     ( c*x[k] + s*y[k],  c*y[k] - s*x[k] )
 * using the scalars *c and *s.  Four elements are processed per pass, so n
 * is expected to be a multiple of 4.
 */
static void srot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{
	const FLOAT cosv = *c;
	const FLOAT sinv = *s;
	FLOAT *xp = x;
	FLOAT *yp = y;
	BLASLONG done;

	for (done = 0; done < n; done += 4)
	{
		FLOAT xin[4];
		FLOAT yin[4];
		int k;

		/* Read the whole group before writing anything back. */
		for (k = 0; k < 4; k++)
		{
			xin[k] = xp[k];
			yin[k] = yp[k];
		}

		for (k = 0; k < 4; k++)
		{
			xp[k] = cosv * xin[k] + sinv * yin[k];
			yp[k] = cosv * yin[k] - sinv * xin[k];
		}

		xp += 4;
		yp += 4;
	}
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/*
 * Applies a plane rotation to the vectors x and y in place:
 *     x[k] <- c*x[k] + s*y[k]
 *     y[k] <- c*y[k] - s*x[k]
 * for n elements with strides inc_x / inc_y.  Always returns 0; n <= 0 is a
 * no-op.
 *
 * With unit strides the largest multiple-of-16 prefix is handed to
 * srot_kernel_16(), which receives c and s as 16-byte aligned arrays holding
 * four copies of each scalar; the remainder is rotated by a scalar loop.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
	FLOAT cvec[4] __attribute__ ((aligned (16)));
	FLOAT svec[4] __attribute__ ((aligned (16)));
	BLASLONG i = 0;
	FLOAT rotated;

	if (n <= 0)
		return 0;

	if (inc_x != 1 || inc_y != 1)
	{
		/* general strides: one element per iteration */
		BLASLONG ix = 0;
		BLASLONG iy = 0;

		for (; i < n; i++)
		{
			rotated = c * x[ix] + s * y[iy];
			y[iy]   = c * y[iy] - s * x[ix];
			x[ix]   = rotated;

			ix += inc_x;
			iy += inc_y;
		}
	}
	else
	{
		/* n rounded down to a multiple of 16 */
		BLASLONG bulk = n & -16;

		if (bulk > 0)
		{
			int k;

			for (k = 0; k < 4; k++)
			{
				cvec[k] = c;
				svec[k] = s;
			}

			srot_kernel_16(bulk, x, y, cvec, svec);
			i = bulk;
		}

		for (; i < n; i++)
		{
			rotated = c * x[i] + s * y[i];
			y[i]    = c * y[i] - s * x[i];
			x[i]    = rotated;
		}
	}

	return 0;
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,208 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/27 Werner Saar (wernsaar@googlemail.com)
|
||||
*
|
||||
* I don't use fused multiply-add ( precision problems with lapack )
|
||||
*
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_16 1
|
||||
|
||||
static void srot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) __attribute__ ((noinline));

/*
 * POWER8 VSX kernel for the unit-stride plane rotation.
 *
 * For k = 0 .. n-1 replaces (x[k], y[k]) by
 *     ( c*x[k] + s*y[k],  c*y[k] - s*x[k] ).
 * n must be a positive multiple of 16: the first 16 elements are loaded
 * unconditionally and the counter is decremented by 16 per iteration.
 * c and s must each point at four consecutive floats (the scalar replicated
 * four times), because they are read with a 16-byte vector load.
 *
 * Separate multiplies and adds are used on purpose (no fused multiply-add).
 *
 * Loads run one block ahead of stores, so the kernel keeps a load pointer
 * and a store pointer per vector.
 *
 * Operand contract of the asm statement:
 *   - The counter and all four running pointers are advanced by the asm and
 *     are therefore read-write ("+") operands.  As outputs they are
 *     guaranteed distinct registers, so the store pointers can start at x
 *     and y directly.
 *   - Operands used in the RA slot of addi / lxvw4x / stxvw4x use "b",
 *     because r0 in that slot is read as the literal 0.
 *   - Every VSX register written is listed as a clobber.  vs52-vs63 alias
 *     v20-v31, which are callee-saved, so the compiler must save/restore
 *     them around this function.
 */
static void srot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{
	BLASLONG i = n;
	BLASLONG o16 = 16;
	BLASLONG o32 = 32;
	BLASLONG o48 = 48;
	FLOAT *x1 = x;		/* load pointer for x  */
	FLOAT *y1 = y;		/* load pointer for y  */
	FLOAT *x2 = x;		/* store pointer for x */
	FLOAT *y2 = y;		/* store pointer for y */

	__asm__ __volatile__
	(
	"lxvw4x		36 , 0, %[c]		\n\t"	// load c
	"lxvw4x		37 , 0, %[s]		\n\t"	// load s

	"lxvw4x		32, 0, %[xl]		\n\t"	// load x
	"lxvw4x		33, %[o16], %[xl]	\n\t"
	"lxvw4x		34, %[o32], %[xl]	\n\t"
	"lxvw4x		35, %[o48], %[xl]	\n\t"

	"lxvw4x		40, 0, %[yl]		\n\t"	// load y
	"lxvw4x		41, %[o16], %[yl]	\n\t"
	"lxvw4x		42, %[o32], %[yl]	\n\t"
	"lxvw4x		43, %[o48], %[yl]	\n\t"

	"addi		%[xl], %[xl], 64	\n\t"
	"addi		%[yl], %[yl], 64	\n\t"

	"addic.		%[cnt], %[cnt], -16	\n\t"
	"ble		2f			\n\t"

	".align 5				\n\t"
	"1:					\n\t"

	"xvmulsp	48, 32, 36		\n\t"	// c * x
	"xvmulsp	49, 33, 36		\n\t"
	"xvmulsp	50, 34, 36		\n\t"
	"xvmulsp	51, 35, 36		\n\t"

	"xvmulsp	56, 40, 36		\n\t"	// c * y
	"xvmulsp	57, 41, 36		\n\t"
	"xvmulsp	58, 42, 36		\n\t"
	"xvmulsp	59, 43, 36		\n\t"

	"xvmulsp	52, 32, 37		\n\t"	// s * x
	"xvmulsp	53, 33, 37		\n\t"

	"lxvw4x		32, 0, %[xl]		\n\t"	// load x
	"lxvw4x		33, %[o16], %[xl]	\n\t"

	"xvmulsp	54, 34, 37		\n\t"
	"xvmulsp	55, 35, 37		\n\t"

	"lxvw4x		34, %[o32], %[xl]	\n\t"
	"lxvw4x		35, %[o48], %[xl]	\n\t"

	"xvmulsp	60, 40, 37		\n\t"	// s * y
	"xvmulsp	61, 41, 37		\n\t"

	"lxvw4x		40, 0, %[yl]		\n\t"	// load y
	"lxvw4x		41, %[o16], %[yl]	\n\t"

	"xvmulsp	62, 42, 37		\n\t"
	"xvmulsp	63, 43, 37		\n\t"

	"lxvw4x		42, %[o32], %[yl]	\n\t"
	"lxvw4x		43, %[o48], %[yl]	\n\t"

	"xvaddsp	48, 48 , 60		\n\t"	// c * x + s * y
	"xvaddsp	49, 49 , 61		\n\t"	// c * x + s * y

	"addi		%[xl], %[xl], 64	\n\t"
	"addi		%[yl], %[yl], 64	\n\t"

	"xvaddsp	50, 50 , 62		\n\t"	// c * x + s * y
	"xvaddsp	51, 51 , 63		\n\t"	// c * x + s * y

	"xvsubsp	56, 56 , 52		\n\t"	// c * y - s * x
	"xvsubsp	57, 57 , 53		\n\t"	// c * y - s * x
	"xvsubsp	58, 58 , 54		\n\t"	// c * y - s * x
	"xvsubsp	59, 59 , 55		\n\t"	// c * y - s * x

	"stxvw4x	48, 0, %[xs]		\n\t"	// store x
	"stxvw4x	49, %[o16], %[xs]	\n\t"
	"stxvw4x	50, %[o32], %[xs]	\n\t"
	"stxvw4x	51, %[o48], %[xs]	\n\t"

	"stxvw4x	56, 0, %[ys]		\n\t"	// store y
	"stxvw4x	57, %[o16], %[ys]	\n\t"
	"stxvw4x	58, %[o32], %[ys]	\n\t"
	"stxvw4x	59, %[o48], %[ys]	\n\t"

	"addi		%[xs], %[xs], 64	\n\t"
	"addi		%[ys], %[ys], 64	\n\t"

	"addic.		%[cnt], %[cnt], -16	\n\t"
	"bgt		1b			\n\t"

	"2:					\n\t"

	/* rotate and store the last loaded block */
	"xvmulsp	48, 32, 36		\n\t"	// c * x
	"xvmulsp	49, 33, 36		\n\t"
	"xvmulsp	50, 34, 36		\n\t"
	"xvmulsp	51, 35, 36		\n\t"

	"xvmulsp	56, 40, 36		\n\t"	// c * y
	"xvmulsp	57, 41, 36		\n\t"
	"xvmulsp	58, 42, 36		\n\t"
	"xvmulsp	59, 43, 36		\n\t"

	"xvmulsp	52, 32, 37		\n\t"	// s * x
	"xvmulsp	53, 33, 37		\n\t"
	"xvmulsp	54, 34, 37		\n\t"
	"xvmulsp	55, 35, 37		\n\t"

	"xvmulsp	60, 40, 37		\n\t"	// s * y
	"xvmulsp	61, 41, 37		\n\t"
	"xvmulsp	62, 42, 37		\n\t"
	"xvmulsp	63, 43, 37		\n\t"

	"xvaddsp	48, 48 , 60		\n\t"	// c * x + s * y
	"xvaddsp	49, 49 , 61		\n\t"	// c * x + s * y
	"xvaddsp	50, 50 , 62		\n\t"	// c * x + s * y
	"xvaddsp	51, 51 , 63		\n\t"	// c * x + s * y

	"xvsubsp	56, 56 , 52		\n\t"	// c * y - s * x
	"xvsubsp	57, 57 , 53		\n\t"	// c * y - s * x
	"xvsubsp	58, 58 , 54		\n\t"	// c * y - s * x
	"xvsubsp	59, 59 , 55		\n\t"	// c * y - s * x

	"stxvw4x	48, 0, %[xs]		\n\t"	// store x
	"stxvw4x	49, %[o16], %[xs]	\n\t"
	"stxvw4x	50, %[o32], %[xs]	\n\t"
	"stxvw4x	51, %[o48], %[xs]	\n\t"

	"stxvw4x	56, 0, %[ys]		\n\t"	// store y
	"stxvw4x	57, %[o16], %[ys]	\n\t"
	"stxvw4x	58, %[o32], %[ys]	\n\t"
	"stxvw4x	59, %[o48], %[ys]	\n\t"

	:
	  [cnt] "+r" (i),	// remaining element count
	  [xl]  "+b" (x1),	// x load pointer
	  [yl]  "+b" (y1),	// y load pointer
	  [xs]  "+b" (x2),	// x store pointer
	  [ys]  "+b" (y2)	// y store pointer
	:
	  [c]   "r" (c),
	  [s]   "r" (s),
	  [o16] "b" (o16),
	  [o32] "b" (o32),
	  [o48] "b" (o48)
	: "cr0", "memory",
	  "vs32", "vs33", "vs34", "vs35", "vs36", "vs37",
	  "vs40", "vs41", "vs42", "vs43",
	  "vs48", "vs49", "vs50", "vs51", "vs52", "vs53", "vs54", "vs55",
	  "vs56", "vs57", "vs58", "vs59", "vs60", "vs61", "vs62", "vs63"
	);
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,179 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/27 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(POWER8)
|
||||
#include "sscal_microk_power8.c"
|
||||
#endif
|
||||
|
||||
|
||||
#if !defined(HAVE_KERNEL_16)
|
||||
|
||||
/*
 * Portable C fallback: multiplies x[0 .. n-1] in place by the scalar *da.
 * Eight elements are scaled per pass, so n is expected to be a multiple
 * of 8.
 */
static void sscal_kernel_16( BLASLONG n, FLOAT *da , FLOAT *x )
{
	const FLOAT factor = *da;
	BLASLONG remaining;
	int k;

	for (remaining = n; remaining > 0; remaining -= 8)
	{
		for (k = 0; k < 8; k++)
			x[k] *= factor;

		x += 8;
	}
}
|
||||
|
||||
/*
 * Portable C fallback: sets x[0 .. n-1] to zero.  Eight elements are written
 * per pass, so n is expected to be a multiple of 8.  The da argument is not
 * read; it is accepted only so the signature matches sscal_kernel_16().
 */
static void sscal_kernel_16_zero( BLASLONG n, FLOAT *da , FLOAT *x )
{
	const FLOAT zero = 0.0;
	BLASLONG remaining;
	int k;

	for (remaining = n; remaining > 0; remaining -= 8)
	{
		for (k = 0; k < 8; k++)
			x[k] = zero;

		x += 8;
	}
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * SSCAL driver: x[k * inc_x] = da * x[k * inc_x] for k = 0 .. n-1.
 *
 * - Returns immediately (0) when n <= 0 or inc_x <= 0.
 * - When da compares equal to 0.0 the elements are set to 0.0 directly
 *   instead of being multiplied.
 * - For unit stride, the largest multiple of 32 elements is handed to
 *   sscal_kernel_16 / sscal_kernel_16_zero and the remainder is handled by
 *   a scalar loop.
 *
 * y, inc_y, dummy, dummy0, dummy1 and dummy2 are not used.
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG pos = 0, count = 0;
	BLASLONG bulk;
	FLOAT alpha[4] __attribute__ ((aligned (16)));

	if ( n <= 0 || inc_x <= 0 )
		return(0);

	if ( inc_x != 1 )
	{
		/* Strided vector: plain scalar loops, no kernel. */
		if ( da == 0.0 )
		{
			for ( ; count < n; count++, pos += inc_x )
				x[pos] = 0.0;
		}
		else
		{
			for ( ; count < n; count++, pos += inc_x )
				x[pos] = da * x[pos];
		}
		return(0);
	}

	/* Contiguous vector: kernel for the leading multiple of 32 elements. */
	bulk = n & -32;

	if ( da == 0.0 )
	{
		if ( bulk > 0 )
		{
			alpha[0] = alpha[1] = alpha[2] = alpha[3] = da;
			sscal_kernel_16_zero(bulk, alpha, x);
			count = bulk;
		}

		for ( ; count < n; count++ )
			x[count] = 0.0;
	}
	else
	{
		if ( bulk > 0 )
		{
			alpha[0] = alpha[1] = alpha[2] = alpha[3] = da;
			sscal_kernel_16(bulk, alpha, x);
			count = bulk;
		}

		for ( ; count < n; count++ )
			x[count] = da * x[count];
	}

	return(0);
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,218 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/27 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_16 1
|
||||
|
||||
static void sscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline));

/*
 * POWER8 VSX kernel: x[0..n-1] *= alpha, in place.
 *
 *   n     - element count; one pass handles 32 elements (8 vectors of 16
 *           bytes) and the counter is decremented by 32, so n must be a
 *           positive multiple of 32 (the driver passes n & -32, > 0).
 *   alpha - pointer to four consecutive copies of the scale factor; they are
 *           loaded as one vector into vs32.
 *   x     - vector to scale.
 *
 * The loop is software pipelined: the next 32 elements are loaded through
 * the read pointer (%2) while the previous 32 are multiplied and stored
 * through the write pointer (%1), which trails by 128 bytes.
 *
 * Operand constraints:
 *   - the counter and both pointers are modified by the asm, so they are
 *     read-write ("+") operands rather than inputs;
 *   - every operand that appears in the RA slot of addi / dcbt / lxvw4x /
 *     stxvw4x uses "b" so that r0 (which those forms read as literal 0)
 *     is never allocated for it;
 *   - all VSX registers written by the template are listed as clobbers.
 */
static void sscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
{

	BLASLONG i = n;
	BLASLONG o16 = 16;
	BLASLONG o32 = 32;
	BLASLONG o48 = 48;
	BLASLONG o64 = 64;
	BLASLONG o80 = 80;
	BLASLONG o96 = 96;
	BLASLONG o112 = 112;
	FLOAT *x1=x;		/* read pointer  */
	FLOAT *x2=x;		/* write pointer */
	BLASLONG pre = 384;	/* prefetch distance in bytes */

	__asm__  __volatile__
	(

	"lxvw4x		32, 0, %3		    \n\t"

	"dcbt		%2, %4			    \n\t"

	"lxvw4x		40, 0, %2		    \n\t"
	"lxvw4x		41, %5, %2		    \n\t"
	"lxvw4x		42, %6, %2		    \n\t"
	"lxvw4x		43, %7, %2		    \n\t"
	"lxvw4x		44, %8, %2		    \n\t"
	"lxvw4x		45, %9, %2		    \n\t"
	"lxvw4x		46, %10, %2		    \n\t"
	"lxvw4x		47, %11, %2		    \n\t"

	"addi		%2, %2, 128		    \n\t"

	"addic.		%0 , %0	, -32  	 	    \n\t"
	"ble		2f		             	    \n\t"

	".align 5				            \n\t"
	"1:				                    \n\t"

	"dcbt		%2, %4			    \n\t"

	"xvmulsp	48, 40, 32		    \n\t"
	"xvmulsp	49, 41, 32		    \n\t"
	"lxvw4x		40, 0, %2		    \n\t"
	"lxvw4x		41, %5, %2		    \n\t"
	"xvmulsp	50, 42, 32		    \n\t"
	"xvmulsp	51, 43, 32		    \n\t"
	"lxvw4x		42, %6, %2		    \n\t"
	"lxvw4x		43, %7, %2		    \n\t"
	"xvmulsp	52, 44, 32		    \n\t"
	"xvmulsp	53, 45, 32		    \n\t"
	"lxvw4x		44, %8, %2		    \n\t"
	"lxvw4x		45, %9, %2		    \n\t"
	"xvmulsp	54, 46, 32		    \n\t"
	"xvmulsp	55, 47, 32		    \n\t"
	"lxvw4x		46, %10, %2		    \n\t"
	"lxvw4x		47, %11, %2		    \n\t"

	"stxvw4x	48, 0, %1		    \n\t"
	"stxvw4x	49, %5, %1		    \n\t"
	"stxvw4x	50, %6, %1		    \n\t"
	"stxvw4x	51, %7, %1		    \n\t"
	"stxvw4x	52, %8, %1		    \n\t"
	"stxvw4x	53, %9, %1		    \n\t"
	"stxvw4x	54, %10, %1		    \n\t"
	"stxvw4x	55, %11, %1		    \n\t"

	"addi		%1, %1, 128		    \n\t"
	"addi		%2, %2, 128		    \n\t"

	"addic.		%0 , %0	, -32  	 	    \n\t"
	"bgt		1b		             	    \n\t"

	"2:					    \n\t"

	"xvmulsp	48, 40, 32		    \n\t"
	"xvmulsp	49, 41, 32		    \n\t"
	"xvmulsp	50, 42, 32		    \n\t"
	"xvmulsp	51, 43, 32		    \n\t"
	"xvmulsp	52, 44, 32		    \n\t"
	"xvmulsp	53, 45, 32		    \n\t"
	"xvmulsp	54, 46, 32		    \n\t"
	"xvmulsp	55, 47, 32		    \n\t"

	"stxvw4x	48, 0, %1		    \n\t"
	"stxvw4x	49, %5, %1		    \n\t"
	"stxvw4x	50, %6, %1		    \n\t"
	"stxvw4x	51, %7, %1		    \n\t"
	"stxvw4x	52, %8, %1		    \n\t"
	"stxvw4x	53, %9, %1		    \n\t"
	"stxvw4x	54, %10, %1		    \n\t"
	"stxvw4x	55, %11, %1		    \n\t"

	:
	  "+r" (i),	// 0
	  "+b" (x2),	// 1
	  "+b" (x1)	// 2
	:
	  "r" (alpha),	// 3
	  "r" (pre),	// 4
	  "b" (o16),	// 5
	  "b" (o32),	// 6
	  "b" (o48),	// 7
	  "b" (o64),	// 8
	  "b" (o80),	// 9
	  "b" (o96),	// 10
	  "b" (o112)	// 11
	: "cr0", "memory",
	  "vs32",
	  "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47",
	  "vs48", "vs49", "vs50", "vs51", "vs52", "vs53", "vs54", "vs55"
	);

}
|
||||
|
||||
|
||||
static void sscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline));

/*
 * POWER8 VSX kernel: x[0..n-1] = 0.0.
 *
 *   n     - element count; each pass stores 8 vectors of 16 bytes
 *           (32 elements) and decrements the counter by 32.  The loop body
 *           runs before the counter is tested, so n must be a positive
 *           multiple of 32 (the driver passes n & -32, > 0).
 *   alpha - not read; kept so the signature matches sscal_kernel_16.
 *   x     - vector to clear.
 *
 * Operand constraints: the counter and the write pointer are modified by
 * the asm and are therefore read-write operands; the pointer and the byte
 * offsets sit in the RA slot of addi / stxvw4x and use "b" so that r0
 * (read as literal 0 there) is never chosen; vs32 is declared clobbered.
 */
static void sscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
{

	BLASLONG i = n;
	BLASLONG o16 = 16;
	BLASLONG o32 = 32;
	BLASLONG o48 = 48;
	BLASLONG o64 = 64;
	BLASLONG o80 = 80;
	BLASLONG o96 = 96;
	BLASLONG o112 = 112;
	FLOAT *x1=x;		/* write pointer */

	__asm__  __volatile__
	(

	"xxlxor		32 , 32 , 32		    \n\t"

	".align 5				            \n\t"
	"1:				                    \n\t"

	"stxvw4x	32, 0, %1		    \n\t"
	"stxvw4x	32, %2, %1		    \n\t"
	"stxvw4x	32, %3, %1		    \n\t"
	"stxvw4x	32, %4, %1		    \n\t"
	"stxvw4x	32, %5, %1		    \n\t"
	"stxvw4x	32, %6, %1		    \n\t"
	"stxvw4x	32, %7, %1		    \n\t"
	"stxvw4x	32, %8, %1		    \n\t"

	"addi		%1, %1, 128		    \n\t"

	"addic.		%0 , %0	, -32  	 	    \n\t"
	"bgt		1b		             	    \n\t"

	:
	  "+r" (i),	// 0
	  "+b" (x1)	// 1
	:
	  "b" (o16),	// 2
	  "b" (o32),	// 3
	  "b" (o48),	// 4
	  "b" (o64),	// 5
	  "b" (o80),	// 6
	  "b" (o96),	// 7
	  "b" (o112)	// 8
	: "cr0", "memory", "vs32"
	);

}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,154 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(POWER8)
|
||||
#include "sswap_microk_power8.c"
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_32
|
||||
|
||||
/*
 * Portable fallback kernel: exchange the contents of x and y.
 *
 * Elements are handled in groups of eight.  Within a group all eight x
 * values are read first, then all eight y values, then y is written and
 * finally x is written.  The loop touches elements up to the next multiple
 * of eight at or above n.
 */
static void sswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
	FLOAT from_x[8];
	FLOAT from_y[8];
	FLOAT *px = x;
	FLOAT *py = y;
	BLASLONG done, k;

	for ( done = 0; done < n; done += 8, px += 8, py += 8 )
	{
		for ( k = 0; k < 8; k++ )
			from_x[k] = px[k];

		for ( k = 0; k < 8; k++ )
			from_y[k] = py[k];

		for ( k = 0; k < 8; k++ )
			py[k] = from_x[k];

		for ( k = 0; k < 8; k++ )
			px[k] = from_y[k];
	}
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/*
 * SSWAP driver: exchange x[k * inc_x] and y[k * inc_y] for k = 0 .. n-1.
 *
 * Returns immediately (0) when n <= 0.  When both strides are 1 the largest
 * multiple of 32 elements is handed to sswap_kernel_32 and the remainder is
 * swapped element by element; any other stride combination uses a scalar
 * loop for all elements.
 *
 * dummy, dummy0 .. dummy3 are not used.  Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG k = 0;
	BLASLONG px = 0, py = 0;
	BLASLONG bulk;
	FLOAT held;

	if ( n <= 0 )
		return(0);

	if ( inc_x != 1 || inc_y != 1 )
	{
		/* General strides: one element at a time. */
		for ( ; k < n; k++, px += inc_x, py += inc_y )
		{
			held  = y[py];
			y[py] = x[px];
			x[px] = held;
		}
		return(0);
	}

	/* Both vectors contiguous: kernel first, scalar loop for the tail. */
	bulk = n & -32;
	if ( bulk > 0 )
	{
		sswap_kernel_32(bulk, x, y);
		k = bulk;
	}

	for ( ; k < n; k++ )
	{
		held = y[k];
		y[k] = x[k];
		x[k] = held;
	}

	return(0);
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,136 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_32 1
|
||||
|
||||
static void sswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));

/*
 * POWER8 VSX kernel: exchange x[0..n-1] and y[0..n-1].
 *
 *   n - element count; each pass moves 8 vectors of 16 bytes (32 elements)
 *       in each direction and decrements the counter by 32.  The loop body
 *       runs before the counter is tested, so n must be a positive multiple
 *       of 32 (the driver passes n & -32, > 0).
 *
 * Per pass: 128 bytes of x are loaded into vs32-vs39, 128 bytes of y into
 * vs48-vs55, then vs32-vs39 are stored to y and vs48-vs55 to x.  Separate
 * read and write pointers are kept for each vector.
 *
 * Operand constraints: the counter and all four pointers are modified by
 * the asm and are therefore read-write operands; pointers and byte offsets
 * sit in the RA slot of addi / lxvw4x / stxvw4x and use "b" so that r0
 * (read as literal 0 there) is never chosen; every VSX register written is
 * declared clobbered.
 */
static void sswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
{

	BLASLONG i = n;
	BLASLONG o16 = 16;
	BLASLONG o32 = 32;
	BLASLONG o48 = 48;
	BLASLONG o64 = 64;
	BLASLONG o80 = 80;
	BLASLONG o96 = 96;
	BLASLONG o112 = 112;
	FLOAT *x1=x;		/* read pointer for x  */
	FLOAT *y1=y;		/* read pointer for y  */
	FLOAT *x2=x;		/* write pointer for x */
	FLOAT *y2=y;		/* write pointer for y */

	__asm__  __volatile__
	(

	".align 5				            \n\t"
	"1:				                    \n\t"

	"lxvw4x		32, 0, %2		    \n\t"
	"lxvw4x		33, %5, %2		    \n\t"
	"lxvw4x		34, %6, %2		    \n\t"
	"lxvw4x		35, %7, %2		    \n\t"
	"lxvw4x		36, %8, %2		    \n\t"
	"lxvw4x		37, %9, %2		    \n\t"
	"lxvw4x		38, %10, %2		    \n\t"
	"lxvw4x		39, %11, %2		    \n\t"

	"addi		%2, %2, 128		    \n\t"

	"lxvw4x		48, 0, %1		    \n\t"
	"lxvw4x		49, %5, %1		    \n\t"
	"lxvw4x		50, %6, %1		    \n\t"
	"lxvw4x		51, %7, %1		    \n\t"
	"lxvw4x		52, %8, %1		    \n\t"
	"lxvw4x		53, %9, %1		    \n\t"
	"lxvw4x		54, %10, %1		    \n\t"
	"lxvw4x		55, %11, %1		    \n\t"

	"addi		%1, %1, 128		    \n\t"

	"stxvw4x	32, 0, %3		    \n\t"
	"stxvw4x	33, %5, %3		    \n\t"
	"stxvw4x	34, %6, %3		    \n\t"
	"stxvw4x	35, %7, %3		    \n\t"
	"stxvw4x	36, %8, %3		    \n\t"
	"stxvw4x	37, %9, %3		    \n\t"
	"stxvw4x	38, %10, %3		    \n\t"
	"stxvw4x	39, %11, %3		    \n\t"

	"addi		%3, %3, 128		    \n\t"

	"stxvw4x	48, 0, %4		    \n\t"
	"stxvw4x	49, %5, %4		    \n\t"
	"stxvw4x	50, %6, %4		    \n\t"
	"stxvw4x	51, %7, %4		    \n\t"
	"stxvw4x	52, %8, %4		    \n\t"
	"stxvw4x	53, %9, %4		    \n\t"
	"stxvw4x	54, %10, %4		    \n\t"
	"stxvw4x	55, %11, %4		    \n\t"

	"addi		%4, %4, 128		    \n\t"

	"addic.		%0 , %0	, -32  	 	    \n\t"
	"bgt		1b		             	    \n\t"

	:
	  "+r" (i),	// 0
	  "+b" (y1),	// 1
	  "+b" (x1),	// 2
	  "+b" (y2),	// 3
	  "+b" (x2)	// 4
	:
	  "b" (o16),	// 5
	  "b" (o32),	// 6
	  "b" (o48),	// 7
	  "b" (o64),	// 8
	  "b" (o80),	// 9
	  "b" (o96),	// 10
	  "b" (o112)	// 11
	: "cr0", "memory",
	  "vs32", "vs33", "vs34", "vs35", "vs36", "vs37", "vs38", "vs39",
	  "vs48", "vs49", "vs50", "vs51", "vs52", "vs53", "vs54", "vs55"
	);

}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,369 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/04/02 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
/* ASSEMBLER is defined before common.h is included; presumably it makes the
   shared header expose only assembler-safe definitions - TODO confirm. */
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "def_vsx.h"
|
||||
|
||||
/* LOAD: pointer-sized load mnemonic (lwz for 32-bit, ld for 64-bit). */
#ifndef __64BIT__
|
||||
#define LOAD lwz
|
||||
#else
|
||||
#define LOAD ld
|
||||
#endif
|
||||
|
||||
/* Stack frame size and two named slots inside it.  ALPHA_SP and FZERO are
   only referenced by commented-out stores in the code visible below. */
#ifdef __64BIT__
|
||||
#define STACKSIZE 340
|
||||
#define ALPHA_SP 296(SP)
|
||||
#define FZERO 304(SP)
|
||||
#else
|
||||
#define STACKSIZE 240
|
||||
#define ALPHA_SP 224(SP)
|
||||
#define FZERO 232(SP)
|
||||
#endif
|
||||
|
||||
/* Register names of the incoming arguments.  M, N and K are fixed; the
   registers of A, B, C, LDC and OFFSET depend on OS, word size and DOUBLE. */
#define M r3
|
||||
#define N r4
|
||||
#define K r5
|
||||
|
||||
/* Linux argument registers. */
#ifdef linux
|
||||
#ifndef __64BIT__
|
||||
#define A r6
|
||||
#define B r7
|
||||
#define C r8
|
||||
#define LDC r9
|
||||
#define OFFSET r10
|
||||
#else
|
||||
#define A r7
|
||||
#define B r8
|
||||
#define C r9
|
||||
#define LDC r10
|
||||
#define OFFSET r6
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* AIX / Apple argument registers. */
#if defined(_AIX) || defined(__APPLE__)
|
||||
#if !defined(__64BIT__) && defined(DOUBLE)
|
||||
#define A r8
|
||||
#define B r9
|
||||
#define C r10
|
||||
#define LDC r7
|
||||
#define OFFSET r6
|
||||
#else
|
||||
#define A r7
|
||||
#define B r8
|
||||
#define C r9
|
||||
#define LDC r10
|
||||
#define OFFSET r6
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* VSX registers that hold alpha: alpha_r is loaded as a single-precision
   scalar and alpha_vr as a four-word vector in the setup code of this file. */
#define alpha_r vs30
|
||||
#define alpha_vr vs31
|
||||
|
||||
/* o0 is the literal 0 used in the index slot of indexed loads/stores. */
#define o0 0
|
||||
|
||||
/* Working registers r13-r31; all of them are saved in the prologue and
   restored in the epilogue of this file.  o4/o8/o12/o16/o32/o48 and PRE are
   loaded with constants (4, 8, 12, 16, 32, 48, 256) before the main logic.
   NOTE(review): r13 is the thread pointer in the 64-bit PowerPC ELF ABIs -
   confirm that repurposing it as TBUFFER is safe on the supported targets. */
#define TBUFFER r13
|
||||
#define o12 r14
|
||||
#define o4 r15
|
||||
#define K1 r16
|
||||
#define o8 r17
|
||||
#define L r18
|
||||
#define T1 r19
|
||||
#define KK r20
|
||||
#define KKK r21
|
||||
#define I r22
|
||||
#define J r23
|
||||
#define AO r24
|
||||
#define BO r25
|
||||
#define CO r26
|
||||
#define o16 r27
|
||||
#define o32 r28
|
||||
#define o48 r29
|
||||
|
||||
#define PRE r30
|
||||
#define T2 r31
|
||||
|
||||
/* Compute macros used by the logic file included further down
   (presumably the 16x8 tile macros - not visible in this file). */
#include "strmm_macros_16x8_power8.S"
|
||||
|
||||
|
||||
/* Kernel entry point: allocates a stack frame, saves f14-f31 and r13-r31,
   prepares constants and alpha, runs the included logic file, then restores
   everything and returns 0 in r3.  Compiled only when NEEDPARAM is not
   defined. */
#ifndef NEEDPARAM
|
||||
|
||||
PROLOGUE
|
||||
PROFCODE
|
||||
|
||||
/* Allocate the frame by moving SP down by STACKSIZE; r0 is cleared. */
addi SP, SP, -STACKSIZE
|
||||
li r0, 0
|
||||
|
||||
/* Save floating-point registers f14-f31 at SP+0 .. SP+136. */
stfd f14, 0(SP)
|
||||
stfd f15, 8(SP)
|
||||
stfd f16, 16(SP)
|
||||
stfd f17, 24(SP)
|
||||
|
||||
stfd f18, 32(SP)
|
||||
stfd f19, 40(SP)
|
||||
stfd f20, 48(SP)
|
||||
stfd f21, 56(SP)
|
||||
|
||||
stfd f22, 64(SP)
|
||||
stfd f23, 72(SP)
|
||||
stfd f24, 80(SP)
|
||||
stfd f25, 88(SP)
|
||||
|
||||
stfd f26, 96(SP)
|
||||
stfd f27, 104(SP)
|
||||
stfd f28, 112(SP)
|
||||
stfd f29, 120(SP)
|
||||
|
||||
stfd f30, 128(SP)
|
||||
stfd f31, 136(SP)
|
||||
|
||||
/* Save general-purpose registers r31 down to r13, starting at SP+144
   (8-byte slots on 64-bit, 4-byte slots on 32-bit). */
#ifdef __64BIT__
|
||||
std r31, 144(SP)
|
||||
std r30, 152(SP)
|
||||
std r29, 160(SP)
|
||||
std r28, 168(SP)
|
||||
std r27, 176(SP)
|
||||
std r26, 184(SP)
|
||||
std r25, 192(SP)
|
||||
std r24, 200(SP)
|
||||
std r23, 208(SP)
|
||||
std r22, 216(SP)
|
||||
std r21, 224(SP)
|
||||
std r20, 232(SP)
|
||||
std r19, 240(SP)
|
||||
std r18, 248(SP)
|
||||
std r17, 256(SP)
|
||||
std r16, 264(SP)
|
||||
std r15, 272(SP)
|
||||
std r14, 280(SP)
|
||||
std r13, 288(SP)
|
||||
#else
|
||||
stw r31, 144(SP)
|
||||
stw r30, 148(SP)
|
||||
stw r29, 152(SP)
|
||||
stw r28, 156(SP)
|
||||
stw r27, 160(SP)
|
||||
stw r26, 164(SP)
|
||||
stw r25, 168(SP)
|
||||
stw r24, 172(SP)
|
||||
stw r23, 176(SP)
|
||||
stw r22, 180(SP)
|
||||
stw r21, 184(SP)
|
||||
stw r20, 188(SP)
|
||||
stw r19, 192(SP)
|
||||
stw r18, 196(SP)
|
||||
stw r17, 200(SP)
|
||||
stw r16, 204(SP)
|
||||
stw r15, 208(SP)
|
||||
stw r14, 212(SP)
|
||||
stw r13, 216(SP)
|
||||
#endif
|
||||
|
||||
// stfd f1, ALPHA_SP
|
||||
// stw r0, FZERO
|
||||
|
||||
/* On 32-bit AIX/Apple DOUBLE builds LDC is read from the caller's frame. */
#if defined(_AIX) || defined(__APPLE__)
|
||||
#if !defined(__64BIT__) && defined(DOUBLE)
|
||||
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* Shift LDC left by BASE_SHIFT (presumably converting a leading dimension
   in elements into bytes - BASE_SHIFT is defined outside this file). */
slwi LDC, LDC, BASE_SHIFT
|
||||
|
||||
/* For TRMM builds OFFSET is read from the caller's frame on the
   configurations listed here. */
#if defined(TRMMKERNEL)
|
||||
#if defined(linux) && defined(__64BIT__)
|
||||
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
#endif
|
||||
|
||||
#if defined(_AIX) || defined(__APPLE__)
|
||||
#ifdef __64BIT__
|
||||
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
#else
|
||||
#ifdef DOUBLE
|
||||
lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
|
||||
#else
|
||||
lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* KK starts as OFFSET, negated for TRMM builds without LEFT. */
mr KK, OFFSET
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
neg KK, KK
|
||||
#endif
|
||||
|
||||
|
||||
/* Leave early when M, N or K is <= 0 (32-bit signed compare).  The target
   label L999_H1 is not defined in this file; presumably it comes from the
   included logic file - TODO confirm. */
cmpwi cr0, M, 0
|
||||
ble L999_H1
|
||||
cmpwi cr0, N, 0
|
||||
ble L999_H1
|
||||
cmpwi cr0, K, 0
|
||||
ble L999_H1
|
||||
|
||||
/* Constants used by the included macros/logic, plus the scratch buffer
   pointer TBUFFER = SP + 320.
   NOTE(review): offsets 320 (here) and 300 (below) are used on every build,
   but the 32-bit STACKSIZE is 240, so on a 32-bit build they lie above the
   frame allocated here - confirm that only 64-bit builds use this kernel. */
li PRE, 256
|
||||
li o4 , 4
|
||||
li o8 , 8
|
||||
li o12, 12
|
||||
li o16, 16
|
||||
li o32, 32
|
||||
li o48, 48
|
||||
addi TBUFFER, SP, 320
|
||||
|
||||
/* Store f1 as single precision four times at SP+300 .. SP+312, then load it
   back as a scalar (alpha_r) and as a four-lane vector (alpha_vr).  f1 is
   presumably the incoming alpha argument - TODO confirm against the caller. */
addi T1, SP, 300
|
||||
stxsspx f1, o0 , T1
|
||||
stxsspx f1, o4 , T1
|
||||
stxsspx f1, o8 , T1
|
||||
stxsspx f1, o12 , T1
|
||||
|
||||
lxsspx alpha_r, o0, T1
|
||||
lxvw4x alpha_vr, o0, T1
|
||||
|
||||
|
||||
|
||||
/* Main loop nest, textually included from the logic file. */
#include "strmm_logic_16x8_power8.S"
|
||||
|
||||
/* Common exit: set the return value r3 to 0, restore f14-f31 and r13-r31
   from the slots written in the prologue, release the frame and return. */
L999:
|
||||
addi r3, 0, 0
|
||||
|
||||
lfd f14, 0(SP)
|
||||
lfd f15, 8(SP)
|
||||
lfd f16, 16(SP)
|
||||
lfd f17, 24(SP)
|
||||
|
||||
lfd f18, 32(SP)
|
||||
lfd f19, 40(SP)
|
||||
lfd f20, 48(SP)
|
||||
lfd f21, 56(SP)
|
||||
|
||||
lfd f22, 64(SP)
|
||||
lfd f23, 72(SP)
|
||||
lfd f24, 80(SP)
|
||||
lfd f25, 88(SP)
|
||||
|
||||
lfd f26, 96(SP)
|
||||
lfd f27, 104(SP)
|
||||
lfd f28, 112(SP)
|
||||
lfd f29, 120(SP)
|
||||
|
||||
lfd f30, 128(SP)
|
||||
lfd f31, 136(SP)
|
||||
|
||||
#ifdef __64BIT__
|
||||
ld r31, 144(SP)
|
||||
ld r30, 152(SP)
|
||||
ld r29, 160(SP)
|
||||
ld r28, 168(SP)
|
||||
ld r27, 176(SP)
|
||||
ld r26, 184(SP)
|
||||
ld r25, 192(SP)
|
||||
ld r24, 200(SP)
|
||||
ld r23, 208(SP)
|
||||
ld r22, 216(SP)
|
||||
ld r21, 224(SP)
|
||||
ld r20, 232(SP)
|
||||
ld r19, 240(SP)
|
||||
ld r18, 248(SP)
|
||||
ld r17, 256(SP)
|
||||
ld r16, 264(SP)
|
||||
ld r15, 272(SP)
|
||||
ld r14, 280(SP)
|
||||
ld r13, 288(SP)
|
||||
#else
|
||||
lwz r31, 144(SP)
|
||||
lwz r30, 148(SP)
|
||||
lwz r29, 152(SP)
|
||||
lwz r28, 156(SP)
|
||||
lwz r27, 160(SP)
|
||||
lwz r26, 164(SP)
|
||||
lwz r25, 168(SP)
|
||||
lwz r24, 172(SP)
|
||||
lwz r23, 176(SP)
|
||||
lwz r22, 180(SP)
|
||||
lwz r21, 184(SP)
|
||||
lwz r20, 188(SP)
|
||||
lwz r19, 192(SP)
|
||||
lwz r18, 196(SP)
|
||||
lwz r17, 200(SP)
|
||||
lwz r16, 204(SP)
|
||||
lwz r15, 208(SP)
|
||||
lwz r14, 212(SP)
|
||||
lwz r13, 216(SP)
|
||||
#endif
|
||||
|
||||
addi SP, SP, STACKSIZE
|
||||
|
||||
blr
|
||||
|
||||
EPILOGUE
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,149 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/28 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
|
||||
#define ABS fabs
|
||||
|
||||
#else
|
||||
|
||||
#define ABS fabsf
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(POWER8)
|
||||
#include "zasum_microk_power8.c"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_8
|
||||
|
||||
/*
 * Portable fallback kernel: sum of absolute values of the real and
 * imaginary parts of a contiguous complex vector.
 *
 *   n    - number of complex elements; each pass consumes eight FLOATs and
 *          advances the counter by four.
 *   x1   - interleaved input data.
 *   svec - two-element output; svec[0] receives the total, svec[1] is set
 *          to 0.0.
 *
 * Four running sums are kept; FLOAT number k of a pass is added to running
 * sum (k mod 4), and the four sums are combined left to right at the end.
 */
static void zasum_kernel_8(BLASLONG n, FLOAT *x1, FLOAT *svec)
{
	FLOAT acc[4] = { 0.0, 0.0, 0.0, 0.0 };
	FLOAT *group = x1;
	BLASLONG done, k;

	for ( done = 0; done < n; done += 4, group += 8 )
	{
		for ( k = 0; k < 8; k++ )
			acc[k & 3] += ABS(group[k]);
	}

	svec[0] = acc[0] + acc[1] + acc[2] + acc[3];
	svec[1] = 0.0;
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * ZASUM driver: returns the sum over k = 0 .. n-1 of
 * |re(x[k * inc_x])| + |im(x[k * inc_x])| for an interleaved complex vector.
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.  For unit stride the largest
 * multiple of 8 complex elements is summed by zasum_kernel_8 and the
 * remaining elements are added by the scalar loop; any other stride uses
 * the scalar loop for all elements.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	BLASLONG done = 0;
	BLASLONG pos = 0;
	BLASLONG step;
	BLASLONG bulk;
	FLOAT total = 0.0;
	FLOAT svec[2] __attribute__ ((aligned (16)));

	if ( n <= 0 || inc_x <= 0 )
		return(total);

	/* Distance in FLOATs between consecutive complex elements. */
	step = 2 * inc_x;

	if ( inc_x == 1 )
	{
		bulk = n & -8;
		if ( bulk > 0 )
		{
			zasum_kernel_8(bulk, x, svec);
			total = svec[0] + svec[1];
			done  = bulk;
			pos   = 2 * bulk;
		}
	}

	for ( ; done < n; done++, pos += step )
		total += ABS(x[pos]) + ABS(x[pos+1]);

	return(total);
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,177 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/28 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_8 1
|
||||
static void zasum_kernel_8( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline));
|
||||
|
||||
static void zasum_kernel_8( BLASLONG n, FLOAT *x, FLOAT *svec)
|
||||
{
|
||||
|
||||
|
||||
BLASLONG i = n;
|
||||
BLASLONG o16 = 16;
|
||||
BLASLONG o32 = 32;
|
||||
BLASLONG o48 = 48;
|
||||
BLASLONG o64 = 64;
|
||||
BLASLONG o80 = 80;
|
||||
BLASLONG o96 = 96;
|
||||
BLASLONG o112 = 112;
|
||||
FLOAT *x1=x;
|
||||
BLASLONG pre = 384;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
|
||||
"dcbt %2 , %4 \n\t"
|
||||
|
||||
"xxlxor 32,32,32 \n\t"
|
||||
"xxlxor 33,33,33 \n\t"
|
||||
"xxlxor 34,34,34 \n\t"
|
||||
"xxlxor 35,35,35 \n\t"
|
||||
"xxlxor 36,36,36 \n\t"
|
||||
"xxlxor 37,37,37 \n\t"
|
||||
"xxlxor 38,38,38 \n\t"
|
||||
"xxlxor 39,39,39 \n\t"
|
||||
|
||||
"lxvd2x 40, 0, %2 \n\t"
|
||||
"lxvd2x 41, %5, %2 \n\t"
|
||||
"lxvd2x 42, %6, %2 \n\t"
|
||||
"lxvd2x 43, %7, %2 \n\t"
|
||||
"lxvd2x 44, %8, %2 \n\t"
|
||||
"lxvd2x 45, %9, %2 \n\t"
|
||||
"lxvd2x 46, %10, %2 \n\t"
|
||||
"lxvd2x 47, %11, %2 \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
"addic. %0 , %0 , -8 \n\t"
|
||||
"ble 2f \n\t"
|
||||
|
||||
".align 5 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"dcbt %2 , %4 \n\t"
|
||||
|
||||
"xvabsdp 48, 40 \n\t"
|
||||
"xvabsdp 49, 41 \n\t"
|
||||
"xvabsdp 50, 42 \n\t"
|
||||
"xvabsdp 51, 43 \n\t"
|
||||
|
||||
"lxvd2x 40, 0, %2 \n\t"
|
||||
"lxvd2x 41, %5, %2 \n\t"
|
||||
|
||||
"xvabsdp 52, 44 \n\t"
|
||||
"xvabsdp 53, 45 \n\t"
|
||||
|
||||
"lxvd2x 42, %6, %2 \n\t"
|
||||
"lxvd2x 43, %7, %2 \n\t"
|
||||
|
||||
"xvabsdp 54, 46 \n\t"
|
||||
"xvabsdp 55, 47 \n\t"
|
||||
|
||||
"lxvd2x 44, %8, %2 \n\t"
|
||||
"lxvd2x 45, %9, %2 \n\t"
|
||||
|
||||
"xvadddp 32, 32, 48 \n\t"
|
||||
"xvadddp 33, 33, 49 \n\t"
|
||||
|
||||
"lxvd2x 46, %10, %2 \n\t"
|
||||
"lxvd2x 47, %11, %2 \n\t"
|
||||
|
||||
"xvadddp 34, 34, 50 \n\t"
|
||||
"xvadddp 35, 35, 51 \n\t"
|
||||
"addi %2, %2, 128 \n\t"
|
||||
"xvadddp 36, 36, 52 \n\t"
|
||||
"xvadddp 37, 37, 53 \n\t"
|
||||
"addic. %0 , %0 , -8 \n\t"
|
||||
"xvadddp 38, 38, 54 \n\t"
|
||||
"xvadddp 39, 39, 55 \n\t"
|
||||
|
||||
"bgt 1b \n\t"
|
||||
|
||||
"2: \n\t"
|
||||
|
||||
|
||||
"xvabsdp 48, 40 \n\t"
|
||||
"xvabsdp 49, 41 \n\t"
|
||||
"xvabsdp 50, 42 \n\t"
|
||||
"xvabsdp 51, 43 \n\t"
|
||||
"xvabsdp 52, 44 \n\t"
|
||||
"xvabsdp 53, 45 \n\t"
|
||||
"xvabsdp 54, 46 \n\t"
|
||||
"xvabsdp 55, 47 \n\t"
|
||||
|
||||
"xvadddp 32, 32, 48 \n\t"
|
||||
"xvadddp 33, 33, 49 \n\t"
|
||||
"xvadddp 34, 34, 50 \n\t"
|
||||
"xvadddp 35, 35, 51 \n\t"
|
||||
"xvadddp 36, 36, 52 \n\t"
|
||||
"xvadddp 37, 37, 53 \n\t"
|
||||
"xvadddp 38, 38, 54 \n\t"
|
||||
"xvadddp 39, 39, 55 \n\t"
|
||||
|
||||
"xvadddp 32, 32, 33 \n\t"
|
||||
"xvadddp 34, 34, 35 \n\t"
|
||||
"xvadddp 36, 36, 37 \n\t"
|
||||
"xvadddp 38, 38, 39 \n\t"
|
||||
|
||||
"xvadddp 32, 32, 34 \n\t"
|
||||
"xvadddp 36, 36, 38 \n\t"
|
||||
|
||||
"xvadddp 32, 32, 36 \n\t"
|
||||
|
||||
|
||||
"stxvd2x 32, 0, %3 \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x1), // 2
|
||||
"r" (svec), // 3
|
||||
"r" (pre), // 4
|
||||
"r" (o16), // 5
|
||||
"r" (o32), // 6
|
||||
"r" (o48), // 7
|
||||
"r" (o64), // 8
|
||||
"r" (o80), // 9
|
||||
"r" (o96), // 10
|
||||
"r" (o112) // 11
|
||||
: "cr0", "%0", "%2", "memory"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,140 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/23 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(POWER8)
|
||||
#include "zaxpy_microk_power8.c"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_4
|
||||
|
||||
/*
 * Portable fallback for the unit-stride ZAXPY inner kernel:
 *   y += alpha * x        (CONJ not defined)
 *   y += alpha * conj(x)  (CONJ defined)
 *
 * n     : number of complex elements; two are handled per pass, so n is
 *         expected to be even (the driver passes a multiple of 16).
 * x, y  : interleaved (re, im) vectors; y is updated in place.
 * alpha : four FLOATs laid out by the driver in this file as
 *         { da_r, da_r, da_i, da_i }.
 */
static void zaxpy_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
	BLASLONG register i = 0;
	BLASLONG register ix = 0;
	FLOAT da_r = alpha[0];
	/*
	 * The imaginary part lives at index 2, not 1: index 1 holds a second
	 * copy of the real part (see the layout described above).
	 */
	FLOAT da_i = alpha[2];

	while(i < n)
	{
#if !defined(CONJ)
		y[ix]   += ( da_r * x[ix]   - da_i * x[ix+1] ) ;
		y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix]   ) ;
		y[ix+2] += ( da_r * x[ix+2] - da_i * x[ix+3] ) ;
		y[ix+3] += ( da_r * x[ix+3] + da_i * x[ix+2] ) ;
#else
		y[ix]   += ( da_r * x[ix]   + da_i * x[ix+1] ) ;
		y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix]   ) ;
		y[ix+2] += ( da_r * x[ix+2] + da_i * x[ix+3] ) ;
		y[ix+3] -= ( da_r * x[ix+3] - da_i * x[ix+2] ) ;
#endif

		ix+=4 ;
		i+=2 ;

	}

}
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * ZAXPY driver: y += (da_r + i*da_i) * x, or with conj(x) when CONJ is
 * defined, over n complex elements.
 *
 * inc_x / inc_y are strides in complex elements.  When both are 1 the
 * largest multiple-of-16 prefix is delegated to zaxpy_kernel_4(); the
 * remaining elements, and all elements in the strided case, are handled by
 * the scalar loop.  dummy0, dummy1, dummy and dummy2 are not used.
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG k = 0;			/* complex elements already processed */
	BLASLONG px = 0, py = 0;	/* FLOAT indices into x and y */
	BLASLONG step_x, step_y;	/* element distance, in FLOATs */
	FLOAT coef[4];

	if (n <= 0)
		return 0;

	step_x = 2 * inc_x;
	step_y = 2 * inc_y;

	if (inc_x == 1 && inc_y == 1)
	{
		BLASLONG bulk = n & -16;

		if (bulk)
		{
			/* Layout expected by the kernel: real part twice, then imaginary part twice. */
			coef[0] = da_r;
			coef[1] = da_r;
			coef[2] = da_i;
			coef[3] = da_i;
			zaxpy_kernel_4(bulk, x, y, coef);
		}

		k = bulk;
		px = 2 * bulk;
		py = px;
	}

	for (; k < n; k++, px += step_x, py += step_y)
	{
#if !defined(CONJ)
		y[py]   += ( da_r * x[px]   - da_i * x[px+1] ) ;
		y[py+1] += ( da_r * x[px+1] + da_i * x[px]   ) ;
#else
		y[py]   += ( da_r * x[px]   + da_i * x[px+1] ) ;
		y[py+1] -= ( da_r * x[px+1] - da_i * x[px]   ) ;
#endif
	}

	return 0;
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,250 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/23 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
|
||||
#define HAVE_KERNEL_4 1
|
||||
static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
|
||||
|
||||
static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
|
||||
|
||||
BLASLONG i = n;
|
||||
BLASLONG o16 = 16;
|
||||
BLASLONG o32 = 32;
|
||||
BLASLONG o48 = 48;
|
||||
FLOAT *x1=x;
|
||||
FLOAT *y1=y;
|
||||
FLOAT *y2=y+1;
|
||||
BLASLONG pre = 384;
|
||||
|
||||
#if !defined(CONJ)
|
||||
FLOAT mvec[2] = { -1.0, 1.0 };
|
||||
#else
|
||||
FLOAT mvec[2] = { 1.0, -1.0 };
|
||||
#endif
|
||||
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
|
||||
"lxsdx 34, 0 , %4 \n\t" // alpha_r
|
||||
"lxsdx 35, %5, %4 \n\t" // alpha_i
|
||||
"xxspltd 32, 34, 0 \n\t"
|
||||
"xxspltd 33, 35, 0 \n\t"
|
||||
|
||||
"lxvd2x 36, 0, %9 \n\t" // mvec
|
||||
|
||||
#if !defined(CONJ)
|
||||
"xvmuldp 33, 33 , 36 \n\t" // alpha_i * mvec
|
||||
#else
|
||||
"xvmuldp 32, 32 , 36 \n\t" // alpha_r * mvec
|
||||
#endif
|
||||
|
||||
"addi %8, %8, -8 \n\t"
|
||||
|
||||
"dcbt %2, %10 \n\t"
|
||||
"dcbt %3, %10 \n\t"
|
||||
|
||||
|
||||
"lxvd2x 40, 0, %2 \n\t" // x0
|
||||
"lxvd2x 41, %5, %2 \n\t" // x1
|
||||
"lxvd2x 42, %6, %2 \n\t" // x2
|
||||
"lxvd2x 43, %7, %2 \n\t" // x3
|
||||
|
||||
"lxvd2x 48, 0, %3 \n\t" // y0
|
||||
"lxvd2x 49, %5, %3 \n\t" // y1
|
||||
"lxvd2x 50, %6, %3 \n\t" // y2
|
||||
"lxvd2x 51, %7, %3 \n\t" // y3
|
||||
|
||||
"xxswapd 56, 40 \n\t" // exchange real and imag part
|
||||
"xxswapd 57, 41 \n\t" // exchange real and imag part
|
||||
"xxswapd 58, 42 \n\t" // exchange real and imag part
|
||||
"xxswapd 59, 43 \n\t" // exchange real and imag part
|
||||
|
||||
"addi %2, %2, 64 \n\t"
|
||||
"addi %3, %3, 64 \n\t"
|
||||
|
||||
"lxvd2x 44, 0, %2 \n\t" // x4
|
||||
"lxvd2x 45, %5, %2 \n\t" // x5
|
||||
"lxvd2x 46, %6, %2 \n\t" // x6
|
||||
"lxvd2x 47, %7, %2 \n\t" // x7
|
||||
|
||||
"lxvd2x 52, 0, %3 \n\t" // y4
|
||||
"lxvd2x 53, %5, %3 \n\t" // y5
|
||||
"lxvd2x 54, %6, %3 \n\t" // y6
|
||||
"lxvd2x 55, %7, %3 \n\t" // y7
|
||||
|
||||
"xxswapd 60, 44 \n\t" // exchange real and imag part
|
||||
"xxswapd 61, 45 \n\t" // exchange real and imag part
|
||||
"xxswapd 62, 46 \n\t" // exchange real and imag part
|
||||
"xxswapd 63, 47 \n\t" // exchange real and imag part
|
||||
|
||||
"addi %2, %2, 64 \n\t"
|
||||
"addi %3, %3, 64 \n\t"
|
||||
|
||||
"addic. %0 , %0 , -8 \n\t"
|
||||
"ble 2f \n\t"
|
||||
|
||||
".align 5 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"dcbt %2, %10 \n\t"
|
||||
"dcbt %3, %10 \n\t"
|
||||
|
||||
"xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
|
||||
"xvmaddadp 49, 41, 32 \n\t"
|
||||
"lxvd2x 40, 0, %2 \n\t" // x0
|
||||
"lxvd2x 41, %5, %2 \n\t" // x1
|
||||
"xvmaddadp 50, 42, 32 \n\t"
|
||||
"xvmaddadp 51, 43, 32 \n\t"
|
||||
"lxvd2x 42, %6, %2 \n\t" // x2
|
||||
"lxvd2x 43, %7, %2 \n\t" // x3
|
||||
|
||||
"xvmaddadp 52, 44, 32 \n\t"
|
||||
"addi %2, %2, 64 \n\t"
|
||||
"xvmaddadp 53, 45, 32 \n\t"
|
||||
"lxvd2x 44, 0, %2 \n\t" // x4
|
||||
"lxvd2x 45, %5, %2 \n\t" // x5
|
||||
"xvmaddadp 54, 46, 32 \n\t"
|
||||
"xvmaddadp 55, 47, 32 \n\t"
|
||||
"lxvd2x 46, %6, %2 \n\t" // x6
|
||||
"lxvd2x 47, %7, %2 \n\t" // x7
|
||||
|
||||
"xvmaddadp 48, 56, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r
|
||||
"addi %2, %2, 64 \n\t"
|
||||
"xvmaddadp 49, 57, 33 \n\t"
|
||||
"xvmaddadp 50, 58, 33 \n\t"
|
||||
"xvmaddadp 51, 59, 33 \n\t"
|
||||
|
||||
"xvmaddadp 52, 60, 33 \n\t"
|
||||
"xvmaddadp 53, 61, 33 \n\t"
|
||||
"xvmaddadp 54, 62, 33 \n\t"
|
||||
"xvmaddadp 55, 63, 33 \n\t"
|
||||
|
||||
"stxvd2x 48, 0, %8 \n\t"
|
||||
"stxvd2x 49, %5, %8 \n\t"
|
||||
"stxvd2x 50, %6, %8 \n\t"
|
||||
"stxvd2x 51, %7, %8 \n\t"
|
||||
|
||||
"addi %8, %8, 64 \n\t"
|
||||
|
||||
"stxvd2x 52, 0, %8 \n\t"
|
||||
"stxvd2x 53, %5, %8 \n\t"
|
||||
"stxvd2x 54, %6, %8 \n\t"
|
||||
"stxvd2x 55, %7, %8 \n\t"
|
||||
|
||||
"addi %8, %8, 64 \n\t"
|
||||
|
||||
"xxswapd 56, 40 \n\t" // exchange real and imag part
|
||||
"xxswapd 57, 41 \n\t" // exchange real and imag part
|
||||
"lxvd2x 48, 0, %3 \n\t" // y0
|
||||
"lxvd2x 49, %5, %3 \n\t" // y1
|
||||
"xxswapd 58, 42 \n\t" // exchange real and imag part
|
||||
"xxswapd 59, 43 \n\t" // exchange real and imag part
|
||||
"lxvd2x 50, %6, %3 \n\t" // y2
|
||||
"lxvd2x 51, %7, %3 \n\t" // y3
|
||||
|
||||
"xxswapd 60, 44 \n\t" // exchange real and imag part
|
||||
"addi %3, %3, 64 \n\t"
|
||||
"xxswapd 61, 45 \n\t" // exchange real and imag part
|
||||
"lxvd2x 52, 0, %3 \n\t" // y4
|
||||
"lxvd2x 53, %5, %3 \n\t" // y5
|
||||
"xxswapd 62, 46 \n\t" // exchange real and imag part
|
||||
"xxswapd 63, 47 \n\t" // exchange real and imag part
|
||||
"lxvd2x 54, %6, %3 \n\t" // y6
|
||||
"lxvd2x 55, %7, %3 \n\t" // y7
|
||||
|
||||
"addi %3, %3, 64 \n\t"
|
||||
|
||||
"addic. %0 , %0 , -8 \n\t"
|
||||
"bgt 1b \n\t"
|
||||
|
||||
"2: \n\t"
|
||||
|
||||
"xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
|
||||
"xvmaddadp 49, 41, 32 \n\t"
|
||||
"xvmaddadp 50, 42, 32 \n\t"
|
||||
"xvmaddadp 51, 43, 32 \n\t"
|
||||
|
||||
"xvmaddadp 52, 44, 32 \n\t"
|
||||
"xvmaddadp 53, 45, 32 \n\t"
|
||||
"xvmaddadp 54, 46, 32 \n\t"
|
||||
"xvmaddadp 55, 47, 32 \n\t"
|
||||
|
||||
"xvmaddadp 48, 56, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r
|
||||
"xvmaddadp 49, 57, 33 \n\t"
|
||||
"xvmaddadp 50, 58, 33 \n\t"
|
||||
"xvmaddadp 51, 59, 33 \n\t"
|
||||
|
||||
"xvmaddadp 52, 60, 33 \n\t"
|
||||
"xvmaddadp 53, 61, 33 \n\t"
|
||||
"xvmaddadp 54, 62, 33 \n\t"
|
||||
"xvmaddadp 55, 63, 33 \n\t"
|
||||
|
||||
|
||||
"stxvd2x 48, 0, %8 \n\t"
|
||||
"stxvd2x 49, %5, %8 \n\t"
|
||||
"stxvd2x 50, %6, %8 \n\t"
|
||||
"stxvd2x 51, %7, %8 \n\t"
|
||||
|
||||
"addi %8, %8, 64 \n\t"
|
||||
|
||||
"stxvd2x 52, 0, %8 \n\t"
|
||||
"stxvd2x 53, %5, %8 \n\t"
|
||||
"stxvd2x 54, %6, %8 \n\t"
|
||||
"stxvd2x 55, %7, %8 \n\t"
|
||||
|
||||
"addi %8, %8, 64 \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x1), // 2
|
||||
"r" (y1), // 3
|
||||
"r" (alpha), // 4
|
||||
"r" (o16), // 5
|
||||
"r" (o32), // 6
|
||||
"r" (o48), // 7
|
||||
"r" (y2), // 8
|
||||
"r" (mvec), // 9
|
||||
"r" (pre) // 10
|
||||
: "cr0", "%0", "%2" , "%3", "%8", "memory"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,140 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(POWER8)
|
||||
#include "zcopy_microk_power8.c"
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_16
|
||||
|
||||
/*
 * Portable fallback for the unit-stride ZCOPY inner kernel.
 *
 * Copies n complex elements (2*n FLOATs) from x to y, four complex
 * elements per pass; n is expected to be a multiple of 4 (the driver
 * passes a multiple of 16).
 */
static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
{
	FLOAT chunk[8];
	FLOAT *src = x;
	FLOAT *dst = y;
	BLASLONG done;
	int k;

	for (done = 0; done < n; done += 4, src += 8, dst += 8)
	{
		/* Read the whole group before writing any of it. */
		for (k = 0; k < 8; k++)
			chunk[k] = src[k];

		for (k = 0; k < 8; k++)
			dst[k] = chunk[k];
	}
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/*
 * ZCOPY driver: copies n complex elements from x to y.
 *
 * inc_x / inc_y are strides in complex elements.  When both are 1 the
 * largest multiple-of-16 prefix is delegated to zcopy_kernel_16() and the
 * remaining elements are copied one at a time; otherwise every element is
 * copied by the strided scalar loop.  Returns 0 in all cases, including
 * n <= 0 (nothing is copied).
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	BLASLONG i=0;
	BLASLONG ix=0,iy=0;

	if ( n <= 0 ) return(0);

	if ( (inc_x == 1) && (inc_y == 1 ))
	{

		BLASLONG n1 = n & -16;
		if ( n1 > 0 )
		{
			zcopy_kernel_16(n1, x, y);
			i=n1;
			ix=n1*2;
			iy=n1*2;
		}

		while(i < n)
		{
			/* Source is indexed with ix, destination with iy. */
			y[iy]   = x[ix] ;
			y[iy+1] = x[ix+1] ;
			ix+=2;
			iy+=2;
			i++ ;

		}

	}
	else
	{

		/* Each complex element occupies two FLOATs. */
		BLASLONG inc_x2 = 2 * inc_x;
		BLASLONG inc_y2 = 2 * inc_y;

		while(i < n)
		{
			y[iy]   = x[ix] ;
			y[iy+1] = x[ix+1] ;
			ix += inc_x2 ;
			iy += inc_y2 ;
			i++ ;

		}

	}
	return(0);

}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,174 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_16 1
|
||||
|
||||
static void zcopy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
|
||||
|
||||
static void zcopy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
|
||||
|
||||
BLASLONG i = n;
|
||||
BLASLONG o16 = 16;
|
||||
BLASLONG o32 = 32;
|
||||
BLASLONG o48 = 48;
|
||||
BLASLONG o64 = 64;
|
||||
BLASLONG o80 = 80;
|
||||
BLASLONG o96 = 96;
|
||||
BLASLONG o112 = 112;
|
||||
FLOAT *x1=x;
|
||||
FLOAT *y1=y;
|
||||
BLASLONG pre = 384;
|
||||
BLASLONG alpha=0;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
|
||||
"lxvd2x 40, 0, %2 \n\t"
|
||||
"lxvd2x 41, %5, %2 \n\t"
|
||||
"lxvd2x 42, %6, %2 \n\t"
|
||||
"lxvd2x 43, %7, %2 \n\t"
|
||||
"lxvd2x 44, %8, %2 \n\t"
|
||||
"lxvd2x 45, %9, %2 \n\t"
|
||||
"lxvd2x 46, %10, %2 \n\t"
|
||||
"lxvd2x 47, %11, %2 \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
"lxvd2x 50, 0, %2 \n\t"
|
||||
"lxvd2x 51, %5, %2 \n\t"
|
||||
"lxvd2x 52, %6, %2 \n\t"
|
||||
"lxvd2x 53, %7, %2 \n\t"
|
||||
"lxvd2x 54, %8, %2 \n\t"
|
||||
"lxvd2x 55, %9, %2 \n\t"
|
||||
"lxvd2x 56, %10, %2 \n\t"
|
||||
"lxvd2x 57, %11, %2 \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
"addic. %0 , %0 , -16 \n\t"
|
||||
"ble 2f \n\t"
|
||||
|
||||
".align 5 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"stxvd2x 40, 0, %1 \n\t"
|
||||
"stxvd2x 41, %5, %1 \n\t"
|
||||
"lxvd2x 40, 0, %2 \n\t"
|
||||
"lxvd2x 41, %5, %2 \n\t"
|
||||
"stxvd2x 42, %6, %1 \n\t"
|
||||
"stxvd2x 43, %7, %1 \n\t"
|
||||
"lxvd2x 42, %6, %2 \n\t"
|
||||
"lxvd2x 43, %7, %2 \n\t"
|
||||
"stxvd2x 44, %8, %1 \n\t"
|
||||
"stxvd2x 45, %9, %1 \n\t"
|
||||
"lxvd2x 44, %8, %2 \n\t"
|
||||
"lxvd2x 45, %9, %2 \n\t"
|
||||
"stxvd2x 46, %10, %1 \n\t"
|
||||
"stxvd2x 47, %11, %1 \n\t"
|
||||
"lxvd2x 46, %10, %2 \n\t"
|
||||
"lxvd2x 47, %11, %2 \n\t"
|
||||
|
||||
|
||||
"addi %1, %1, 128 \n\t"
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
"stxvd2x 50, 0, %1 \n\t"
|
||||
"stxvd2x 51, %5, %1 \n\t"
|
||||
"lxvd2x 50, 0, %2 \n\t"
|
||||
"lxvd2x 51, %5, %2 \n\t"
|
||||
"stxvd2x 52, %6, %1 \n\t"
|
||||
"stxvd2x 53, %7, %1 \n\t"
|
||||
"lxvd2x 52, %6, %2 \n\t"
|
||||
"lxvd2x 53, %7, %2 \n\t"
|
||||
"stxvd2x 54, %8, %1 \n\t"
|
||||
"stxvd2x 55, %9, %1 \n\t"
|
||||
"lxvd2x 54, %8, %2 \n\t"
|
||||
"lxvd2x 55, %9, %2 \n\t"
|
||||
"stxvd2x 56, %10, %1 \n\t"
|
||||
"stxvd2x 57, %11, %1 \n\t"
|
||||
"lxvd2x 56, %10, %2 \n\t"
|
||||
"lxvd2x 57, %11, %2 \n\t"
|
||||
|
||||
"addi %1, %1, 128 \n\t"
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
"addic. %0 , %0 , -16 \n\t"
|
||||
"bgt 1b \n\t"
|
||||
|
||||
"2: \n\t"
|
||||
|
||||
"stxvd2x 40, 0, %1 \n\t"
|
||||
"stxvd2x 41, %5, %1 \n\t"
|
||||
"stxvd2x 42, %6, %1 \n\t"
|
||||
"stxvd2x 43, %7, %1 \n\t"
|
||||
"stxvd2x 44, %8, %1 \n\t"
|
||||
"stxvd2x 45, %9, %1 \n\t"
|
||||
"stxvd2x 46, %10, %1 \n\t"
|
||||
"stxvd2x 47, %11, %1 \n\t"
|
||||
|
||||
"addi %1, %1, 128 \n\t"
|
||||
|
||||
"stxvd2x 50, 0, %1 \n\t"
|
||||
"stxvd2x 51, %5, %1 \n\t"
|
||||
"stxvd2x 52, %6, %1 \n\t"
|
||||
"stxvd2x 53, %7, %1 \n\t"
|
||||
"stxvd2x 54, %8, %1 \n\t"
|
||||
"stxvd2x 55, %9, %1 \n\t"
|
||||
"stxvd2x 56, %10, %1 \n\t"
|
||||
"stxvd2x 57, %11, %1 \n\t"
|
||||
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (y1), // 1
|
||||
"r" (x1), // 2
|
||||
"r" (alpha), // 3
|
||||
"r" (pre), // 4
|
||||
"r" (o16), // 5
|
||||
"r" (o32), // 6
|
||||
"r" (o48), // 7
|
||||
"r" (o64), // 8
|
||||
"r" (o80), // 9
|
||||
"r" (o96), // 10
|
||||
"r" (o112) // 11
|
||||
: "cr0", "%0", "%2" , "%1", "memory"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,167 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/21 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <complex.h>
|
||||
|
||||
|
||||
#if defined(POWER8)
|
||||
#include "zdot_microk_power8.c"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_8
|
||||
|
||||
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) __attribute__ ((noinline));

/*
 * Portable fallback for the unit-stride ZDOT inner kernel.
 *
 * Processes n complex elements, four per pass (n is expected to be a
 * multiple of 4; the driver passes a multiple of 8), and writes four
 * partial sums to d:
 *   d[0] = sum of x_re * y_re
 *   d[1] = sum of x_im * y_im
 *   d[2] = sum of x_re * y_im
 *   d[3] = sum of x_im * y_re
 * Any previous contents of d are overwritten.
 */
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
{
	FLOAT acc[4] = { 0.0, 0.0, 0.0, 0.0 };
	FLOAT *xp = x;
	FLOAT *yp = y;
	BLASLONG done;
	int k;

	for (done = 0; done < n; done += 4, xp += 8, yp += 8)
	{
		/* k walks the four (re, im) pairs of this group in order. */
		for (k = 0; k < 8; k += 2)
		{
			acc[0] += xp[k]   * yp[k] ;
			acc[1] += xp[k+1] * yp[k+1] ;
			acc[2] += xp[k]   * yp[k+1] ;
			acc[3] += xp[k+1] * yp[k] ;
		}
	}

	for (k = 0; k < 4; k++)
		d[k] = acc[k];
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * Complex dot product of two vectors of n elements.
 *
 * x and y hold interleaved (real, imaginary) FLOAT pairs; inc_x and inc_y
 * are the strides between consecutive elements, counted in elements.
 *
 * Four real partial sums are collected:
 *   acc[0] = sum x_re * y_re      acc[1] = sum x_im * y_im
 *   acc[2] = sum x_re * y_im      acc[3] = sum x_im * y_re
 * and combined at the end; building with CONJ defined flips the signs so
 * that the conjugated variant is returned instead.
 *
 * Returns 0 + 0i when n <= 0.
 */
FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	FLOAT acc[4] = { 0.0, 0.0, 0.0, 0.0 };
	FLOAT _Complex result;
	BLASLONG k;

	if (n <= 0)
	{
		__real__ result = 0.0;
		__imag__ result = 0.0;
		return result;
	}

	if (inc_x == 1 && inc_y == 1)
	{
		/* Largest multiple of 8 not exceeding n goes to the blocked kernel. */
		BLASLONG blocked = n & -8;

		if (blocked != 0)
			zdot_kernel_8(blocked, x, y, acc);

		/* Scalar clean-up for the remaining (n - blocked) elements. */
		for (k = blocked; k < n; k++)
		{
			FLOAT *xp = x + 2 * k;
			FLOAT *yp = y + 2 * k;

			acc[0] += xp[0] * yp[0];
			acc[1] += xp[1] * yp[1];
			acc[2] += xp[0] * yp[1];
			acc[3] += xp[1] * yp[0];
		}
	}
	else
	{
		/* Each element occupies two FLOATs, so double the strides. */
		BLASLONG step_x = inc_x << 1;
		BLASLONG step_y = inc_y << 1;
		BLASLONG px = 0;
		BLASLONG py = 0;

		for (k = 0; k < n; k++)
		{
			acc[0] += x[px]   * y[py];
			acc[1] += x[px+1] * y[py+1];
			acc[2] += x[px]   * y[py+1];
			acc[3] += x[px+1] * y[py];

			px += step_x;
			py += step_y;
		}
	}

#if !defined(CONJ)
	__real__ result = acc[0] - acc[1];
	__imag__ result = acc[2] + acc[3];
#else
	__real__ result = acc[0] + acc[1];
	__imag__ result = acc[2] - acc[3];
#endif

	return result;
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,219 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/21 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_8 1
|
||||
static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));

/*
 * POWER8 VSX kernel for the blocked part of the complex dot product.
 *
 * x, y : interleaved (real, imaginary) pairs, accessed as 16-byte
 *        elements with double-precision VSX loads.
 * n    : number of complex elements.  The code loads the first eight
 *        elements of each vector before testing the count and then
 *        consumes eight per pass, so n must be a positive multiple of 8.
 * dot  : receives four partial sums (overwritten, not accumulated):
 *          dot[0] = sum x_r * y_r     dot[1] = sum x_i * y_i
 *          dot[2] = sum x_r * y_i     dot[3] = sum x_i * y_r
 *
 * Register use: vs32-vs39 are accumulators, vs40-vs47 hold x, vs48-vs51 and
 * vs56-vs59 hold y, vs52-vs55 and vs60-vs63 hold y with its halves swapped.
 *
 * Constraint notes:
 *  - the loop counter and both data pointers are modified by the asm, so
 *    they are read-write ("+") operands rather than inputs;
 *  - every operand that appears in the RA slot of addi/dcbt/lxvd2x/stxvd2x
 *    uses "b", because r0 in that slot is read as the constant zero;
 *  - all VSX registers written here are listed as clobbers (vs52-vs63
 *    overlay vector registers that the ABI treats as callee-saved);
 *  - n is listed as operand 1 only to keep the %N numbering used by the
 *    instruction strings; the asm itself never references it.
 */
static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
{

	BLASLONG i = n;		/* remaining element count, decremented by 8 */
	BLASLONG o16 = 16;	/* byte offsets of elements 1..3 in a 64-byte group */
	BLASLONG o32 = 32;
	BLASLONG o48 = 48;
	FLOAT *x1=x;
	FLOAT *y1=y;
	BLASLONG pre = 384;	/* prefetch distance in bytes */

	__asm__  __volatile__
	(
	/* clear the eight accumulators */
	"xxlxor 32,32,32 \n\t"
	"xxlxor 33,33,33 \n\t"
	"xxlxor 34,34,34 \n\t"
	"xxlxor 35,35,35 \n\t"
	"xxlxor 36,36,36 \n\t"
	"xxlxor 37,37,37 \n\t"
	"xxlxor 38,38,38 \n\t"
	"xxlxor 39,39,39 \n\t"

	"dcbt %2, %8 \n\t"
	"dcbt %3, %8 \n\t"

	/* preload elements 0..3 */
	"lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
	"lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i
	"lxvd2x 41, %5, %2 \n\t" // x1_r, x1_i
	"lxvd2x 49, %5, %3 \n\t" // y1_r, y1_i
	"lxvd2x 42, %6, %2 \n\t" // x2_r, x2_i
	"lxvd2x 50, %6, %3 \n\t" // y2_r, y2_i
	"lxvd2x 43, %7, %2 \n\t" // x3_r, x3_i
	"lxvd2x 51, %7, %3 \n\t" // y3_r, y3_i

	"xxswapd 52,48 \n\t" // y0_i, y0_r
	"xxswapd 53,49 \n\t" // y1_i, y1_r
	"xxswapd 54,50 \n\t" // y2_i, y2_r
	"xxswapd 55,51 \n\t" // y3_i, y3_r

	"addi %2, %2, 64 \n\t"
	"addi %3, %3, 64 \n\t"

	/* preload elements 4..7 */
	"lxvd2x 44, 0, %2 \n\t" // x0_r, x0_i
	"lxvd2x 56, 0, %3 \n\t" // y0_r, y0_i
	"lxvd2x 45, %5, %2 \n\t" // x1_r, x1_i
	"lxvd2x 57, %5, %3 \n\t" // y1_r, y1_i
	"lxvd2x 46, %6, %2 \n\t" // x2_r, x2_i
	"lxvd2x 58, %6, %3 \n\t" // y2_r, y2_i
	"lxvd2x 47, %7, %2 \n\t" // x3_r, x3_i
	"lxvd2x 59, %7, %3 \n\t" // y3_r, y3_i

	"xxswapd 60,56 \n\t" // y0_i, y0_r
	"xxswapd 61,57 \n\t" // y1_i, y1_r
	"xxswapd 62,58 \n\t" // y2_i, y2_r
	"xxswapd 63,59 \n\t" // y3_i, y3_r

	"addi %2, %2, 64 \n\t"
	"addi %3, %3, 64 \n\t"

	/* only the preloaded eight elements left? go straight to the tail */
	"addic. %0 , %0 , -8 \n\t"
	"ble 2f \n\t"

	".align 5 \n\t"
	"1: \n\t"

	/* main loop: multiply-add the loaded group while loading the next one */
	"dcbt %2, %8 \n\t"
	"dcbt %3, %8 \n\t"

	"xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
	"lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i
	"xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i
	"lxvd2x 49, %5, %3 \n\t" // y1_r, y1_i

	"xvmaddadp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i
	"lxvd2x 50, %6, %3 \n\t" // y2_r, y2_i
	"xvmaddadp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i
	"lxvd2x 51, %7, %3 \n\t" // y3_r, y3_i

	"xvmaddadp 33, 40, 52 \n\t" // x0_r * y0_i , x0_i * y0_r
	"lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
	"xvmaddadp 35, 41, 53 \n\t" // x1_r * y1_i , x1_i * y1_r
	"lxvd2x 41, %5, %2 \n\t" // x1_r, x1_i

	"xvmaddadp 37, 42, 54 \n\t" // x2_r * y2_i , x2_i * y2_r
	"lxvd2x 42, %6, %2 \n\t" // x2_r, x2_i
	"xvmaddadp 39, 43, 55 \n\t" // x3_r * y3_i , x3_i * y3_r
	"lxvd2x 43, %7, %2 \n\t" // x3_r, x3_i

	"xxswapd 52,48 \n\t" // y0_i, y0_r
	"xxswapd 53,49 \n\t" // y1_i, y1_r

	"addi %2, %2, 64 \n\t"
	"addi %3, %3, 64 \n\t"

	"xxswapd 54,50 \n\t" // y2_i, y2_r
	"xxswapd 55,51 \n\t" // y3_i, y3_r

	"xvmaddadp 32, 44, 56 \n\t" // x0_r * y0_r , x0_i * y0_i
	"lxvd2x 56, 0, %3 \n\t" // y0_r, y0_i
	"xvmaddadp 34, 45, 57 \n\t" // x1_r * y1_r , x1_i * y1_i
	"lxvd2x 57, %5, %3 \n\t" // y1_r, y1_i
	"xvmaddadp 36, 46, 58 \n\t" // x2_r * y2_r , x2_i * y2_i
	"lxvd2x 58, %6, %3 \n\t" // y2_r, y2_i
	"xvmaddadp 38, 47, 59 \n\t" // x3_r * y3_r , x3_i * y3_i
	"lxvd2x 59, %7, %3 \n\t" // y3_r, y3_i

	"xvmaddadp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r
	"lxvd2x 44, 0, %2 \n\t" // x0_r, x0_i
	"xvmaddadp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r
	"lxvd2x 45, %5, %2 \n\t" // x1_r, x1_i
	"xvmaddadp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r
	"lxvd2x 46, %6, %2 \n\t" // x2_r, x2_i
	"xvmaddadp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r
	"lxvd2x 47, %7, %2 \n\t" // x3_r, x3_i

	"xxswapd 60,56 \n\t" // y0_i, y0_r
	"xxswapd 61,57 \n\t" // y1_i, y1_r

	"addi %2, %2, 64 \n\t"
	"addi %3, %3, 64 \n\t"

	"xxswapd 62,58 \n\t" // y2_i, y2_r
	"xxswapd 63,59 \n\t" // y3_i, y3_r

	"addic. %0 , %0 , -8 \n\t"
	"bgt 1b \n\t"

	"2: \n\t"

	/* tail: multiply-add the last loaded eight elements, no further loads */
	"xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
	"xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i
	"xvmaddadp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i
	"xvmaddadp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i

	"xvmaddadp 33, 40, 52 \n\t" // x0_r * y0_i , x0_i * y0_r
	"xvmaddadp 35, 41, 53 \n\t" // x1_r * y1_i , x1_i * y1_r
	"xvmaddadp 37, 42, 54 \n\t" // x2_r * y2_i , x2_i * y2_r
	"xvmaddadp 39, 43, 55 \n\t" // x3_r * y3_i , x3_i * y3_r

	"xvmaddadp 32, 44, 56 \n\t" // x0_r * y0_r , x0_i * y0_i
	"xvmaddadp 34, 45, 57 \n\t" // x1_r * y1_r , x1_i * y1_i
	"xvmaddadp 36, 46, 58 \n\t" // x2_r * y2_r , x2_i * y2_i
	"xvmaddadp 38, 47, 59 \n\t" // x3_r * y3_r , x3_i * y3_i

	"xvmaddadp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r
	"xvmaddadp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r
	"xvmaddadp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r
	"xvmaddadp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r

	/* fold the eight accumulators down to vs32 and vs33 */
	"xvadddp 32, 32, 34 \n\t"
	"xvadddp 36, 36, 38 \n\t"

	"xvadddp 33, 33, 35 \n\t"
	"xvadddp 37, 37, 39 \n\t"

	"xvadddp 32, 32, 36 \n\t"
	"xvadddp 33, 33, 37 \n\t"

	/* dot[0..1] <- vs32, dot[2..3] <- vs33 */
	"stxvd2x 32, 0, %4 \n\t"
	"stxvd2x 33, %5, %4 \n\t"

	:
	  "+r" (i),	// 0  loop counter, decremented by the asm
	  "+r" (n),	// 1  unused placeholder, keeps operand numbering stable
	  "+b" (x1),	// 2  advanced by the asm
	  "+b" (y1)	// 3  advanced by the asm
	:
	  "r" (dot),	// 4
	  "b" (o16),	// 5
	  "b" (o32),	// 6
	  "b" (o48),	// 7
	  "r" (pre)	// 8
	: "cr0", "memory",
	  "vs32", "vs33", "vs34", "vs35", "vs36", "vs37", "vs38", "vs39",
	  "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47",
	  "vs48", "vs49", "vs50", "vs51", "vs52", "vs53", "vs54", "vs55",
	  "vs56", "vs57", "vs58", "vs59", "vs60", "vs61", "vs62", "vs63"
	);

}
|
||||
|
||||
|
||||
|
|
@ -1,38 +1,3 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
|
|
@ -82,7 +47,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
|
||||
#ifdef __64BIT__
|
||||
#define STACKSIZE 320
|
||||
#define STACKSIZE 32000
|
||||
#define ALPHA_R_SP 296(SP)
|
||||
#define ALPHA_I_SP 304(SP)
|
||||
#define FZERO 312(SP)
|
||||
|
|
@ -133,11 +98,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define alpha_r vs30
|
||||
#define alpha_i vs31
|
||||
|
||||
|
||||
#define FRAMEPOINTER r12
|
||||
|
||||
#define BBUFFER r14
|
||||
|
||||
#define L r15
|
||||
#define ALPHA r16
|
||||
#define o24 r17
|
||||
#define T2 r19
|
||||
#define KK r20
|
||||
#define BBO r20
|
||||
#define o8 r21
|
||||
#define I r22
|
||||
#define J r23
|
||||
|
|
@ -156,8 +126,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
PROLOGUE
|
||||
PROFCODE
|
||||
|
||||
addi SP, SP, -STACKSIZE
|
||||
li r0, 0
|
||||
mr FRAMEPOINTER, SP
|
||||
addi SP, SP, -STACKSIZE
|
||||
addi SP, SP, -STACKSIZE
|
||||
addi SP, SP, -STACKSIZE
|
||||
addi SP, SP, -STACKSIZE
|
||||
li r0, 0
|
||||
|
||||
stfd f14, 0(SP)
|
||||
stfd f15, 8(SP)
|
||||
|
|
@ -200,6 +174,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
std r17, 256(SP)
|
||||
std r16, 264(SP)
|
||||
std r15, 272(SP)
|
||||
std r14, 280(SP)
|
||||
#else
|
||||
stw r31, 144(SP)
|
||||
stw r30, 148(SP)
|
||||
|
|
@ -226,37 +201,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#ifdef linux
|
||||
#ifdef __64BIT__
|
||||
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(_AIX) || defined(__APPLE__)
|
||||
#ifdef __64BIT__
|
||||
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
|
||||
#else
|
||||
#ifdef DOUBLE
|
||||
lwz B, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
lwz C, FRAMESLOT(1) + STACKSIZE(SP)
|
||||
lwz LDC, FRAMESLOT(2) + STACKSIZE(SP)
|
||||
lwz B, FRAMESLOT(0) + 0(FRAMEPOINTER)
|
||||
lwz C, FRAMESLOT(1) + 0(FRAMEPOINTER)
|
||||
lwz LDC, FRAMESLOT(2) + 0(FRAMEPOINTER)
|
||||
#else
|
||||
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef TRMMKERNEL
|
||||
#if defined(linux) && defined(__64BIT__)
|
||||
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
|
||||
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
|
||||
#endif
|
||||
|
||||
#if defined(_AIX) || defined(__APPLE__)
|
||||
#ifdef __64BIT__
|
||||
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
|
||||
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
|
||||
#else
|
||||
#ifdef DOUBLE
|
||||
lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP)
|
||||
lwz OFFSET, FRAMESLOT(3) + 0(FRAMEPOINTER)
|
||||
#else
|
||||
lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
|
||||
lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
|
@ -268,34 +243,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "zgemm_macros_8x2_power8.S"
|
||||
|
||||
cmpwi cr0, M, 0
|
||||
ble .L999
|
||||
ble L999
|
||||
cmpwi cr0, N, 0
|
||||
ble .L999
|
||||
ble L999
|
||||
cmpwi cr0, K, 0
|
||||
ble .L999
|
||||
ble L999
|
||||
|
||||
slwi LDC, LDC, ZBASE_SHIFT
|
||||
li PRE, 256
|
||||
li PRE, 384
|
||||
li o8 , 8
|
||||
li o16 , 16
|
||||
li o24 , 24
|
||||
li o32 , 32
|
||||
li o48 , 48
|
||||
|
||||
addi BBUFFER, SP, 512+4096
|
||||
li T1, -4096
|
||||
and BBUFFER, BBUFFER, T1
|
||||
|
||||
#ifdef __64BIT__
|
||||
addi ALPHA, SP, 296
|
||||
#else
|
||||
addi ALPHA, SP, 224
|
||||
#endif
|
||||
|
||||
lxvdsx alpha_r, 0, ALPHA
|
||||
lxvdsx alpha_i, o8, ALPHA
|
||||
lxsdx alpha_r, 0, ALPHA
|
||||
lxsdx alpha_i, o8, ALPHA
|
||||
|
||||
.align 5
|
||||
.align 4
|
||||
|
||||
#include "zgemm_logic_8x2_power8.S"
|
||||
|
||||
.L999:
|
||||
L999:
|
||||
addi r3, 0, 0
|
||||
|
||||
lfd f14, 0(SP)
|
||||
|
|
@ -339,6 +318,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld r17, 256(SP)
|
||||
ld r16, 264(SP)
|
||||
ld r15, 272(SP)
|
||||
ld r14, 280(SP)
|
||||
#else
|
||||
lwz r31, 144(SP)
|
||||
lwz r30, 148(SP)
|
||||
|
|
@ -360,6 +340,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
|
||||
addi SP, SP, STACKSIZE
|
||||
addi SP, SP, STACKSIZE
|
||||
addi SP, SP, STACKSIZE
|
||||
addi SP, SP, STACKSIZE
|
||||
|
||||
blr
|
||||
|
||||
|
|
|
|||
|
|
@ -1,83 +1,111 @@
|
|||
srawi. J, N, 1
|
||||
ble .LZGEMM_L2_END
|
||||
ble ZGEMM_L2_END
|
||||
|
||||
ZGEMM_L2_BEGIN:
|
||||
|
||||
mr BO, B
|
||||
mr BBO, BBUFFER
|
||||
slwi T1, K, 1
|
||||
|
||||
ZGEMM_L2_COPYB:
|
||||
|
||||
lxvdsx vs4, o0, BO // b0_r
|
||||
lxvdsx vs5, o8, BO // b0_i
|
||||
addi BO, BO, 16
|
||||
stxvd2x vs4, o0, BBO
|
||||
stxvd2x vs5, o16, BBO
|
||||
addic. T1, T1, -1
|
||||
addi BBO, BBO, 32
|
||||
|
||||
bge ZGEMM_L2_COPYB
|
||||
|
||||
.LZGEMM_L2_BEGIN:
|
||||
|
||||
mr CO, C
|
||||
mr AO, A
|
||||
slwi T1, LDC , 1
|
||||
add C, C, T1
|
||||
srawi. I, M, 3
|
||||
ble .LZGEMM_L2x8_END
|
||||
ble ZGEMM_L2x8_END
|
||||
|
||||
.LZGEMM_L2x8_BEGIN:
|
||||
ZGEMM_L2x8_BEGIN:
|
||||
|
||||
|
||||
mr BO, B
|
||||
mr BO, BBUFFER
|
||||
srawi. L, K, 3
|
||||
ble .LZGEMM_L2x8_SUB0
|
||||
ble ZGEMM_L2x8_SUB0
|
||||
cmpwi cr0, L, 1
|
||||
ble .LZGEMM_L2x8_SUB4
|
||||
ble ZGEMM_L2x8_SUB4
|
||||
|
||||
.LZGEMM_L2x8_LOOP_START:
|
||||
ZGEMM_L2x8_LOOP_START:
|
||||
|
||||
dcbt AO, PRE
|
||||
dcbt BO, PRE
|
||||
LOAD2x8_1
|
||||
dcbt AO, PRE
|
||||
KERNEL2x8_I1
|
||||
dcbt AO, PRE
|
||||
dcbt BO, PRE
|
||||
KERNEL2x8_2
|
||||
dcbt AO, PRE
|
||||
KERNEL2x8_1
|
||||
dcbt AO, PRE
|
||||
dcbt BO, PRE
|
||||
KERNEL2x8_2
|
||||
|
||||
dcbt AO, PRE
|
||||
KERNEL2x8_1
|
||||
dcbt AO, PRE
|
||||
dcbt BO, PRE
|
||||
KERNEL2x8_2
|
||||
dcbt AO, PRE
|
||||
KERNEL2x8_1
|
||||
dcbt AO, PRE
|
||||
dcbt BO, PRE
|
||||
KERNEL2x8_2
|
||||
|
||||
addic. L, L, -2
|
||||
ble .LZGEMM_L2x8_LOOP_END
|
||||
ble ZGEMM_L2x8_LOOP_END
|
||||
|
||||
.align 5
|
||||
|
||||
.LZGEMM_L2x8_LOOP:
|
||||
ZGEMM_L2x8_LOOP:
|
||||
|
||||
dcbt AO, PRE
|
||||
KERNEL2x8_1
|
||||
dcbt AO, PRE
|
||||
dcbt BO, PRE
|
||||
KERNEL2x8_2
|
||||
dcbt AO, PRE
|
||||
KERNEL2x8_1
|
||||
dcbt AO, PRE
|
||||
dcbt BO, PRE
|
||||
KERNEL2x8_2
|
||||
|
||||
dcbt AO, PRE
|
||||
KERNEL2x8_1
|
||||
dcbt AO, PRE
|
||||
dcbt BO, PRE
|
||||
KERNEL2x8_2
|
||||
dcbt AO, PRE
|
||||
KERNEL2x8_1
|
||||
dcbt AO, PRE
|
||||
dcbt BO, PRE
|
||||
KERNEL2x8_2
|
||||
|
||||
addic. L, L, -1
|
||||
bgt .LZGEMM_L2x8_LOOP
|
||||
bgt ZGEMM_L2x8_LOOP
|
||||
|
||||
.LZGEMM_L2x8_LOOP_END:
|
||||
ZGEMM_L2x8_LOOP_END:
|
||||
|
||||
dcbt AO, PRE
|
||||
KERNEL2x8_1
|
||||
dcbt AO, PRE
|
||||
dcbt BO, PRE
|
||||
KERNEL2x8_2
|
||||
dcbt AO, PRE
|
||||
KERNEL2x8_1
|
||||
dcbt AO, PRE
|
||||
dcbt BO, PRE
|
||||
KERNEL2x8_2
|
||||
|
||||
dcbt AO, PRE
|
||||
|
|
@ -88,9 +116,9 @@
|
|||
KERNEL2x8_1
|
||||
KERNEL2x8_E2
|
||||
|
||||
b .LZGEMM_L2x8_SUB1
|
||||
b ZGEMM_L2x8_SUB1
|
||||
|
||||
.LZGEMM_L2x8_SUB4:
|
||||
ZGEMM_L2x8_SUB4:
|
||||
|
||||
dcbt AO, PRE
|
||||
KERNEL2x8_SUBI1
|
||||
|
|
@ -106,53 +134,53 @@
|
|||
KERNEL2x8_SUB1
|
||||
KERNEL2x8_SUB1
|
||||
|
||||
b .LZGEMM_L2x8_SUB1
|
||||
b ZGEMM_L2x8_SUB1
|
||||
|
||||
.LZGEMM_L2x8_SUB0:
|
||||
ZGEMM_L2x8_SUB0:
|
||||
|
||||
andi. L, K, 7
|
||||
|
||||
KERNEL2x8_SUBI1
|
||||
|
||||
addic. L, L, -1
|
||||
ble .LZGEMM_L2x8_SAVE
|
||||
b .LZGEMM_L2x8_SUB2
|
||||
ble ZGEMM_L2x8_SAVE
|
||||
b ZGEMM_L2x8_SUB2
|
||||
|
||||
.LZGEMM_L2x8_SUB1:
|
||||
ZGEMM_L2x8_SUB1:
|
||||
|
||||
andi. L, K, 7
|
||||
ble .LZGEMM_L2x8_SAVE
|
||||
ble ZGEMM_L2x8_SAVE
|
||||
|
||||
.LZGEMM_L2x8_SUB2:
|
||||
ZGEMM_L2x8_SUB2:
|
||||
|
||||
KERNEL2x8_SUB1
|
||||
|
||||
addic. L, L, -1
|
||||
bgt .LZGEMM_L2x8_SUB2
|
||||
bgt ZGEMM_L2x8_SUB2
|
||||
|
||||
.LZGEMM_L2x8_SAVE:
|
||||
ZGEMM_L2x8_SAVE:
|
||||
|
||||
SAVE2x8
|
||||
|
||||
addic. I, I, -1
|
||||
bgt .LZGEMM_L2x8_BEGIN
|
||||
bgt ZGEMM_L2x8_BEGIN
|
||||
|
||||
.LZGEMM_L2x8_END:
|
||||
ZGEMM_L2x8_END:
|
||||
|
||||
.LZGEMM_L2x4_BEGIN:
|
||||
ZGEMM_L2x4_BEGIN:
|
||||
|
||||
andi. T2, M, 7
|
||||
ble .LZGEMM_L2x1_END
|
||||
ble ZGEMM_L2x1_END
|
||||
|
||||
andi. T1, M, 4
|
||||
ble .LZGEMM_L2x4_END
|
||||
mr BO, B
|
||||
ble ZGEMM_L2x4_END
|
||||
mr BO, BBUFFER
|
||||
srawi. L, K, 3
|
||||
ble .LZGEMM_L2x4_SUB0
|
||||
ble ZGEMM_L2x4_SUB0
|
||||
cmpwi cr0, L, 1
|
||||
ble .LZGEMM_L2x4_SUB4
|
||||
ble ZGEMM_L2x4_SUB4
|
||||
|
||||
.LZGEMM_L2x4_LOOP_START:
|
||||
ZGEMM_L2x4_LOOP_START:
|
||||
|
||||
LOAD2x4_1
|
||||
KERNEL2x4_I1
|
||||
|
|
@ -166,11 +194,11 @@
|
|||
KERNEL2x4_2
|
||||
|
||||
addic. L, L, -2
|
||||
ble .LZGEMM_L2x4_LOOP_END
|
||||
ble ZGEMM_L2x4_LOOP_END
|
||||
|
||||
.align 5
|
||||
|
||||
.LZGEMM_L2x4_LOOP:
|
||||
ZGEMM_L2x4_LOOP:
|
||||
|
||||
KERNEL2x4_1
|
||||
KERNEL2x4_2
|
||||
|
|
@ -183,9 +211,9 @@
|
|||
KERNEL2x4_2
|
||||
|
||||
addic. L, L, -1
|
||||
bgt .LZGEMM_L2x4_LOOP
|
||||
bgt ZGEMM_L2x4_LOOP
|
||||
|
||||
.LZGEMM_L2x4_LOOP_END:
|
||||
ZGEMM_L2x4_LOOP_END:
|
||||
|
||||
KERNEL2x4_1
|
||||
KERNEL2x4_2
|
||||
|
|
@ -197,9 +225,9 @@
|
|||
KERNEL2x4_1
|
||||
KERNEL2x4_E2
|
||||
|
||||
b .LZGEMM_L2x4_SUB1
|
||||
b ZGEMM_L2x4_SUB1
|
||||
|
||||
.LZGEMM_L2x4_SUB4:
|
||||
ZGEMM_L2x4_SUB4:
|
||||
|
||||
KERNEL2x4_SUBI1
|
||||
KERNEL2x4_SUB1
|
||||
|
|
@ -211,48 +239,48 @@
|
|||
KERNEL2x4_SUB1
|
||||
KERNEL2x4_SUB1
|
||||
|
||||
b .LZGEMM_L2x4_SUB1
|
||||
b ZGEMM_L2x4_SUB1
|
||||
|
||||
.LZGEMM_L2x4_SUB0:
|
||||
ZGEMM_L2x4_SUB0:
|
||||
|
||||
andi. L, K, 7
|
||||
|
||||
KERNEL2x4_SUBI1
|
||||
|
||||
addic. L, L, -1
|
||||
ble .LZGEMM_L2x4_SAVE
|
||||
b .LZGEMM_L2x4_SUB2
|
||||
ble ZGEMM_L2x4_SAVE
|
||||
b ZGEMM_L2x4_SUB2
|
||||
|
||||
.LZGEMM_L2x4_SUB1:
|
||||
ZGEMM_L2x4_SUB1:
|
||||
|
||||
andi. L, K, 7
|
||||
ble .LZGEMM_L2x4_SAVE
|
||||
ble ZGEMM_L2x4_SAVE
|
||||
|
||||
.LZGEMM_L2x4_SUB2:
|
||||
ZGEMM_L2x4_SUB2:
|
||||
|
||||
KERNEL2x4_SUB1
|
||||
|
||||
addic. L, L, -1
|
||||
bgt .LZGEMM_L2x4_SUB2
|
||||
bgt ZGEMM_L2x4_SUB2
|
||||
|
||||
.LZGEMM_L2x4_SAVE:
|
||||
ZGEMM_L2x4_SAVE:
|
||||
|
||||
SAVE2x4
|
||||
|
||||
.LZGEMM_L2x4_END:
|
||||
ZGEMM_L2x4_END:
|
||||
|
||||
.LZGEMM_L2x2_BEGIN:
|
||||
ZGEMM_L2x2_BEGIN:
|
||||
|
||||
|
||||
andi. T1, M, 2
|
||||
ble .LZGEMM_L2x2_END
|
||||
mr BO, B
|
||||
ble ZGEMM_L2x2_END
|
||||
mr BO, BBUFFER
|
||||
srawi. L, K, 3
|
||||
ble .LZGEMM_L2x2_SUB0
|
||||
ble ZGEMM_L2x2_SUB0
|
||||
cmpwi cr0, L, 1
|
||||
ble .LZGEMM_L2x2_SUB4
|
||||
ble ZGEMM_L2x2_SUB4
|
||||
|
||||
.LZGEMM_L2x2_LOOP_START:
|
||||
ZGEMM_L2x2_LOOP_START:
|
||||
|
||||
LOAD2x2_1
|
||||
KERNEL2x2_I1
|
||||
|
|
@ -266,11 +294,11 @@
|
|||
KERNEL2x2_2
|
||||
|
||||
addic. L, L, -2
|
||||
ble .LZGEMM_L2x2_LOOP_END
|
||||
ble ZGEMM_L2x2_LOOP_END
|
||||
|
||||
.align 5
|
||||
|
||||
.LZGEMM_L2x2_LOOP:
|
||||
ZGEMM_L2x2_LOOP:
|
||||
|
||||
KERNEL2x2_1
|
||||
KERNEL2x2_2
|
||||
|
|
@ -283,9 +311,9 @@
|
|||
KERNEL2x2_2
|
||||
|
||||
addic. L, L, -1
|
||||
bgt .LZGEMM_L2x2_LOOP
|
||||
bgt ZGEMM_L2x2_LOOP
|
||||
|
||||
.LZGEMM_L2x2_LOOP_END:
|
||||
ZGEMM_L2x2_LOOP_END:
|
||||
|
||||
KERNEL2x2_1
|
||||
KERNEL2x2_2
|
||||
|
|
@ -297,9 +325,9 @@
|
|||
KERNEL2x2_1
|
||||
KERNEL2x2_E2
|
||||
|
||||
b .LZGEMM_L2x2_SUB1
|
||||
b ZGEMM_L2x2_SUB1
|
||||
|
||||
.LZGEMM_L2x2_SUB4:
|
||||
ZGEMM_L2x2_SUB4:
|
||||
|
||||
KERNEL2x2_SUBI1
|
||||
KERNEL2x2_SUB1
|
||||
|
|
@ -311,48 +339,48 @@
|
|||
KERNEL2x2_SUB1
|
||||
KERNEL2x2_SUB1
|
||||
|
||||
b .LZGEMM_L2x2_SUB1
|
||||
b ZGEMM_L2x2_SUB1
|
||||
|
||||
.LZGEMM_L2x2_SUB0:
|
||||
ZGEMM_L2x2_SUB0:
|
||||
|
||||
andi. L, K, 7
|
||||
|
||||
KERNEL2x2_SUBI1
|
||||
|
||||
addic. L, L, -1
|
||||
ble .LZGEMM_L2x2_SAVE
|
||||
b .LZGEMM_L2x2_SUB2
|
||||
ble ZGEMM_L2x2_SAVE
|
||||
b ZGEMM_L2x2_SUB2
|
||||
|
||||
.LZGEMM_L2x2_SUB1:
|
||||
ZGEMM_L2x2_SUB1:
|
||||
|
||||
andi. L, K, 7
|
||||
ble .LZGEMM_L2x2_SAVE
|
||||
ble ZGEMM_L2x2_SAVE
|
||||
|
||||
.LZGEMM_L2x2_SUB2:
|
||||
ZGEMM_L2x2_SUB2:
|
||||
|
||||
KERNEL2x2_SUB1
|
||||
|
||||
addic. L, L, -1
|
||||
bgt .LZGEMM_L2x2_SUB2
|
||||
bgt ZGEMM_L2x2_SUB2
|
||||
|
||||
.LZGEMM_L2x2_SAVE:
|
||||
ZGEMM_L2x2_SAVE:
|
||||
|
||||
SAVE2x2
|
||||
|
||||
.LZGEMM_L2x2_END:
|
||||
ZGEMM_L2x2_END:
|
||||
|
||||
.LZGEMM_L2x1_BEGIN:
|
||||
ZGEMM_L2x1_BEGIN:
|
||||
|
||||
|
||||
andi. T1, M, 1
|
||||
ble .LZGEMM_L2x1_END
|
||||
mr BO, B
|
||||
ble ZGEMM_L2x1_END
|
||||
mr BO, BBUFFER
|
||||
srawi. L, K, 3
|
||||
ble .LZGEMM_L2x1_SUB0
|
||||
ble ZGEMM_L2x1_SUB0
|
||||
cmpwi cr0, L, 1
|
||||
ble .LZGEMM_L2x1_SUB4
|
||||
ble ZGEMM_L2x1_SUB4
|
||||
|
||||
.LZGEMM_L2x1_LOOP_START:
|
||||
ZGEMM_L2x1_LOOP_START:
|
||||
|
||||
LOAD2x1_1
|
||||
KERNEL2x1_I1
|
||||
|
|
@ -366,11 +394,11 @@
|
|||
KERNEL2x1_2
|
||||
|
||||
addic. L, L, -2
|
||||
ble .LZGEMM_L2x1_LOOP_END
|
||||
ble ZGEMM_L2x1_LOOP_END
|
||||
|
||||
.align 5
|
||||
|
||||
.LZGEMM_L2x1_LOOP:
|
||||
ZGEMM_L2x1_LOOP:
|
||||
|
||||
KERNEL2x1_1
|
||||
KERNEL2x1_2
|
||||
|
|
@ -383,9 +411,9 @@
|
|||
KERNEL2x1_2
|
||||
|
||||
addic. L, L, -1
|
||||
bgt .LZGEMM_L2x1_LOOP
|
||||
bgt ZGEMM_L2x1_LOOP
|
||||
|
||||
.LZGEMM_L2x1_LOOP_END:
|
||||
ZGEMM_L2x1_LOOP_END:
|
||||
|
||||
KERNEL2x1_1
|
||||
KERNEL2x1_2
|
||||
|
|
@ -397,9 +425,9 @@
|
|||
KERNEL2x1_1
|
||||
KERNEL2x1_E2
|
||||
|
||||
b .LZGEMM_L2x1_SUB1
|
||||
b ZGEMM_L2x1_SUB1
|
||||
|
||||
.LZGEMM_L2x1_SUB4:
|
||||
ZGEMM_L2x1_SUB4:
|
||||
|
||||
KERNEL2x1_SUBI1
|
||||
KERNEL2x1_SUB1
|
||||
|
|
@ -411,72 +439,89 @@
|
|||
KERNEL2x1_SUB1
|
||||
KERNEL2x1_SUB1
|
||||
|
||||
b .LZGEMM_L2x1_SUB1
|
||||
b ZGEMM_L2x1_SUB1
|
||||
|
||||
.LZGEMM_L2x1_SUB0:
|
||||
ZGEMM_L2x1_SUB0:
|
||||
|
||||
andi. L, K, 7
|
||||
|
||||
KERNEL2x1_SUBI1
|
||||
|
||||
addic. L, L, -1
|
||||
ble .LZGEMM_L2x1_SAVE
|
||||
b .LZGEMM_L2x1_SUB2
|
||||
ble ZGEMM_L2x1_SAVE
|
||||
b ZGEMM_L2x1_SUB2
|
||||
|
||||
.LZGEMM_L2x1_SUB1:
|
||||
ZGEMM_L2x1_SUB1:
|
||||
|
||||
andi. L, K, 7
|
||||
ble .LZGEMM_L2x1_SAVE
|
||||
ble ZGEMM_L2x1_SAVE
|
||||
|
||||
.LZGEMM_L2x1_SUB2:
|
||||
ZGEMM_L2x1_SUB2:
|
||||
|
||||
KERNEL2x1_SUB1
|
||||
|
||||
addic. L, L, -1
|
||||
bgt .LZGEMM_L2x1_SUB2
|
||||
bgt ZGEMM_L2x1_SUB2
|
||||
|
||||
.LZGEMM_L2x1_SAVE:
|
||||
ZGEMM_L2x1_SAVE:
|
||||
|
||||
SAVE2x1
|
||||
|
||||
.LZGEMM_L2x1_END:
|
||||
ZGEMM_L2x1_END:
|
||||
|
||||
slwi T1, K, 5
|
||||
add B, B, T1
|
||||
|
||||
addic. J, J, -1
|
||||
bgt .LZGEMM_L2_BEGIN
|
||||
bgt ZGEMM_L2_BEGIN
|
||||
|
||||
andi. T2, N, 1
|
||||
ble .L999
|
||||
ble L999
|
||||
|
||||
.LZGEMM_L2_END:
|
||||
ZGEMM_L2_END:
|
||||
|
||||
b .LZGEMM_L1_BEGIN
|
||||
b ZGEMM_L1_BEGIN
|
||||
|
||||
.L999_H1:
|
||||
L999_H1:
|
||||
|
||||
b .L999
|
||||
b L999
|
||||
|
||||
ZGEMM_L1_BEGIN:
|
||||
|
||||
mr BO, B
|
||||
mr BBO, BBUFFER
|
||||
slwi T1, K, 0
|
||||
|
||||
ZGEMM_L1_COPYB:
|
||||
|
||||
lxvdsx vs4, o0, BO // b0_r
|
||||
lxvdsx vs5, o8, BO // b0_i
|
||||
addi BO, BO, 16
|
||||
stxvd2x vs4, o0, BBO
|
||||
stxvd2x vs5, o16, BBO
|
||||
addic. T1, T1, -1
|
||||
addi BBO, BBO, 32
|
||||
|
||||
bge ZGEMM_L1_COPYB
|
||||
|
||||
.LZGEMM_L1_BEGIN:
|
||||
|
||||
andi. T1, N, 1
|
||||
ble .LZGEMM_L1_END
|
||||
ble ZGEMM_L1_END
|
||||
mr CO, C
|
||||
mr AO, A
|
||||
srawi. I, M, 3
|
||||
ble .LZGEMM_L1x8_END
|
||||
ble ZGEMM_L1x8_END
|
||||
|
||||
.LZGEMM_L1x8_BEGIN:
|
||||
ZGEMM_L1x8_BEGIN:
|
||||
|
||||
|
||||
mr BO, B
|
||||
mr BO, BBUFFER
|
||||
srawi. L, K, 3
|
||||
ble .LZGEMM_L1x8_SUB0
|
||||
ble ZGEMM_L1x8_SUB0
|
||||
cmpwi cr0, L, 1
|
||||
ble .LZGEMM_L1x8_SUB4
|
||||
ble ZGEMM_L1x8_SUB4
|
||||
|
||||
.LZGEMM_L1x8_LOOP_START:
|
||||
ZGEMM_L1x8_LOOP_START:
|
||||
|
||||
dcbt AO, PRE
|
||||
LOAD1x8_1
|
||||
|
|
@ -499,11 +544,11 @@
|
|||
KERNEL1x8_2
|
||||
|
||||
addic. L, L, -2
|
||||
ble .LZGEMM_L1x8_LOOP_END
|
||||
ble ZGEMM_L1x8_LOOP_END
|
||||
|
||||
.align 5
|
||||
|
||||
.LZGEMM_L1x8_LOOP:
|
||||
ZGEMM_L1x8_LOOP:
|
||||
|
||||
dcbt AO, PRE
|
||||
KERNEL1x8_1
|
||||
|
|
@ -524,9 +569,9 @@
|
|||
KERNEL1x8_2
|
||||
|
||||
addic. L, L, -1
|
||||
bgt .LZGEMM_L1x8_LOOP
|
||||
bgt ZGEMM_L1x8_LOOP
|
||||
|
||||
.LZGEMM_L1x8_LOOP_END:
|
||||
ZGEMM_L1x8_LOOP_END:
|
||||
|
||||
dcbt AO, PRE
|
||||
KERNEL1x8_1
|
||||
|
|
@ -545,9 +590,9 @@
|
|||
KERNEL1x8_1
|
||||
KERNEL1x8_E2
|
||||
|
||||
b .LZGEMM_L1x8_SUB1
|
||||
b ZGEMM_L1x8_SUB1
|
||||
|
||||
.LZGEMM_L1x8_SUB4:
|
||||
ZGEMM_L1x8_SUB4:
|
||||
|
||||
dcbt AO, PRE
|
||||
KERNEL1x8_SUBI1
|
||||
|
|
@ -563,53 +608,53 @@
|
|||
KERNEL1x8_SUB1
|
||||
KERNEL1x8_SUB1
|
||||
|
||||
b .LZGEMM_L1x8_SUB1
|
||||
b ZGEMM_L1x8_SUB1
|
||||
|
||||
.LZGEMM_L1x8_SUB0:
|
||||
ZGEMM_L1x8_SUB0:
|
||||
|
||||
andi. L, K, 7
|
||||
|
||||
KERNEL1x8_SUBI1
|
||||
|
||||
addic. L, L, -1
|
||||
ble .LZGEMM_L1x8_SAVE
|
||||
b .LZGEMM_L1x8_SUB2
|
||||
ble ZGEMM_L1x8_SAVE
|
||||
b ZGEMM_L1x8_SUB2
|
||||
|
||||
.LZGEMM_L1x8_SUB1:
|
||||
ZGEMM_L1x8_SUB1:
|
||||
|
||||
andi. L, K, 7
|
||||
ble .LZGEMM_L1x8_SAVE
|
||||
ble ZGEMM_L1x8_SAVE
|
||||
|
||||
.LZGEMM_L1x8_SUB2:
|
||||
ZGEMM_L1x8_SUB2:
|
||||
|
||||
KERNEL1x8_SUB1
|
||||
|
||||
addic. L, L, -1
|
||||
bgt .LZGEMM_L1x8_SUB2
|
||||
bgt ZGEMM_L1x8_SUB2
|
||||
|
||||
.LZGEMM_L1x8_SAVE:
|
||||
ZGEMM_L1x8_SAVE:
|
||||
|
||||
SAVE1x8
|
||||
|
||||
addic. I, I, -1
|
||||
bgt .LZGEMM_L1x8_BEGIN
|
||||
bgt ZGEMM_L1x8_BEGIN
|
||||
|
||||
.LZGEMM_L1x8_END:
|
||||
ZGEMM_L1x8_END:
|
||||
|
||||
.LZGEMM_L1x4_BEGIN:
|
||||
ZGEMM_L1x4_BEGIN:
|
||||
|
||||
andi. T2, M, 7
|
||||
ble .LZGEMM_L1x1_END
|
||||
ble ZGEMM_L1x1_END
|
||||
|
||||
andi. T1, M, 4
|
||||
ble .LZGEMM_L1x4_END
|
||||
mr BO, B
|
||||
ble ZGEMM_L1x4_END
|
||||
mr BO, BBUFFER
|
||||
srawi. L, K, 3
|
||||
ble .LZGEMM_L1x4_SUB0
|
||||
ble ZGEMM_L1x4_SUB0
|
||||
cmpwi cr0, L, 1
|
||||
ble .LZGEMM_L1x4_SUB4
|
||||
ble ZGEMM_L1x4_SUB4
|
||||
|
||||
.LZGEMM_L1x4_LOOP_START:
|
||||
ZGEMM_L1x4_LOOP_START:
|
||||
|
||||
LOAD1x4_1
|
||||
KERNEL1x4_I1
|
||||
|
|
@ -623,11 +668,11 @@
|
|||
KERNEL1x4_2
|
||||
|
||||
addic. L, L, -2
|
||||
ble .LZGEMM_L1x4_LOOP_END
|
||||
ble ZGEMM_L1x4_LOOP_END
|
||||
|
||||
.align 5
|
||||
|
||||
.LZGEMM_L1x4_LOOP:
|
||||
ZGEMM_L1x4_LOOP:
|
||||
|
||||
KERNEL1x4_1
|
||||
KERNEL1x4_2
|
||||
|
|
@ -640,9 +685,9 @@
|
|||
KERNEL1x4_2
|
||||
|
||||
addic. L, L, -1
|
||||
bgt .LZGEMM_L1x4_LOOP
|
||||
bgt ZGEMM_L1x4_LOOP
|
||||
|
||||
.LZGEMM_L1x4_LOOP_END:
|
||||
ZGEMM_L1x4_LOOP_END:
|
||||
|
||||
KERNEL1x4_1
|
||||
KERNEL1x4_2
|
||||
|
|
@ -654,9 +699,9 @@
|
|||
KERNEL1x4_1
|
||||
KERNEL1x4_E2
|
||||
|
||||
b .LZGEMM_L1x4_SUB1
|
||||
b ZGEMM_L1x4_SUB1
|
||||
|
||||
.LZGEMM_L1x4_SUB4:
|
||||
ZGEMM_L1x4_SUB4:
|
||||
|
||||
KERNEL1x4_SUBI1
|
||||
KERNEL1x4_SUB1
|
||||
|
|
@ -668,48 +713,48 @@
|
|||
KERNEL1x4_SUB1
|
||||
KERNEL1x4_SUB1
|
||||
|
||||
b .LZGEMM_L1x4_SUB1
|
||||
b ZGEMM_L1x4_SUB1
|
||||
|
||||
.LZGEMM_L1x4_SUB0:
|
||||
ZGEMM_L1x4_SUB0:
|
||||
|
||||
andi. L, K, 7
|
||||
|
||||
KERNEL1x4_SUBI1
|
||||
|
||||
addic. L, L, -1
|
||||
ble .LZGEMM_L1x4_SAVE
|
||||
b .LZGEMM_L1x4_SUB2
|
||||
ble ZGEMM_L1x4_SAVE
|
||||
b ZGEMM_L1x4_SUB2
|
||||
|
||||
.LZGEMM_L1x4_SUB1:
|
||||
ZGEMM_L1x4_SUB1:
|
||||
|
||||
andi. L, K, 7
|
||||
ble .LZGEMM_L1x4_SAVE
|
||||
ble ZGEMM_L1x4_SAVE
|
||||
|
||||
.LZGEMM_L1x4_SUB2:
|
||||
ZGEMM_L1x4_SUB2:
|
||||
|
||||
KERNEL1x4_SUB1
|
||||
|
||||
addic. L, L, -1
|
||||
bgt .LZGEMM_L1x4_SUB2
|
||||
bgt ZGEMM_L1x4_SUB2
|
||||
|
||||
.LZGEMM_L1x4_SAVE:
|
||||
ZGEMM_L1x4_SAVE:
|
||||
|
||||
SAVE1x4
|
||||
|
||||
.LZGEMM_L1x4_END:
|
||||
ZGEMM_L1x4_END:
|
||||
|
||||
.LZGEMM_L1x2_BEGIN:
|
||||
ZGEMM_L1x2_BEGIN:
|
||||
|
||||
|
||||
andi. T1, M, 2
|
||||
ble .LZGEMM_L1x2_END
|
||||
mr BO, B
|
||||
ble ZGEMM_L1x2_END
|
||||
mr BO, BBUFFER
|
||||
srawi. L, K, 3
|
||||
ble .LZGEMM_L1x2_SUB0
|
||||
ble ZGEMM_L1x2_SUB0
|
||||
cmpwi cr0, L, 1
|
||||
ble .LZGEMM_L1x2_SUB4
|
||||
ble ZGEMM_L1x2_SUB4
|
||||
|
||||
.LZGEMM_L1x2_LOOP_START:
|
||||
ZGEMM_L1x2_LOOP_START:
|
||||
|
||||
LOAD1x2_1
|
||||
KERNEL1x2_I1
|
||||
|
|
@ -723,11 +768,11 @@
|
|||
KERNEL1x2_2
|
||||
|
||||
addic. L, L, -2
|
||||
ble .LZGEMM_L1x2_LOOP_END
|
||||
ble ZGEMM_L1x2_LOOP_END
|
||||
|
||||
.align 5
|
||||
|
||||
.LZGEMM_L1x2_LOOP:
|
||||
ZGEMM_L1x2_LOOP:
|
||||
|
||||
KERNEL1x2_1
|
||||
KERNEL1x2_2
|
||||
|
|
@ -740,9 +785,9 @@
|
|||
KERNEL1x2_2
|
||||
|
||||
addic. L, L, -1
|
||||
bgt .LZGEMM_L1x2_LOOP
|
||||
bgt ZGEMM_L1x2_LOOP
|
||||
|
||||
.LZGEMM_L1x2_LOOP_END:
|
||||
ZGEMM_L1x2_LOOP_END:
|
||||
|
||||
KERNEL1x2_1
|
||||
KERNEL1x2_2
|
||||
|
|
@ -754,9 +799,9 @@
|
|||
KERNEL1x2_1
|
||||
KERNEL1x2_E2
|
||||
|
||||
b .LZGEMM_L1x2_SUB1
|
||||
b ZGEMM_L1x2_SUB1
|
||||
|
||||
.LZGEMM_L1x2_SUB4:
|
||||
ZGEMM_L1x2_SUB4:
|
||||
|
||||
KERNEL1x2_SUBI1
|
||||
KERNEL1x2_SUB1
|
||||
|
|
@ -768,48 +813,48 @@
|
|||
KERNEL1x2_SUB1
|
||||
KERNEL1x2_SUB1
|
||||
|
||||
b .LZGEMM_L1x2_SUB1
|
||||
b ZGEMM_L1x2_SUB1
|
||||
|
||||
.LZGEMM_L1x2_SUB0:
|
||||
ZGEMM_L1x2_SUB0:
|
||||
|
||||
andi. L, K, 7
|
||||
|
||||
KERNEL1x2_SUBI1
|
||||
|
||||
addic. L, L, -1
|
||||
ble .LZGEMM_L1x2_SAVE
|
||||
b .LZGEMM_L1x2_SUB2
|
||||
ble ZGEMM_L1x2_SAVE
|
||||
b ZGEMM_L1x2_SUB2
|
||||
|
||||
.LZGEMM_L1x2_SUB1:
|
||||
ZGEMM_L1x2_SUB1:
|
||||
|
||||
andi. L, K, 7
|
||||
ble .LZGEMM_L1x2_SAVE
|
||||
ble ZGEMM_L1x2_SAVE
|
||||
|
||||
.LZGEMM_L1x2_SUB2:
|
||||
ZGEMM_L1x2_SUB2:
|
||||
|
||||
KERNEL1x2_SUB1
|
||||
|
||||
addic. L, L, -1
|
||||
bgt .LZGEMM_L1x2_SUB2
|
||||
bgt ZGEMM_L1x2_SUB2
|
||||
|
||||
.LZGEMM_L1x2_SAVE:
|
||||
ZGEMM_L1x2_SAVE:
|
||||
|
||||
SAVE1x2
|
||||
|
||||
.LZGEMM_L1x2_END:
|
||||
ZGEMM_L1x2_END:
|
||||
|
||||
.LZGEMM_L1x1_BEGIN:
|
||||
ZGEMM_L1x1_BEGIN:
|
||||
|
||||
|
||||
andi. T1, M, 1
|
||||
ble .LZGEMM_L1x1_END
|
||||
mr BO, B
|
||||
ble ZGEMM_L1x1_END
|
||||
mr BO, BBUFFER
|
||||
srawi. L, K, 3
|
||||
ble .LZGEMM_L1x1_SUB0
|
||||
ble ZGEMM_L1x1_SUB0
|
||||
cmpwi cr0, L, 1
|
||||
ble .LZGEMM_L1x1_SUB4
|
||||
ble ZGEMM_L1x1_SUB4
|
||||
|
||||
.LZGEMM_L1x1_LOOP_START:
|
||||
ZGEMM_L1x1_LOOP_START:
|
||||
|
||||
LOAD1x1_1
|
||||
KERNEL1x1_I1
|
||||
|
|
@ -823,11 +868,11 @@
|
|||
KERNEL1x1_2
|
||||
|
||||
addic. L, L, -2
|
||||
ble .LZGEMM_L1x1_LOOP_END
|
||||
ble ZGEMM_L1x1_LOOP_END
|
||||
|
||||
.align 5
|
||||
|
||||
.LZGEMM_L1x1_LOOP:
|
||||
ZGEMM_L1x1_LOOP:
|
||||
|
||||
KERNEL1x1_1
|
||||
KERNEL1x1_2
|
||||
|
|
@ -840,9 +885,9 @@
|
|||
KERNEL1x1_2
|
||||
|
||||
addic. L, L, -1
|
||||
bgt .LZGEMM_L1x1_LOOP
|
||||
bgt ZGEMM_L1x1_LOOP
|
||||
|
||||
.LZGEMM_L1x1_LOOP_END:
|
||||
ZGEMM_L1x1_LOOP_END:
|
||||
|
||||
KERNEL1x1_1
|
||||
KERNEL1x1_2
|
||||
|
|
@ -854,9 +899,9 @@
|
|||
KERNEL1x1_1
|
||||
KERNEL1x1_E2
|
||||
|
||||
b .LZGEMM_L1x1_SUB1
|
||||
b ZGEMM_L1x1_SUB1
|
||||
|
||||
.LZGEMM_L1x1_SUB4:
|
||||
ZGEMM_L1x1_SUB4:
|
||||
|
||||
KERNEL1x1_SUBI1
|
||||
KERNEL1x1_SUB1
|
||||
|
|
@ -868,34 +913,34 @@
|
|||
KERNEL1x1_SUB1
|
||||
KERNEL1x1_SUB1
|
||||
|
||||
b .LZGEMM_L1x1_SUB1
|
||||
b ZGEMM_L1x1_SUB1
|
||||
|
||||
.LZGEMM_L1x1_SUB0:
|
||||
ZGEMM_L1x1_SUB0:
|
||||
|
||||
andi. L, K, 7
|
||||
|
||||
KERNEL1x1_SUBI1
|
||||
|
||||
addic. L, L, -1
|
||||
ble .LZGEMM_L1x1_SAVE
|
||||
b .LZGEMM_L1x1_SUB2
|
||||
ble ZGEMM_L1x1_SAVE
|
||||
b ZGEMM_L1x1_SUB2
|
||||
|
||||
.LZGEMM_L1x1_SUB1:
|
||||
ZGEMM_L1x1_SUB1:
|
||||
|
||||
andi. L, K, 7
|
||||
ble .LZGEMM_L1x1_SAVE
|
||||
ble ZGEMM_L1x1_SAVE
|
||||
|
||||
.LZGEMM_L1x1_SUB2:
|
||||
ZGEMM_L1x1_SUB2:
|
||||
|
||||
KERNEL1x1_SUB1
|
||||
|
||||
addic. L, L, -1
|
||||
bgt .LZGEMM_L1x1_SUB2
|
||||
bgt ZGEMM_L1x1_SUB2
|
||||
|
||||
.LZGEMM_L1x1_SAVE:
|
||||
ZGEMM_L1x1_SAVE:
|
||||
|
||||
SAVE1x1
|
||||
|
||||
.LZGEMM_L1x1_END:
|
||||
ZGEMM_L1x1_END:
|
||||
|
||||
.LZGEMM_L1_END:
|
||||
ZGEMM_L1_END:
|
||||
|
|
|
|||
|
|
@ -1,39 +1,3 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
|
||||
#define XSFADD_R1 xsadddp
|
||||
|
|
@ -70,12 +34,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
.macro LOAD2x8_1
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
lxvdsx vs18, o16, BO // load real part from B
|
||||
lxvdsx vs19, o24, BO // load imag part from B
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
lxvd2x vs18, o32, BO // load real part from B
|
||||
lxvd2x vs19, o48, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 32
|
||||
addi BO, BO, 64
|
||||
|
||||
lxvd2x vs0, o0, AO // load real,imag from A
|
||||
lxvd2x vs1, o16, AO // load real,imag from A
|
||||
|
|
@ -110,12 +74,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 64
|
||||
|
||||
lxvdsx vs20, o0, BO // load real part from B
|
||||
lxvdsx vs21, o8, BO // load imag part from B
|
||||
lxvdsx vs22, o16, BO // load real part from B
|
||||
lxvdsx vs23, o24, BO // load imag part from B
|
||||
lxvd2x vs20, o0, BO // load real part from B
|
||||
lxvd2x vs21, o16, BO // load imag part from B
|
||||
lxvd2x vs22, o32, BO // load real part from B
|
||||
lxvd2x vs23, o48, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 32
|
||||
addi BO, BO, 64
|
||||
|
||||
xvmuldp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
|
@ -156,36 +120,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
.macro KERNEL2x8_1
|
||||
|
||||
lxvd2x vs8, o0, AO // load real,imag from A
|
||||
lxvd2x vs9, o16, AO // load real,imag from A
|
||||
lxvd2x vs10, o32, AO // load real,imag from A
|
||||
lxvd2x vs11, o48, AO // load real,imag from A
|
||||
|
||||
addi AO, AO, 64
|
||||
|
||||
lxvd2x vs12, o0, AO // load real,imag from A
|
||||
lxvd2x vs13, o16, AO // load real,imag from A
|
||||
lxvd2x vs14, o32, AO // load real,imag from A
|
||||
lxvd2x vs15, o48, AO // load real,imag from A
|
||||
|
||||
addi AO, AO, 64
|
||||
|
||||
lxvd2x vs20, o0, BO // load real part from B
|
||||
lxvd2x vs21, o16, BO // load imag part from B
|
||||
lxvd2x vs22, o32, BO // load real part from B
|
||||
lxvd2x vs23, o48, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 64
|
||||
|
||||
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
xvmaddadp vs34, vs1, vs16 // real*real, imag*real
|
||||
xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
|
||||
|
||||
lxvdsx vs22, o16, BO // load real part from B
|
||||
lxvdsx vs23, o24, BO // load imag part from B
|
||||
|
||||
xvmaddadp vs36, vs2, vs16 // real*real, imag*real
|
||||
xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
|
||||
xvmaddadp vs38, vs3, vs16 // real*real, imag*real
|
||||
xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
|
||||
|
||||
lxvd2x vs8, o0, AO // load real,imag from A
|
||||
lxvd2x vs9, o16, AO // load real,imag from A
|
||||
|
||||
xvmaddadp vs40, vs4, vs16 // real*real, imag*real
|
||||
xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
|
||||
xvmaddadp vs42, vs5, vs16 // real*real, imag*real
|
||||
xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
|
||||
|
||||
lxvd2x vs10, o32, AO // load real,imag from A
|
||||
lxvd2x vs11, o48, AO // load real,imag from A
|
||||
|
||||
xvmaddadp vs44, vs6, vs16 // real*real, imag*real
|
||||
xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
|
||||
|
||||
addi AO, AO, 64
|
||||
|
||||
xvmaddadp vs46, vs7, vs16 // real*real, imag*real
|
||||
xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
|
||||
|
||||
|
|
@ -193,101 +162,79 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag
|
||||
xvmaddadp vs50, vs1, vs18 // real*real, imag*real
|
||||
xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag
|
||||
|
||||
lxvd2x vs12, o0, AO // load real,imag from A
|
||||
lxvd2x vs13, o16, AO // load real,imag from A
|
||||
|
||||
xvmaddadp vs52, vs2, vs18 // real*real, imag*real
|
||||
xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag
|
||||
xvmaddadp vs54, vs3, vs18 // real*real, imag*real
|
||||
xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag
|
||||
|
||||
lxvd2x vs14, o32, AO // load real,imag from A
|
||||
lxvd2x vs15, o48, AO // load real,imag from A
|
||||
|
||||
xvmaddadp vs56, vs4, vs18 // real*real, imag*real
|
||||
xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag
|
||||
xvmaddadp vs58, vs5, vs18 // real*real, imag*real
|
||||
xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag
|
||||
|
||||
lxvdsx vs20, o0, BO // load real part from B
|
||||
lxvdsx vs21, o8, BO // load imag part from B
|
||||
|
||||
xvmaddadp vs60, vs6, vs18 // real*real, imag*real
|
||||
xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag
|
||||
xvmaddadp vs62, vs7, vs18 // real*real, imag*real
|
||||
xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag
|
||||
|
||||
addi AO, AO, 64
|
||||
addi BO, BO, 32
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL2x8_2
|
||||
|
||||
lxvd2x vs0, o0, AO // load real,imag from A
|
||||
lxvd2x vs1, o16, AO // load real,imag from A
|
||||
lxvd2x vs2, o32, AO // load real,imag from A
|
||||
lxvd2x vs3, o48, AO // load real,imag from A
|
||||
|
||||
addi AO, AO, 64
|
||||
|
||||
lxvd2x vs4, o0, AO // load real,imag from A
|
||||
lxvd2x vs5, o16, AO // load real,imag from A
|
||||
lxvd2x vs6, o32, AO // load real,imag from A
|
||||
lxvd2x vs7, o48, AO // load real,imag from A
|
||||
|
||||
addi AO, AO, 64
|
||||
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
lxvd2x vs18, o32, BO // load real part from B
|
||||
lxvd2x vs19, o48, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 64
|
||||
|
||||
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
|
||||
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
|
||||
xvmaddadp vs34, vs9, vs20 // real*real, imag*real
|
||||
xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
|
||||
xvmaddadp vs36, vs10, vs20 // real*real, imag*real
|
||||
xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
|
||||
xvmaddadp vs38, vs11, vs20 // real*real, imag*real
|
||||
xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
|
||||
|
||||
lxvd2x vs0, o0, AO // load real,imag from A
|
||||
lxvd2x vs1, o16, AO // load real,imag from A
|
||||
|
||||
xvmaddadp vs40, vs12, vs20 // real*real, imag*real
|
||||
xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
|
||||
xvmaddadp vs42, vs13, vs20 // real*real, imag*real
|
||||
xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
|
||||
|
||||
lxvd2x vs2, o32, AO // load real,imag from A
|
||||
lxvd2x vs3, o48, AO // load real,imag from A
|
||||
|
||||
xvmaddadp vs44, vs14, vs20 // real*real, imag*real
|
||||
xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
|
||||
xvmaddadp vs46, vs15, vs20 // real*real, imag*real
|
||||
xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
|
||||
|
||||
addi AO, AO, 64
|
||||
|
||||
xvmaddadp vs48, vs8, vs22 // real*real, imag*real
|
||||
xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag
|
||||
xvmaddadp vs50, vs9, vs22 // real*real, imag*real
|
||||
xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag
|
||||
|
||||
lxvd2x vs4, o0, AO // load real,imag from A
|
||||
lxvd2x vs5, o16, AO // load real,imag from A
|
||||
|
||||
xvmaddadp vs52, vs10, vs22 // real*real, imag*real
|
||||
xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag
|
||||
xvmaddadp vs54, vs11, vs22 // real*real, imag*real
|
||||
xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag
|
||||
|
||||
lxvd2x vs6, o32, AO // load real,imag from A
|
||||
lxvd2x vs7, o48, AO // load real,imag from A
|
||||
|
||||
xvmaddadp vs56, vs12, vs22 // real*real, imag*real
|
||||
xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag
|
||||
xvmaddadp vs58, vs13, vs22 // real*real, imag*real
|
||||
xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag
|
||||
|
||||
lxvdsx vs18, o16, BO // load real part from B
|
||||
lxvdsx vs19, o24, BO // load imag part from B
|
||||
|
||||
xvmaddadp vs60, vs14, vs22 // real*real, imag*real
|
||||
xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag
|
||||
xvmaddadp vs62, vs15, vs22 // real*real, imag*real
|
||||
xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag
|
||||
|
||||
addi AO, AO, 64
|
||||
addi BO, BO, 32
|
||||
|
||||
.endm
|
||||
|
||||
|
|
@ -347,12 +294,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 64
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
lxvdsx vs18, o16, BO // load real part from B
|
||||
lxvdsx vs19, o24, BO // load imag part from B
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
lxvd2x vs18, o32, BO // load real part from B
|
||||
lxvd2x vs19, o48, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 32
|
||||
addi BO, BO, 64
|
||||
|
||||
xvmuldp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
|
@ -407,12 +354,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 64
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
lxvdsx vs18, o16, BO // load real part from B
|
||||
lxvdsx vs19, o24, BO // load imag part from B
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
lxvd2x vs18, o32, BO // load real part from B
|
||||
lxvd2x vs19, o48, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 32
|
||||
addi BO, BO, 64
|
||||
|
||||
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
|
@ -927,12 +874,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
.macro LOAD2x4_1
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
lxvdsx vs18, o16, BO // load real part from B
|
||||
lxvdsx vs19, o24, BO // load imag part from B
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
lxvd2x vs18, o32, BO // load real part from B
|
||||
lxvd2x vs19, o48, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 32
|
||||
addi BO, BO, 64
|
||||
|
||||
lxvd2x vs0, o0, AO // load real,imag from A
|
||||
lxvd2x vs1, o16, AO // load real,imag from A
|
||||
|
|
@ -953,12 +900,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 64
|
||||
|
||||
lxvdsx vs20, o0, BO // load real part from B
|
||||
lxvdsx vs21, o8, BO // load imag part from B
|
||||
lxvdsx vs22, o16, BO // load real part from B
|
||||
lxvdsx vs23, o24, BO // load imag part from B
|
||||
lxvd2x vs20, o0, BO // load real part from B
|
||||
lxvd2x vs21, o16, BO // load imag part from B
|
||||
lxvd2x vs22, o32, BO // load real part from B
|
||||
lxvd2x vs23, o48, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 32
|
||||
addi BO, BO, 64
|
||||
|
||||
xvmuldp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
|
@ -990,12 +937,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 64
|
||||
|
||||
lxvdsx vs20, o0, BO // load real part from B
|
||||
lxvdsx vs21, o8, BO // load imag part from B
|
||||
lxvdsx vs22, o16, BO // load real part from B
|
||||
lxvdsx vs23, o24, BO // load imag part from B
|
||||
lxvd2x vs20, o0, BO // load real part from B
|
||||
lxvd2x vs21, o16, BO // load imag part from B
|
||||
lxvd2x vs22, o32, BO // load real part from B
|
||||
lxvd2x vs23, o48, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 32
|
||||
addi BO, BO, 64
|
||||
|
||||
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
|
@ -1027,12 +974,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 64
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
lxvdsx vs18, o16, BO // load real part from B
|
||||
lxvdsx vs19, o24, BO // load imag part from B
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
lxvd2x vs18, o32, BO // load real part from B
|
||||
lxvd2x vs19, o48, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 32
|
||||
addi BO, BO, 64
|
||||
|
||||
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
|
||||
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
|
||||
|
|
@ -1088,12 +1035,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 64
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
lxvdsx vs18, o16, BO // load real part from B
|
||||
lxvdsx vs19, o24, BO // load imag part from B
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
lxvd2x vs18, o32, BO // load real part from B
|
||||
lxvd2x vs19, o48, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 32
|
||||
addi BO, BO, 64
|
||||
|
||||
xvmuldp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
|
@ -1125,12 +1072,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 64
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
lxvdsx vs18, o16, BO // load real part from B
|
||||
lxvdsx vs19, o24, BO // load imag part from B
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
lxvd2x vs18, o32, BO // load real part from B
|
||||
lxvd2x vs19, o48, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 32
|
||||
addi BO, BO, 64
|
||||
|
||||
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
|
@ -1410,12 +1357,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
.macro LOAD2x2_1
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
lxvdsx vs18, o16, BO // load real part from B
|
||||
lxvdsx vs19, o24, BO // load imag part from B
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
lxvd2x vs18, o32, BO // load real part from B
|
||||
lxvd2x vs19, o48, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 32
|
||||
addi BO, BO, 64
|
||||
|
||||
lxvd2x vs0, o0, AO // load real,imag from A
|
||||
lxvd2x vs1, o16, AO // load real,imag from A
|
||||
|
|
@ -1432,12 +1379,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 32
|
||||
|
||||
lxvdsx vs20, o0, BO // load real part from B
|
||||
lxvdsx vs21, o8, BO // load imag part from B
|
||||
lxvdsx vs22, o16, BO // load real part from B
|
||||
lxvdsx vs23, o24, BO // load imag part from B
|
||||
lxvd2x vs20, o0, BO // load real part from B
|
||||
lxvd2x vs21, o16, BO // load imag part from B
|
||||
lxvd2x vs22, o32, BO // load real part from B
|
||||
lxvd2x vs23, o48, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 32
|
||||
addi BO, BO, 64
|
||||
|
||||
xvmuldp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
|
@ -1459,12 +1406,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 32
|
||||
|
||||
lxvdsx vs20, o0, BO // load real part from B
|
||||
lxvdsx vs21, o8, BO // load imag part from B
|
||||
lxvdsx vs22, o16, BO // load real part from B
|
||||
lxvdsx vs23, o24, BO // load imag part from B
|
||||
lxvd2x vs20, o0, BO // load real part from B
|
||||
lxvd2x vs21, o16, BO // load imag part from B
|
||||
lxvd2x vs22, o32, BO // load real part from B
|
||||
lxvd2x vs23, o48, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 32
|
||||
addi BO, BO, 64
|
||||
|
||||
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
|
@ -1486,12 +1433,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 32
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
lxvdsx vs18, o16, BO // load real part from B
|
||||
lxvdsx vs19, o24, BO // load imag part from B
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
lxvd2x vs18, o32, BO // load real part from B
|
||||
lxvd2x vs19, o48, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 32
|
||||
addi BO, BO, 64
|
||||
|
||||
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
|
||||
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
|
||||
|
|
@ -1529,12 +1476,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 32
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
lxvdsx vs18, o16, BO // load real part from B
|
||||
lxvdsx vs19, o24, BO // load imag part from B
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
lxvd2x vs18, o32, BO // load real part from B
|
||||
lxvd2x vs19, o48, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 32
|
||||
addi BO, BO, 64
|
||||
|
||||
xvmuldp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
|
@ -1556,12 +1503,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 32
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
lxvdsx vs18, o16, BO // load real part from B
|
||||
lxvdsx vs19, o24, BO // load imag part from B
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
lxvd2x vs18, o32, BO // load real part from B
|
||||
lxvd2x vs19, o48, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 32
|
||||
addi BO, BO, 64
|
||||
|
||||
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
|
@ -1725,12 +1672,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
.macro LOAD2x1_1
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
lxvdsx vs18, o16, BO // load real part from B
|
||||
lxvdsx vs19, o24, BO // load imag part from B
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
lxvd2x vs18, o32, BO // load real part from B
|
||||
lxvd2x vs19, o48, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 32
|
||||
addi BO, BO, 64
|
||||
|
||||
lxvd2x vs0, o0, AO // load real,imag from A
|
||||
|
||||
|
|
@ -1745,12 +1692,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 16
|
||||
|
||||
lxvdsx vs20, o0, BO // load real part from B
|
||||
lxvdsx vs21, o8, BO // load imag part from B
|
||||
lxvdsx vs22, o16, BO // load real part from B
|
||||
lxvdsx vs23, o24, BO // load imag part from B
|
||||
lxvd2x vs20, o0, BO // load real part from B
|
||||
lxvd2x vs21, o16, BO // load imag part from B
|
||||
lxvd2x vs22, o32, BO // load real part from B
|
||||
lxvd2x vs23, o48, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 32
|
||||
addi BO, BO, 64
|
||||
|
||||
xvmuldp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
|
@ -1767,12 +1714,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 16
|
||||
|
||||
lxvdsx vs20, o0, BO // load real part from B
|
||||
lxvdsx vs21, o8, BO // load imag part from B
|
||||
lxvdsx vs22, o16, BO // load real part from B
|
||||
lxvdsx vs23, o24, BO // load imag part from B
|
||||
lxvd2x vs20, o0, BO // load real part from B
|
||||
lxvd2x vs21, o16, BO // load imag part from B
|
||||
lxvd2x vs22, o32, BO // load real part from B
|
||||
lxvd2x vs23, o48, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 32
|
||||
addi BO, BO, 64
|
||||
|
||||
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
|
@ -1789,12 +1736,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 16
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
lxvdsx vs18, o16, BO // load real part from B
|
||||
lxvdsx vs19, o24, BO // load imag part from B
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
lxvd2x vs18, o32, BO // load real part from B
|
||||
lxvd2x vs19, o48, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 32
|
||||
addi BO, BO, 64
|
||||
|
||||
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
|
||||
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
|
||||
|
|
@ -1823,12 +1770,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 16
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
lxvdsx vs18, o16, BO // load real part from B
|
||||
lxvdsx vs19, o24, BO // load imag part from B
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
lxvd2x vs18, o32, BO // load real part from B
|
||||
lxvd2x vs19, o48, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 32
|
||||
addi BO, BO, 64
|
||||
|
||||
xvmuldp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
|
@ -1845,12 +1792,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 16
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
lxvdsx vs18, o16, BO // load real part from B
|
||||
lxvdsx vs19, o24, BO // load imag part from B
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
lxvd2x vs18, o32, BO // load real part from B
|
||||
lxvd2x vs19, o48, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 32
|
||||
addi BO, BO, 64
|
||||
|
||||
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
|
@ -1956,10 +1903,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
.macro LOAD1x8_1
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 16
|
||||
addi BO, BO, 32
|
||||
|
||||
lxvd2x vs0, o0, AO // load real,imag from A
|
||||
lxvd2x vs1, o16, AO // load real,imag from A
|
||||
|
|
@ -1994,10 +1941,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 64
|
||||
|
||||
lxvdsx vs20, o0, BO // load real part from B
|
||||
lxvdsx vs21, o8, BO // load imag part from B
|
||||
lxvd2x vs20, o0, BO // load real part from B
|
||||
lxvd2x vs21, o16, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 16
|
||||
addi BO, BO, 32
|
||||
|
||||
xvmuldp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
|
@ -2035,10 +1982,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 64
|
||||
|
||||
lxvdsx vs20, o0, BO // load real part from B
|
||||
lxvdsx vs21, o8, BO // load imag part from B
|
||||
lxvd2x vs20, o0, BO // load real part from B
|
||||
lxvd2x vs21, o16, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 16
|
||||
addi BO, BO, 32
|
||||
|
||||
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
|
@ -2076,10 +2023,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 64
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 16
|
||||
addi BO, BO, 32
|
||||
|
||||
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
|
||||
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
|
||||
|
|
@ -2140,10 +2087,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 64
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 16
|
||||
addi BO, BO, 32
|
||||
|
||||
xvmuldp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
|
@ -2181,10 +2128,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 64
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 16
|
||||
addi BO, BO, 32
|
||||
|
||||
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
|
@ -2452,10 +2399,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
.macro LOAD1x4_1
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 16
|
||||
addi BO, BO, 32
|
||||
|
||||
lxvd2x vs0, o0, AO // load real,imag from A
|
||||
lxvd2x vs1, o16, AO // load real,imag from A
|
||||
|
|
@ -2476,10 +2423,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 64
|
||||
|
||||
lxvdsx vs20, o0, BO // load real part from B
|
||||
lxvdsx vs21, o8, BO // load imag part from B
|
||||
lxvd2x vs20, o0, BO // load real part from B
|
||||
lxvd2x vs21, o16, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 16
|
||||
addi BO, BO, 32
|
||||
|
||||
xvmuldp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
|
@ -2502,10 +2449,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 64
|
||||
|
||||
lxvdsx vs20, o0, BO // load real part from B
|
||||
lxvdsx vs21, o8, BO // load imag part from B
|
||||
lxvd2x vs20, o0, BO // load real part from B
|
||||
lxvd2x vs21, o16, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 16
|
||||
addi BO, BO, 32
|
||||
|
||||
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
|
@ -2528,10 +2475,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 64
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 16
|
||||
addi BO, BO, 32
|
||||
|
||||
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
|
||||
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
|
||||
|
|
@ -2569,10 +2516,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 64
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 16
|
||||
addi BO, BO, 32
|
||||
|
||||
xvmuldp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
|
@ -2595,10 +2542,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 64
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 16
|
||||
addi BO, BO, 32
|
||||
|
||||
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
|
@ -2748,10 +2695,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
.macro LOAD1x2_1
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 16
|
||||
addi BO, BO, 32
|
||||
|
||||
lxvd2x vs0, o0, AO // load real,imag from A
|
||||
lxvd2x vs1, o16, AO // load real,imag from A
|
||||
|
|
@ -2768,10 +2715,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 32
|
||||
|
||||
lxvdsx vs20, o0, BO // load real part from B
|
||||
lxvdsx vs21, o8, BO // load imag part from B
|
||||
lxvd2x vs20, o0, BO // load real part from B
|
||||
lxvd2x vs21, o16, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 16
|
||||
addi BO, BO, 32
|
||||
|
||||
xvmuldp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
|
@ -2788,10 +2735,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 32
|
||||
|
||||
lxvdsx vs20, o0, BO // load real part from B
|
||||
lxvdsx vs21, o8, BO // load imag part from B
|
||||
lxvd2x vs20, o0, BO // load real part from B
|
||||
lxvd2x vs21, o16, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 16
|
||||
addi BO, BO, 32
|
||||
|
||||
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
|
@ -2808,10 +2755,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 32
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 16
|
||||
addi BO, BO, 32
|
||||
|
||||
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
|
||||
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
|
||||
|
|
@ -2839,10 +2786,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 32
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 16
|
||||
addi BO, BO, 32
|
||||
|
||||
xvmuldp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
|
@ -2859,10 +2806,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 32
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 16
|
||||
addi BO, BO, 32
|
||||
|
||||
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
|
@ -2954,10 +2901,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
.macro LOAD1x1_1
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 16
|
||||
addi BO, BO, 32
|
||||
|
||||
lxvd2x vs0, o0, AO // load real,imag from A
|
||||
|
||||
|
|
@ -2972,10 +2919,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 16
|
||||
|
||||
lxvdsx vs20, o0, BO // load real part from B
|
||||
lxvdsx vs21, o8, BO // load imag part from B
|
||||
lxvd2x vs20, o0, BO // load real part from B
|
||||
lxvd2x vs21, o16, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 16
|
||||
addi BO, BO, 32
|
||||
|
||||
xvmuldp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
|
@ -2989,10 +2936,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 16
|
||||
|
||||
lxvdsx vs20, o0, BO // load real part from B
|
||||
lxvdsx vs21, o8, BO // load imag part from B
|
||||
lxvd2x vs20, o0, BO // load real part from B
|
||||
lxvd2x vs21, o16, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 16
|
||||
addi BO, BO, 32
|
||||
|
||||
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
|
@ -3006,10 +2953,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 16
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 16
|
||||
addi BO, BO, 32
|
||||
|
||||
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
|
||||
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
|
||||
|
|
@ -3032,10 +2979,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 16
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 16
|
||||
addi BO, BO, 32
|
||||
|
||||
xvmuldp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
|
@ -3049,10 +2996,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 16
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 16
|
||||
addi BO, BO, 32
|
||||
|
||||
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
|
|
|||
|
|
@ -0,0 +1,176 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/27 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#pragma GCC optimize "O1"
|
||||
|
||||
#if defined(POWER8)
|
||||
#include "zscal_microk_power8.c"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_8
|
||||
|
||||
/*
 * Portable fallback for the complex scaling kernel, compiled only when no
 * optimized kernel has defined HAVE_KERNEL_8.
 *
 * x holds interleaved (real, imag) pairs and is updated in place.  alpha is
 * a four-entry table: alpha[0] multiplies the real part and alpha[1] the
 * imaginary part of each element, alpha[2] weights the imaginary part that
 * is added into the new real part, and alpha[3] weights the real part that
 * is added into the new imaginary part.
 *
 * Elements are processed in groups of four; a group is always processed in
 * full, so n is expected to be a multiple of four.
 */
static void zscal_kernel_8(BLASLONG n, FLOAT *x, FLOAT *alpha)
{
	const FLOAT re_scale = alpha[0];
	const FLOAT im_scale = alpha[1];
	const FLOAT re_cross = alpha[2];
	const FLOAT im_cross = alpha[3];
	FLOAT *group = x;
	BLASLONG done;
	int k;

	for ( done = 0; done < n; done += 4 )
	{
		for ( k = 0; k < 4; k++ )
		{
			const FLOAT re = group[2 * k];
			const FLOAT im = group[2 * k + 1];

			/* Products first, cross terms added afterwards. */
			FLOAT new_re = re * re_scale;
			FLOAT new_im = im * im_scale;

			new_re += im * re_cross;
			new_im += re * im_cross;

			group[2 * k]     = new_re;
			group[2 * k + 1] = new_im;
		}

		group += 8;
	}
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * Scale a complex vector in place: every element of x is multiplied by the
 * complex scalar (da_r + i*da_i).
 *
 * x stores interleaved (real, imag) pairs; inc_x is the stride between
 * consecutive elements, counted in complex elements.  Nothing is done and 0
 * is returned when n <= 0 or inc_x <= 0.  y, inc_y and the dummy arguments
 * are not used.
 *
 * For unit stride the leading multiple-of-8 part of the vector is handed to
 * zscal_kernel_8; whatever remains (or the whole vector for other strides)
 * is scaled one element at a time.  Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	FLOAT alpha[4] __attribute__ ((aligned (16)));
	BLASLONG done = 0;	/* complex elements already scaled */
	BLASLONG pos = 0;	/* index of the real part of the next element */
	BLASLONG step;
	BLASLONG bulk;
	FLOAT new_re;

	if ( n <= 0 || inc_x <= 0 )
		return(0);

	if ( inc_x == 1 )
	{
		bulk = n & -8;
		if ( bulk > 0 )
		{
			/* Coefficient table in the layout zscal_kernel_8 reads. */
			alpha[0] = da_r;
			alpha[1] = da_r;
			alpha[2] = -da_i;
			alpha[3] = da_i;
			zscal_kernel_8(bulk, x, alpha);
			done = bulk;
			pos = bulk * 2;
		}
	}

	/* Two FLOATs per complex element. */
	step = 2 * inc_x;

	for ( ; done < n; done++, pos += step )
	{
		/* The imaginary part must be computed before x[pos] is overwritten. */
		new_re   = da_r * x[pos]   - da_i * x[pos+1] ;
		x[pos+1] = da_r * x[pos+1] + da_i * x[pos] ;
		x[pos]   = new_re;
	}

	return(0);
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,224 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
|
||||
*
|
||||
* I don't use fused multiply-add ( lapack precision problems )
|
||||
*
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_8 1
|
||||
|
||||
static void zscal_kernel_8( BLASLONG n, FLOAT *x, FLOAT *alpha) __attribute__ ((noinline));

/*
 * POWER8 VSX kernel: scales the complex vector x in place.
 *
 * x holds interleaved (real, imag) doubles.  alpha[0..1] is loaded into vs32
 * and alpha[2..3] into vs33.  For each element the kernel computes
 *     (x_r, x_i) * vs32  +  (x_i, x_r) * vs33
 * using separate multiplies and adds (xvmuldp / xvadddp, no fused
 * multiply-add).
 *
 * Eight elements (128 bytes) are handled per iteration.  The first eight are
 * loaded before the loop and the last eight are finished after it, so the
 * code always touches at least eight elements; n is expected to be a
 * positive multiple of 8.
 *
 * Inline-asm contract:
 *   - the counter and both pointers are changed by the asm, so they are
 *     in/out ("+") operands;
 *   - the pointers and the byte offsets appear in the RA slot of
 *     addi/lxvd2x/stxvd2x/dcbt, where r0 would be read as the constant 0,
 *     hence constraint "b" (any GPR except r0);
 *   - every VSX register written by the code is listed as clobbered
 *     (vs52-vs63 overlay v20-v31, which the PowerPC ABIs treat as
 *     callee-saved).
 */
static void zscal_kernel_8( BLASLONG n, FLOAT *x, FLOAT *alpha)
{
	BLASLONG i = n;
	BLASLONG o16 = 16;
	BLASLONG o32 = 32;
	BLASLONG o48 = 48;
	BLASLONG o64 = 64;
	BLASLONG o80 = 80;
	BLASLONG o96 = 96;
	BLASLONG o112 = 112;
	BLASLONG pre = 384;	/* prefetch distance in bytes */
	FLOAT *dst = x;		/* store pointer */
	FLOAT *src = x;		/* load pointer, runs one block ahead of dst */

	__asm__ __volatile__
	(

	"lxvd2x		32, 0, %3	\n\t"	// alpha_r , alpha_r
	"lxvd2x		33, %5, %3	\n\t"	// -alpha_i , alpha_i

	"dcbt		%2, %4		\n\t"

	"lxvd2x		40, 0, %2	\n\t"	// x0_r, x0_i
	"lxvd2x		41, %5, %2	\n\t"
	"lxvd2x		42, %6, %2	\n\t"
	"lxvd2x		43, %7, %2	\n\t"
	"lxvd2x		44, %8, %2	\n\t"
	"lxvd2x		45, %9, %2	\n\t"
	"lxvd2x		46, %10, %2	\n\t"
	"lxvd2x		47, %11, %2	\n\t"

	"addi		%2, %2, 128	\n\t"

	"addic.		%0 , %0 , -8	\n\t"
	"ble		2f		\n\t"

	".align 5			\n\t"
	"1:				\n\t"

	"dcbt		%2, %4		\n\t"

	"xvmuldp	48, 40, 32	\n\t"	// x0_r * alpha_r, x0_i * alpha_r
	"xvmuldp	49, 41, 32	\n\t"
	"xvmuldp	50, 42, 32	\n\t"
	"xvmuldp	51, 43, 32	\n\t"
	"xvmuldp	52, 44, 32	\n\t"
	"xvmuldp	53, 45, 32	\n\t"
	"xvmuldp	54, 46, 32	\n\t"
	"xvmuldp	55, 47, 32	\n\t"

	"xxswapd	56, 40		\n\t"
	"xxswapd	57, 41		\n\t"
	"xxswapd	58, 42		\n\t"
	"xxswapd	59, 43		\n\t"
	"xxswapd	60, 44		\n\t"
	"xxswapd	61, 45		\n\t"
	"xxswapd	62, 46		\n\t"
	"xxswapd	63, 47		\n\t"

	"xvmuldp	56, 56, 33	\n\t"	// x0_i * -alpha_i, x0_r * alpha_i
	"xvmuldp	57, 57, 33	\n\t"

	"lxvd2x		40, 0, %2	\n\t"	// x0_r, x0_i
	"lxvd2x		41, %5, %2	\n\t"

	"xvmuldp	58, 58, 33	\n\t"
	"xvmuldp	59, 59, 33	\n\t"

	"lxvd2x		42, %6, %2	\n\t"
	"lxvd2x		43, %7, %2	\n\t"

	"xvmuldp	60, 60, 33	\n\t"
	"xvmuldp	61, 61, 33	\n\t"

	"lxvd2x		44, %8, %2	\n\t"
	"lxvd2x		45, %9, %2	\n\t"

	"xvmuldp	62, 62, 33	\n\t"
	"xvmuldp	63, 63, 33	\n\t"

	"lxvd2x		46, %10, %2	\n\t"
	"lxvd2x		47, %11, %2	\n\t"

	"xvadddp	48, 48 , 56	\n\t"
	"xvadddp	49, 49 , 57	\n\t"
	"xvadddp	50, 50 , 58	\n\t"
	"xvadddp	51, 51 , 59	\n\t"

	"stxvd2x	48, 0, %1	\n\t"
	"stxvd2x	49, %5, %1	\n\t"

	"xvadddp	52, 52 , 60	\n\t"
	"xvadddp	53, 53 , 61	\n\t"

	"stxvd2x	50, %6, %1	\n\t"
	"stxvd2x	51, %7, %1	\n\t"

	"xvadddp	54, 54 , 62	\n\t"
	"xvadddp	55, 55 , 63	\n\t"

	"stxvd2x	52, %8, %1	\n\t"
	"stxvd2x	53, %9, %1	\n\t"
	"stxvd2x	54, %10, %1	\n\t"
	"stxvd2x	55, %11, %1	\n\t"

	"addi		%1, %1, 128	\n\t"
	"addi		%2, %2, 128	\n\t"

	"addic.		%0 , %0 , -8	\n\t"
	"bgt		1b		\n\t"

	"2:				\n\t"

	// Finish the block that was loaded last; nothing further is loaded.
	"xvmuldp	48, 40, 32	\n\t"	// x0_r * alpha_r, x0_i * alpha_r
	"xvmuldp	49, 41, 32	\n\t"
	"xvmuldp	50, 42, 32	\n\t"
	"xvmuldp	51, 43, 32	\n\t"
	"xvmuldp	52, 44, 32	\n\t"
	"xvmuldp	53, 45, 32	\n\t"
	"xvmuldp	54, 46, 32	\n\t"
	"xvmuldp	55, 47, 32	\n\t"

	"xxswapd	56, 40		\n\t"
	"xxswapd	57, 41		\n\t"
	"xxswapd	58, 42		\n\t"
	"xxswapd	59, 43		\n\t"
	"xxswapd	60, 44		\n\t"
	"xxswapd	61, 45		\n\t"
	"xxswapd	62, 46		\n\t"
	"xxswapd	63, 47		\n\t"

	"xvmuldp	56, 56, 33	\n\t"	// x0_i * -alpha_i, x0_r * alpha_i
	"xvmuldp	57, 57, 33	\n\t"
	"xvmuldp	58, 58, 33	\n\t"
	"xvmuldp	59, 59, 33	\n\t"
	"xvmuldp	60, 60, 33	\n\t"
	"xvmuldp	61, 61, 33	\n\t"
	"xvmuldp	62, 62, 33	\n\t"
	"xvmuldp	63, 63, 33	\n\t"

	"xvadddp	48, 48 , 56	\n\t"
	"xvadddp	49, 49 , 57	\n\t"
	"xvadddp	50, 50 , 58	\n\t"
	"xvadddp	51, 51 , 59	\n\t"
	"xvadddp	52, 52 , 60	\n\t"
	"xvadddp	53, 53 , 61	\n\t"
	"xvadddp	54, 54 , 62	\n\t"
	"xvadddp	55, 55 , 63	\n\t"

	"stxvd2x	48, 0, %1	\n\t"
	"stxvd2x	49, %5, %1	\n\t"
	"stxvd2x	50, %6, %1	\n\t"
	"stxvd2x	51, %7, %1	\n\t"
	"stxvd2x	52, %8, %1	\n\t"
	"stxvd2x	53, %9, %1	\n\t"
	"stxvd2x	54, %10, %1	\n\t"
	"stxvd2x	55, %11, %1	\n\t"

	:
	  "+r" (i),	// 0
	  "+b" (dst),	// 1
	  "+b" (src)	// 2
	:
	  "b" (alpha),	// 3
	  "b" (pre),	// 4
	  "b" (o16),	// 5
	  "b" (o32),	// 6
	  "b" (o48),	// 7
	  "b" (o64),	// 8
	  "b" (o80),	// 9
	  "b" (o96),	// 10
	  "b" (o112)	// 11
	:
	  "cr0", "memory",
	  "vs32", "vs33",
	  "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47",
	  "vs48", "vs49", "vs50", "vs51", "vs52", "vs53", "vs54", "vs55",
	  "vs56", "vs57", "vs58", "vs59", "vs60", "vs61", "vs62", "vs63"
	);

}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,175 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/27 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(POWER8)
|
||||
#include "zswap_microk_power8.c"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_16
|
||||
|
||||
/*
 * Portable fallback for the complex swap kernel, compiled only when no
 * optimized kernel has defined HAVE_KERNEL_16.
 *
 * Exchanges the contents of x and y, both stored as interleaved
 * (real, imag) pairs.  Each pass moves four complex elements (eight FLOATs):
 * both sides are read completely into temporaries before either side is
 * written.  A group is always processed in full, so n is expected to be a
 * multiple of four.
 */
static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
{
	FLOAT from_x[8];
	FLOAT from_y[8];
	FLOAT *px = x;
	FLOAT *py = y;
	BLASLONG done;
	int k;

	for ( done = 0; done < n; done += 4 )
	{
		/* Read phase: x first, then y. */
		for ( k = 0; k < 8; k++ )
			from_x[k] = px[k];
		for ( k = 0; k < 8; k++ )
			from_y[k] = py[k];

		/* Write phase: y first, then x. */
		for ( k = 0; k < 8; k++ )
			py[k] = from_x[k];
		for ( k = 0; k < 8; k++ )
			px[k] = from_y[k];

		px += 8;
		py += 8;
	}
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/*
 * Swap two complex vectors element by element.
 *
 * x and y store interleaved (real, imag) pairs; inc_x and inc_y are the
 * strides between consecutive elements, counted in complex elements.
 * Nothing is done when n <= 0.  The dummy arguments are not used.
 *
 * When both strides are 1 the leading multiple-of-16 part is handed to
 * zswap_kernel_16; whatever remains (or the whole vector for other strides)
 * is swapped one element at a time.  Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG done = 0;		/* complex elements already swapped */
	BLASLONG xpos = 0, ypos = 0;	/* index of the next real part in x / y */
	BLASLONG xstep, ystep;
	BLASLONG bulk;
	FLOAT saved_re, saved_im;

	if ( n <= 0 ) return(0);

	if ( (inc_x == 1) && (inc_y == 1) )
	{
		bulk = n & -16;
		if ( bulk > 0 )
		{
			zswap_kernel_16(bulk, x, y);
			done = bulk;
			xpos = 2 * bulk;
			ypos = 2 * bulk;
		}
	}

	/* Two FLOATs per complex element. */
	xstep = 2 * inc_x;
	ystep = 2 * inc_y;

	for ( ; done < n; done++ )
	{
		saved_re = x[xpos] ;
		saved_im = x[xpos+1] ;
		x[xpos]   = y[ypos] ;
		x[xpos+1] = y[ypos+1] ;
		y[ypos]   = saved_re ;
		y[ypos+1] = saved_im ;

		xpos += xstep ;
		ypos += ystep ;
	}

	return(0);
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,180 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/27 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_16 1
|
||||
|
||||
static void zswap_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));

/*
 * POWER8 VSX kernel: exchanges the contents of the complex vectors x and y.
 *
 * Each iteration loads 256 bytes (16 complex doubles) from x into vs32-vs47
 * and 256 bytes from y into vs48-vs63, then stores the x data to y and the
 * y data to x.  The loop body runs at least once and the counter decreases
 * by 16 per pass, so n is expected to be a positive multiple of 16.
 *
 * Inline-asm contract:
 *   - the counter and all four pointers are changed by the asm, so they are
 *     in/out ("+") operands;
 *   - the pointers and the byte offsets appear in the RA slot of
 *     addi/lxvd2x/stxvd2x, where r0 would be read as the constant 0, hence
 *     constraint "b" (any GPR except r0);
 *   - every VSX register written by the code is listed as clobbered
 *     (vs52-vs63 overlay v20-v31, which the PowerPC ABIs treat as
 *     callee-saved).
 */
static void zswap_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y)
{
	BLASLONG i = n;
	BLASLONG o16 = 16;
	BLASLONG o32 = 32;
	BLASLONG o48 = 48;
	BLASLONG o64 = 64;
	BLASLONG o80 = 80;
	BLASLONG o96 = 96;
	BLASLONG o112 = 112;
	FLOAT *y_src = y;	/* load pointer into y */
	FLOAT *x_src = x;	/* load pointer into x */
	FLOAT *y_dst = y;	/* store pointer into y */
	FLOAT *x_dst = x;	/* store pointer into x */

	__asm__ __volatile__
	(

	".align 5			\n\t"
	"1:				\n\t"

	"lxvd2x		32, 0, %2	\n\t"
	"lxvd2x		33, %5, %2	\n\t"
	"lxvd2x		34, %6, %2	\n\t"
	"lxvd2x		35, %7, %2	\n\t"
	"lxvd2x		36, %8, %2	\n\t"
	"lxvd2x		37, %9, %2	\n\t"
	"lxvd2x		38, %10, %2	\n\t"
	"lxvd2x		39, %11, %2	\n\t"

	"addi		%2, %2, 128	\n\t"

	"lxvd2x		40, 0, %2	\n\t"
	"lxvd2x		41, %5, %2	\n\t"
	"lxvd2x		42, %6, %2	\n\t"
	"lxvd2x		43, %7, %2	\n\t"
	"lxvd2x		44, %8, %2	\n\t"
	"lxvd2x		45, %9, %2	\n\t"
	"lxvd2x		46, %10, %2	\n\t"
	"lxvd2x		47, %11, %2	\n\t"

	"addi		%2, %2, 128	\n\t"

	"lxvd2x		48, 0, %1	\n\t"
	"lxvd2x		49, %5, %1	\n\t"
	"lxvd2x		50, %6, %1	\n\t"
	"lxvd2x		51, %7, %1	\n\t"
	"lxvd2x		52, %8, %1	\n\t"
	"lxvd2x		53, %9, %1	\n\t"
	"lxvd2x		54, %10, %1	\n\t"
	"lxvd2x		55, %11, %1	\n\t"

	"addi		%1, %1, 128	\n\t"

	"lxvd2x		56, 0, %1	\n\t"
	"lxvd2x		57, %5, %1	\n\t"
	"lxvd2x		58, %6, %1	\n\t"
	"lxvd2x		59, %7, %1	\n\t"
	"lxvd2x		60, %8, %1	\n\t"
	"lxvd2x		61, %9, %1	\n\t"
	"lxvd2x		62, %10, %1	\n\t"
	"lxvd2x		63, %11, %1	\n\t"

	"addi		%1, %1, 128	\n\t"

	// Data read from x goes to y ...
	"stxvd2x	32, 0, %3	\n\t"
	"stxvd2x	33, %5, %3	\n\t"
	"stxvd2x	34, %6, %3	\n\t"
	"stxvd2x	35, %7, %3	\n\t"
	"stxvd2x	36, %8, %3	\n\t"
	"stxvd2x	37, %9, %3	\n\t"
	"stxvd2x	38, %10, %3	\n\t"
	"stxvd2x	39, %11, %3	\n\t"

	"addi		%3, %3, 128	\n\t"

	"stxvd2x	40, 0, %3	\n\t"
	"stxvd2x	41, %5, %3	\n\t"
	"stxvd2x	42, %6, %3	\n\t"
	"stxvd2x	43, %7, %3	\n\t"
	"stxvd2x	44, %8, %3	\n\t"
	"stxvd2x	45, %9, %3	\n\t"
	"stxvd2x	46, %10, %3	\n\t"
	"stxvd2x	47, %11, %3	\n\t"

	"addi		%3, %3, 128	\n\t"

	// ... and data read from y goes to x.
	"stxvd2x	48, 0, %4	\n\t"
	"stxvd2x	49, %5, %4	\n\t"
	"stxvd2x	50, %6, %4	\n\t"
	"stxvd2x	51, %7, %4	\n\t"
	"stxvd2x	52, %8, %4	\n\t"
	"stxvd2x	53, %9, %4	\n\t"
	"stxvd2x	54, %10, %4	\n\t"
	"stxvd2x	55, %11, %4	\n\t"

	"addi		%4, %4, 128	\n\t"

	"stxvd2x	56, 0, %4	\n\t"
	"stxvd2x	57, %5, %4	\n\t"
	"stxvd2x	58, %6, %4	\n\t"
	"stxvd2x	59, %7, %4	\n\t"
	"stxvd2x	60, %8, %4	\n\t"
	"stxvd2x	61, %9, %4	\n\t"
	"stxvd2x	62, %10, %4	\n\t"
	"stxvd2x	63, %11, %4	\n\t"

	"addi		%4, %4, 128	\n\t"

	"addic.		%0 , %0 , -16	\n\t"
	"bgt		1b		\n\t"

	:
	  "+r" (i),	// 0
	  "+b" (y_src),	// 1
	  "+b" (x_src),	// 2
	  "+b" (y_dst),	// 3
	  "+b" (x_dst)	// 4
	:
	  "b" (o16),	// 5
	  "b" (o32),	// 6
	  "b" (o48),	// 7
	  "b" (o64),	// 8
	  "b" (o80),	// 9
	  "b" (o96),	// 10
	  "b" (o112)	// 11
	:
	  "cr0", "memory",
	  "vs32", "vs33", "vs34", "vs35", "vs36", "vs37", "vs38", "vs39",
	  "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47",
	  "vs48", "vs49", "vs50", "vs51", "vs52", "vs53", "vs54", "vs55",
	  "vs56", "vs57", "vs58", "vs59", "vs60", "vs61", "vs62", "vs63"
	);

}
|
||||
|
||||
|
||||
|
|
@ -271,7 +271,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#include "zgemm_macros_8x2_power8.S"
|
||||
#include "ztrmm_macros_8x2_power8.S"
|
||||
|
||||
cmpwi cr0, M, 0
|
||||
ble .L999
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
|
|
@ -24,7 +24,7 @@ SGEMVTKERNEL = sgemv_t_4.c
|
|||
DGEMVNKERNEL = dgemv_n_4.c
|
||||
DGEMVTKERNEL = dgemv_t_4.c
|
||||
|
||||
ZGEMVNKERNEL = zgemv_t_4.c
|
||||
ZGEMVNKERNEL = zgemv_n_4.c
|
||||
ZGEMVTKERNEL = zgemv_t_4.c
|
||||
|
||||
DCOPYKERNEL = dcopy_bulldozer.S
|
||||
|
|
|
|||
|
|
@ -72,18 +72,20 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
{
|
||||
BLASLONG i=0;
|
||||
BLASLONG ix=0,iy=0;
|
||||
double dot = 0.0 ;
|
||||
|
||||
FLOAT dot = 0.0 ;
|
||||
FLOAT mydot=0.0;
|
||||
BLASLONG n1;
|
||||
|
||||
if ( n <= 0 ) return(dot);
|
||||
|
||||
if ( (inc_x == 1) && (inc_y == 1) )
|
||||
{
|
||||
|
||||
BLASLONG n1 = n & -32;
|
||||
n1 = n & (BLASLONG)(-32);
|
||||
|
||||
if ( n1 )
|
||||
sdot_kernel_16(n1, x, y , &dot );
|
||||
sdot_kernel_16(n1, x, y , &mydot );
|
||||
|
||||
|
||||
i = n1;
|
||||
|
|
@ -94,12 +96,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
i++ ;
|
||||
|
||||
}
|
||||
dot+=mydot;
|
||||
return(dot);
|
||||
|
||||
|
||||
}
|
||||
|
||||
BLASLONG n1 = n & -2;
|
||||
n1 = n & (BLASLONG)(-2);
|
||||
|
||||
while(i < n1)
|
||||
{
|
||||
|
|
@ -124,4 +127,3 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
31
param.h
31
param.h
|
|
@ -1961,35 +1961,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#if defined(POWER8)
|
||||
|
||||
#define SNUMOPT 4
|
||||
#define SNUMOPT 16
|
||||
#define DNUMOPT 8
|
||||
|
||||
#define GEMM_DEFAULT_OFFSET_A 384
|
||||
#define GEMM_DEFAULT_OFFSET_B 1024
|
||||
#define GEMM_DEFAULT_OFFSET_A 4096
|
||||
#define GEMM_DEFAULT_OFFSET_B 4096
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 4
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
#define SGEMM_DEFAULT_UNROLL_M 16
|
||||
#define SGEMM_DEFAULT_UNROLL_N 8
|
||||
#define DGEMM_DEFAULT_UNROLL_M 16
|
||||
#define DGEMM_DEFAULT_UNROLL_N 4
|
||||
#define CGEMM_DEFAULT_UNROLL_M 2
|
||||
#define CGEMM_DEFAULT_UNROLL_N 2
|
||||
#define CGEMM_DEFAULT_UNROLL_M 8
|
||||
#define CGEMM_DEFAULT_UNROLL_N 4
|
||||
#define ZGEMM_DEFAULT_UNROLL_M 8
|
||||
#define ZGEMM_DEFAULT_UNROLL_N 2
|
||||
|
||||
#define SGEMM_DEFAULT_P 992
|
||||
#define SGEMM_DEFAULT_P 960
|
||||
#define DGEMM_DEFAULT_P 480
|
||||
#define CGEMM_DEFAULT_P 488
|
||||
#define ZGEMM_DEFAULT_P 240
|
||||
#define CGEMM_DEFAULT_P 720
|
||||
#define ZGEMM_DEFAULT_P 480
|
||||
|
||||
#define SGEMM_DEFAULT_Q 504
|
||||
#define SGEMM_DEFAULT_Q 720
|
||||
#define DGEMM_DEFAULT_Q 720
|
||||
#define CGEMM_DEFAULT_Q 400
|
||||
#define ZGEMM_DEFAULT_Q 360
|
||||
#define CGEMM_DEFAULT_Q 720
|
||||
#define ZGEMM_DEFAULT_Q 720
|
||||
|
||||
#define SGEMM_DEFAULT_R 28800
|
||||
#define SGEMM_DEFAULT_R 21600
|
||||
#define DGEMM_DEFAULT_R 14400
|
||||
#define ZGEMM_DEFAULT_R 7200
|
||||
#define CGEMM_DEFAULT_R 16200
|
||||
#define ZGEMM_DEFAULT_R 21600
|
||||
|
||||
#define SYMV_P 8
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue