Merge branch 'develop'

This commit is contained in:
Zhang Xianyi 2016-04-12 15:29:19 -04:00
commit 12ab1804b6
97 changed files with 50538 additions and 3900 deletions

View File

@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.4)
project(OpenBLAS) project(OpenBLAS)
set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 2) set(OpenBLAS_MINOR_VERSION 2)
set(OpenBLAS_PATCH_VERSION 17) set(OpenBLAS_PATCH_VERSION 18)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
enable_language(ASM) enable_language(ASM)

View File

@ -147,5 +147,6 @@ In chronological order:
* [2016-03-14] Additional functional Assembly Kernels for Cortex-A57 * [2016-03-14] Additional functional Assembly Kernels for Cortex-A57
* [2016-03-14] Optimize Dgemm 4x4 for Cortex-A57 * [2016-03-14] Optimize Dgemm 4x4 for Cortex-A57
* [Your name or handle] <[email or website]> * theoractice <https://github.com/theoractice/>
* [Date] [Brief summary of your changes] * [2016-03-20] Fix compiler error in VisualStudio with CMake
* [2016-03-22] Fix access violation on Windows while static linking

View File

@ -1,4 +1,22 @@
OpenBLAS ChangeLog OpenBLAS ChangeLog
====================================================================
Version 0.2.18
12-Apr-2016
common:
* If you set the MAKE_NB_JOBS flag to a value less than or equal to zero,
make will be invoked without the -j option.
x86/x86_64:
* Support building Visual Studio static library. (#813, Thanks, theoractice)
* Fix bugs to pass buildbot CI tests (http://build.openblas.net)
ARM:
* Provide DGEMM 8x4 kernel for Cortex-A57 (Thanks, Ashwin Sekhar T K)
POWER:
* Optimize S and C BLAS3 on Power8
* Optimize BLAS2/1 on Power8
==================================================================== ====================================================================
Version 0.2.17 Version 0.2.17
20-Mar-2016 20-Mar-2016

View File

@ -3,7 +3,7 @@
# #
# This library's version # This library's version
VERSION = 0.2.17 VERSION = 0.2.18
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@ -112,7 +112,10 @@ NO_AFFINITY = 1
# NO_PARALLEL_MAKE = 1 # NO_PARALLEL_MAKE = 1
# Force number of make jobs. The default is the number of logical CPU of the host. # Force number of make jobs. The default is the number of logical CPU of the host.
# This is particularly useful when using distcc # This is particularly useful when using distcc.
# A negative value will disable adding a -j flag to make, allowing use of a
# parent make -j value. This is useful when calling the OpenBLAS make from
# another project's makefile
# MAKE_NB_JOBS = 2 # MAKE_NB_JOBS = 2
# If you would like to know minute performance report of GotoBLAS. # If you would like to know minute performance report of GotoBLAS.

View File

@ -1,4 +1,4 @@
version: 0.2.15.{build} version: 0.2.18.{build}
#environment: #environment:

View File

@ -33,6 +33,10 @@ LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread
# Apple vecLib # Apple vecLib
LIBVECLIB = -framework Accelerate LIBVECLIB = -framework Accelerate
ESSL=/opt/ibm/lib
#LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
ifeq ($(OSNAME), WINNT) ifeq ($(OSNAME), WINNT)
goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
@ -44,6 +48,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \
sger.goto dger.goto cger.goto zger.goto \ sger.goto dger.goto cger.goto zger.goto \
sdot.goto ddot.goto \ sdot.goto ddot.goto \
srot.goto drot.goto \
saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \
scopy.goto dcopy.goto ccopy.goto zcopy.goto \ scopy.goto dcopy.goto ccopy.goto zcopy.goto \
sswap.goto dswap.goto cswap.goto zswap.goto \ sswap.goto dswap.goto cswap.goto zswap.goto \
@ -151,6 +156,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \
sger.goto dger.goto cger.goto zger.goto \ sger.goto dger.goto cger.goto zger.goto \
sdot.goto ddot.goto cdot.goto zdot.goto \ sdot.goto ddot.goto cdot.goto zdot.goto \
srot.goto drot.goto \
saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \
scopy.goto dcopy.goto ccopy.goto zcopy.goto \ scopy.goto dcopy.goto ccopy.goto zcopy.goto \
sswap.goto dswap.goto cswap.goto zswap.goto \ sswap.goto dswap.goto cswap.goto zswap.goto \
@ -253,7 +259,9 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
endif endif
essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \
cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \
slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl
veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \
scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \ scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \
@ -306,6 +314,9 @@ slinpack.mkl : slinpack.$(SUFFIX)
slinpack.veclib : slinpack.$(SUFFIX) slinpack.veclib : slinpack.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
slinpack.essl : slinpack.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Dlinpack #################################################### ##################################### Dlinpack ####################################################
dlinpack.goto : dlinpack.$(SUFFIX) ../$(LIBNAME) dlinpack.goto : dlinpack.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
@ -322,6 +333,9 @@ dlinpack.mkl : dlinpack.$(SUFFIX)
dlinpack.veclib : dlinpack.$(SUFFIX) dlinpack.veclib : dlinpack.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
dlinpack.essl : dlinpack.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Clinpack #################################################### ##################################### Clinpack ####################################################
clinpack.goto : clinpack.$(SUFFIX) ../$(LIBNAME) clinpack.goto : clinpack.$(SUFFIX) ../$(LIBNAME)
@ -339,6 +353,9 @@ clinpack.mkl : clinpack.$(SUFFIX)
clinpack.veclib : clinpack.$(SUFFIX) clinpack.veclib : clinpack.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
clinpack.essl : clinpack.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Zlinpack #################################################### ##################################### Zlinpack ####################################################
zlinpack.goto : zlinpack.$(SUFFIX) ../$(LIBNAME) zlinpack.goto : zlinpack.$(SUFFIX) ../$(LIBNAME)
@ -356,6 +373,9 @@ zlinpack.mkl : zlinpack.$(SUFFIX)
zlinpack.veclib : zlinpack.$(SUFFIX) zlinpack.veclib : zlinpack.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
zlinpack.essl : zlinpack.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Scholesky ################################################### ##################################### Scholesky ###################################################
scholesky.goto : scholesky.$(SUFFIX) ../$(LIBNAME) scholesky.goto : scholesky.$(SUFFIX) ../$(LIBNAME)
@ -441,6 +461,9 @@ sgemm.mkl : sgemm.$(SUFFIX)
sgemm.veclib : sgemm.$(SUFFIX) sgemm.veclib : sgemm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
sgemm.essl : sgemm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Dgemm #################################################### ##################################### Dgemm ####################################################
dgemm.goto : dgemm.$(SUFFIX) ../$(LIBNAME) dgemm.goto : dgemm.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
@ -457,6 +480,9 @@ dgemm.mkl : dgemm.$(SUFFIX)
dgemm.veclib : dgemm.$(SUFFIX) dgemm.veclib : dgemm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
dgemm.essl : dgemm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Cgemm #################################################### ##################################### Cgemm ####################################################
cgemm.goto : cgemm.$(SUFFIX) ../$(LIBNAME) cgemm.goto : cgemm.$(SUFFIX) ../$(LIBNAME)
@ -474,6 +500,9 @@ cgemm.mkl : cgemm.$(SUFFIX)
cgemm.veclib : cgemm.$(SUFFIX) cgemm.veclib : cgemm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
cgemm.essl : cgemm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Zgemm #################################################### ##################################### Zgemm ####################################################
zgemm.goto : zgemm.$(SUFFIX) ../$(LIBNAME) zgemm.goto : zgemm.$(SUFFIX) ../$(LIBNAME)
@ -491,6 +520,9 @@ zgemm.mkl : zgemm.$(SUFFIX)
zgemm.veclib : zgemm.$(SUFFIX) zgemm.veclib : zgemm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
zgemm.essl : zgemm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Ssymm #################################################### ##################################### Ssymm ####################################################
ssymm.goto : ssymm.$(SUFFIX) ../$(LIBNAME) ssymm.goto : ssymm.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
@ -573,6 +605,9 @@ strmm.mkl : strmm.$(SUFFIX)
strmm.veclib : strmm.$(SUFFIX) strmm.veclib : strmm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
strmm.essl : strmm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Dtrmm #################################################### ##################################### Dtrmm ####################################################
dtrmm.goto : dtrmm.$(SUFFIX) ../$(LIBNAME) dtrmm.goto : dtrmm.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
@ -589,6 +624,9 @@ dtrmm.mkl : dtrmm.$(SUFFIX)
dtrmm.veclib : dtrmm.$(SUFFIX) dtrmm.veclib : dtrmm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
dtrmm.essl : dtrmm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Ctrmm #################################################### ##################################### Ctrmm ####################################################
ctrmm.goto : ctrmm.$(SUFFIX) ../$(LIBNAME) ctrmm.goto : ctrmm.$(SUFFIX) ../$(LIBNAME)
@ -606,6 +644,9 @@ ctrmm.mkl : ctrmm.$(SUFFIX)
ctrmm.veclib : ctrmm.$(SUFFIX) ctrmm.veclib : ctrmm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
ctrmm.essl : ctrmm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Ztrmm #################################################### ##################################### Ztrmm ####################################################
ztrmm.goto : ztrmm.$(SUFFIX) ../$(LIBNAME) ztrmm.goto : ztrmm.$(SUFFIX) ../$(LIBNAME)
@ -623,6 +664,9 @@ ztrmm.mkl : ztrmm.$(SUFFIX)
ztrmm.veclib : ztrmm.$(SUFFIX) ztrmm.veclib : ztrmm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
ztrmm.essl : ztrmm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Strsm #################################################### ##################################### Strsm ####################################################
strsm.goto : strsm.$(SUFFIX) ../$(LIBNAME) strsm.goto : strsm.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
@ -1413,6 +1457,39 @@ zdot.mkl : zdot-intel.$(SUFFIX)
zdot.veclib : zdot-intel.$(SUFFIX) zdot.veclib : zdot-intel.$(SUFFIX)
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Srot ####################################################
srot.goto : srot.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
srot.acml : srot.$(SUFFIX)
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
srot.atlas : srot.$(SUFFIX)
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
srot.mkl : srot.$(SUFFIX)
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
srot.veclib : srot.$(SUFFIX)
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Drot ####################################################
drot.goto : drot.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
drot.acml : drot.$(SUFFIX)
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
drot.atlas : drot.$(SUFFIX)
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
drot.mkl : drot.$(SUFFIX)
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
drot.veclib : drot.$(SUFFIX)
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Saxpy #################################################### ##################################### Saxpy ####################################################
saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
@ -2124,6 +2201,13 @@ cgesv.$(SUFFIX) : gesv.c
zgesv.$(SUFFIX) : gesv.c zgesv.$(SUFFIX) : gesv.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
srot.$(SUFFIX) : rot.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
drot.$(SUFFIX) : rot.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
@ -2137,7 +2221,7 @@ smallscaling: smallscaling.c ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm $(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm
clean :: clean ::
@rm -f *.goto *.mkl *.acml *.atlas *.veclib @rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl
include $(TOPDIR)/Makefile.tail include $(TOPDIR)/Makefile.tail

197
benchmark/rot.c Normal file
View File

@ -0,0 +1,197 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef DOT
#ifdef DOUBLE
#define ROT BLASFUNC(drot)
#else
#define ROT BLASFUNC(srot)
#endif
/*
 * Windows replacement for POSIX gettimeofday(): converts the system
 * FILETIME (100-nanosecond ticks since 1601-01-01) into a Unix-epoch
 * struct timeval.  The tz argument is accepted for signature
 * compatibility only and is ignored; the function always returns 0.
 */
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
/* Microseconds between the Windows epoch (1601-01-01) and the Unix epoch (1970-01-01). */
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag; /* NOTE(review): declared but never used in this implementation */
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
/* Assemble the 64-bit tick count from the two 32-bit FILETIME halves. */
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
/*
 * huge_malloc(): allocate a buffer backed by huge pages using System V
 * shared memory (shmget/shmat with SHM_HUGETLB), rounding the request up
 * to the next HUGE_PAGESIZE multiple.  The segment is marked for removal
 * immediately (IPC_RMID) so it is reclaimed once detached or on exit.
 * NOTE(review): the trailing "&& 0" in the #if below disables this whole
 * block, so the benchmark currently always uses the plain libc malloc.
 * NOTE(review): shmget/shmat/IPC_* and HUGE_PAGESIZE/BLASLONG are not
 * declared in this file — presumably provided via common.h; confirm.
 */
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
/* Fallback definition for systems whose headers lack SHM_HUGETLB. */
#define SHM_HUGETLB 04000
#endif
/* Round the size up to a huge-page boundary and request a
   huge-page-backed shared-memory segment; abort on failure. */
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
/* Mark for deletion now; the mapping stays valid until detach/exit. */
shmctl(shmid, IPC_RMID, 0);
return address;
}
/* Route all malloc() calls below through huge_malloc(). */
#define malloc huge_malloc
#endif
/*
 * Benchmark driver for the BLAS rot kernel (srot or drot, selected via
 * the ROT macro defined above).
 *
 * Usage: rot [from [to [step]]]
 *   from : smallest vector length tested (default 1)
 *   to   : largest vector length tested (default 200, clamped >= from)
 *   step : increment between tested sizes (default 1)
 *
 * Environment: OPENBLAS_LOOPS (repetitions per size),
 *              OPENBLAS_INCX / OPENBLAS_INCY (vector strides).
 *
 * Prints MFlops per size to stderr; returns 0 on success, exits with
 * status 1 on allocation failure.
 */
int main(int argc, char *argv[]){

  FLOAT *x, *y;
  // FLOAT result;
  blasint m, i;
  blasint inc_x = 1, inc_y = 1;
  FLOAT c[1] = { 2.0 };  /* rotation "cosine" (arbitrary benchmark value) */
  FLOAT s[1] = { 2.0 };  /* rotation "sine"   (arbitrary benchmark value) */
  int loops = 1;
  int l;
  char *p;

  int from = 1;
  int to = 200;
  int step = 1;

  struct timeval start, stop;
  double time1, timeg;

  /* Positional arguments: from, to, step. */
  argc--; argv++;
  if (argc > 0) { from = atol(*argv); argc--; argv++; }
  if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++; }
  if (argc > 0) { step = atol(*argv); argc--; argv++; }

  if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
  if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
  if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);

  fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);

  /* Buffers sized for the largest tested length, stride, and element width. */
  if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

  if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

#ifdef linux
  srandom(getpid());
#endif

  fprintf(stderr, " SIZE Flops\n");

  for(m = from; m <= to; m += step)
  {
    timeg = 0;

    fprintf(stderr, " %6d : ", (int)m);

    for (l = 0; l < loops; l++)
    {
      /* Refill with fresh random data each iteration so the kernel
         always operates on unrotated values. */
      for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
        x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
      }

      for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
        y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
      }

      gettimeofday( &start, (struct timezone *)0);

      ROT (&m, x, &inc_x, y, &inc_y, c, s);

      gettimeofday( &stop, (struct timezone *)0);

      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;

      timeg += time1;
    }

    timeg /= loops;

    /* 6 flops per element; NOTE(review): the COMPSIZE*COMPSIZE scaling
       presumably matches the other drivers in this directory — confirm. */
    fprintf(stderr,
            " %10.2f MFlops\n",
            COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6);
  }

  /* Release the benchmark buffers (the original leaked them). */
  free(x);
  free(y);

  return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -798,7 +798,7 @@ Lmcount$lazy_ptr:
#elif defined(PPC440FP2) #elif defined(PPC440FP2)
#define BUFFER_SIZE ( 16 << 20) #define BUFFER_SIZE ( 16 << 20)
#elif defined(POWER8) #elif defined(POWER8)
#define BUFFER_SIZE ( 64 << 20) #define BUFFER_SIZE ( 32 << 20)
#else #else
#define BUFFER_SIZE ( 16 << 20) #define BUFFER_SIZE ( 16 << 20)
#endif #endif

View File

@ -62,7 +62,7 @@ static void __inline blas_lock(volatile BLASULONG *address){
#if defined(_MSC_VER) && !defined(__clang__) #if defined(_MSC_VER) && !defined(__clang__)
// use intrinsic instead of inline assembly // use intrinsic instead of inline assembly
ret = _InterlockedExchange(address, 1); ret = _InterlockedExchange((volatile LONG *)address, 1);
// inline assembly // inline assembly
/*__asm { /*__asm {
mov eax, address mov eax, address

View File

@ -1452,6 +1452,31 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser
} }
return TRUE; return TRUE;
} }
/*
This is to allow static linking.
Code adapted from Google performance tools:
https://gperftools.googlecode.com/git-history/perftools-1.0/src/windows/port.cc
Reference:
https://sourceware.org/ml/pthreads-win32/2008/msg00028.html
http://ci.boost.org/svn-trac/browser/trunk/libs/thread/src/win32/tss_pe.cpp
*/
static int on_process_term(void)
{
gotoblas_quit();
return 0;
}
#ifdef _WIN64
#pragma comment(linker, "/INCLUDE:_tls_used")
#else
#pragma comment(linker, "/INCLUDE:__tls_used")
#endif
#pragma data_seg(push, old_seg)
#pragma data_seg(".CRT$XLB")
static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
#pragma data_seg(".CRT$XTU")
static int(*p_process_term)(void) = on_process_term;
#pragma data_seg(pop, old_seg)
#endif #endif
#if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64)) #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))

View File

@ -1013,7 +1013,12 @@ int main(int argc, char *argv[]){
#endif #endif
#ifdef MAKE_NB_JOBS #ifdef MAKE_NB_JOBS
#if MAKE_NB_JOBS > 0
printf("MAKE += -j %d\n", MAKE_NB_JOBS); printf("MAKE += -j %d\n", MAKE_NB_JOBS);
#else
// Let make use parent -j argument or -j1 if there
// is no make parent
#endif
#elif NO_PARALLEL_MAKE==1 #elif NO_PARALLEL_MAKE==1
printf("MAKE += -j 1\n"); printf("MAKE += -j 1\n");
#else #else

View File

@ -64,10 +64,13 @@ int main(int argc, char **argv) {
if ((argc >= 2) && (*argv[1] == '1')) { if ((argc >= 2) && (*argv[1] == '1')) {
#if defined(ARCH_X86) || defined(ARCH_X86_64)
printf("#define SLOCAL_BUFFER_SIZE\t%ld\n", (SGEMM_DEFAULT_Q * SGEMM_DEFAULT_UNROLL_N * 4 * 1 * sizeof(float))); printf("#define SLOCAL_BUFFER_SIZE\t%ld\n", (SGEMM_DEFAULT_Q * SGEMM_DEFAULT_UNROLL_N * 4 * 1 * sizeof(float)));
printf("#define DLOCAL_BUFFER_SIZE\t%ld\n", (DGEMM_DEFAULT_Q * DGEMM_DEFAULT_UNROLL_N * 2 * 1 * sizeof(double))); printf("#define DLOCAL_BUFFER_SIZE\t%ld\n", (DGEMM_DEFAULT_Q * DGEMM_DEFAULT_UNROLL_N * 2 * 1 * sizeof(double)));
printf("#define CLOCAL_BUFFER_SIZE\t%ld\n", (CGEMM_DEFAULT_Q * CGEMM_DEFAULT_UNROLL_N * 4 * 2 * sizeof(float))); printf("#define CLOCAL_BUFFER_SIZE\t%ld\n", (CGEMM_DEFAULT_Q * CGEMM_DEFAULT_UNROLL_N * 4 * 2 * sizeof(float)));
printf("#define ZLOCAL_BUFFER_SIZE\t%ld\n", (ZGEMM_DEFAULT_Q * ZGEMM_DEFAULT_UNROLL_N * 2 * 2 * sizeof(double))); printf("#define ZLOCAL_BUFFER_SIZE\t%ld\n", (ZGEMM_DEFAULT_Q * ZGEMM_DEFAULT_UNROLL_N * 2 * 2 * sizeof(double)));
#endif
#ifdef USE64BITINT #ifdef USE64BITINT
printf("#define USE64BITINT\n"); printf("#define USE64BITINT\n");

View File

@ -179,93 +179,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.4s, v3.4s}, [ppA] ld2 {v2.4s, v3.4s}, [ppA]
add ppA, ppA, #32 add ppA, ppA, #32
fmul v16.4s, v0.4s, v8.4s[0] fmul v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v17.16b, v17.16b, v17.16b eor v17.16b, v17.16b, v17.16b
fmls v17.4s, v0.4s, v9.4s[0] fmls v17.4s, v0.4s, v9.s[0]
#else #else
fmul v17.4s, v0.4s, v9.4s[0] fmul v17.4s, v0.4s, v9.s[0]
#endif #endif
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
fmul v20.4s, v0.4s, v8.4s[1] fmul v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v21.16b, v21.16b, v21.16b eor v21.16b, v21.16b, v21.16b
fmls v21.4s, v0.4s, v9.4s[1] fmls v21.4s, v0.4s, v9.s[1]
#else #else
fmul v21.4s, v0.4s, v9.4s[1] fmul v21.4s, v0.4s, v9.s[1]
#endif #endif
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
fmul v24.4s, v0.4s, v8.4s[2] fmul v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v25.16b, v25.16b, v25.16b eor v25.16b, v25.16b, v25.16b
fmls v25.4s, v0.4s, v9.4s[2] fmls v25.4s, v0.4s, v9.s[2]
#else #else
fmul v25.4s, v0.4s, v9.4s[2] fmul v25.4s, v0.4s, v9.s[2]
#endif #endif
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
fmul v28.4s, v0.4s, v8.4s[3] fmul v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v29.16b, v29.16b, v29.16b eor v29.16b, v29.16b, v29.16b
fmls v29.4s, v0.4s, v9.4s[3] fmls v29.4s, v0.4s, v9.s[3]
#else #else
fmul v29.4s, v0.4s, v9.4s[3] fmul v29.4s, v0.4s, v9.s[3]
#endif #endif
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
fmul v18.4s, v2.4s, v8.4s[0] fmul v18.4s, v2.4s, v8.s[0]
OP_ii v18.4s, v3.4s, v9.4s[0] OP_ii v18.4s, v3.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b eor v19.16b, v19.16b, v19.16b
fmls v19.4s, v2.4s, v9.4s[0] fmls v19.4s, v2.4s, v9.s[0]
#else #else
fmul v19.4s, v2.4s, v9.4s[0] fmul v19.4s, v2.4s, v9.s[0]
#endif #endif
OP_ir v19.4s, v3.4s, v8.4s[0] OP_ir v19.4s, v3.4s, v8.s[0]
fmul v22.4s, v2.4s, v8.4s[1] fmul v22.4s, v2.4s, v8.s[1]
OP_ii v22.4s, v3.4s, v9.4s[1] OP_ii v22.4s, v3.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v23.16b, v23.16b, v23.16b eor v23.16b, v23.16b, v23.16b
fmls v23.4s, v2.4s, v9.4s[1] fmls v23.4s, v2.4s, v9.s[1]
#else #else
fmul v23.4s, v2.4s, v9.4s[1] fmul v23.4s, v2.4s, v9.s[1]
#endif #endif
OP_ir v23.4s, v3.4s, v8.4s[1] OP_ir v23.4s, v3.4s, v8.s[1]
fmul v26.4s, v2.4s, v8.4s[2] fmul v26.4s, v2.4s, v8.s[2]
OP_ii v26.4s, v3.4s, v9.4s[2] OP_ii v26.4s, v3.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v27.16b, v27.16b, v27.16b eor v27.16b, v27.16b, v27.16b
fmls v27.4s, v2.4s, v9.4s[2] fmls v27.4s, v2.4s, v9.s[2]
#else #else
fmul v27.4s, v2.4s, v9.4s[2] fmul v27.4s, v2.4s, v9.s[2]
#endif #endif
OP_ir v27.4s, v3.4s, v8.4s[2] OP_ir v27.4s, v3.4s, v8.s[2]
fmul v30.4s, v2.4s, v8.4s[3] fmul v30.4s, v2.4s, v8.s[3]
OP_ii v30.4s, v3.4s, v9.4s[3] OP_ii v30.4s, v3.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v31.16b, v31.16b, v31.16b eor v31.16b, v31.16b, v31.16b
fmls v31.4s, v2.4s, v9.4s[3] fmls v31.4s, v2.4s, v9.s[3]
#else #else
fmul v31.4s, v2.4s, v9.4s[3] fmul v31.4s, v2.4s, v9.s[3]
#endif #endif
OP_ir v31.4s, v3.4s, v8.4s[3] OP_ir v31.4s, v3.4s, v8.s[3]
ld2 {v12.4s, v13.4s}, [pB] ld2 {v12.4s, v13.4s}, [pB]
add pB, pB, #32 add pB, pB, #32
@ -276,159 +276,159 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_M1 .macro KERNEL8x4_M1
OP_rr v16.4s, v0.4s, v8.4s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.4s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
ld2 {v12.4s, v13.4s}, [pB] // for next round ld2 {v12.4s, v13.4s}, [pB] // for next round
add pB, pB, #32 add pB, pB, #32
OP_rr v20.4s, v0.4s, v8.4s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.4s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
OP_rr v24.4s, v0.4s, v8.4s[2] OP_rr v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
OP_ri v25.4s, v0.4s, v9.4s[2] OP_ri v25.4s, v0.4s, v9.s[2]
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
ld2 {v4.4s, v5.4s} , [pA] // for next round ld2 {v4.4s, v5.4s} , [pA] // for next round
add pA, pA, #32 add pA, pA, #32
OP_rr v28.4s, v0.4s, v8.4s[3] OP_rr v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
OP_ri v29.4s, v0.4s, v9.4s[3] OP_ri v29.4s, v0.4s, v9.s[3]
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
prfm PLDL1KEEP, [pA, #512] prfm PLDL1KEEP, [pA, #512]
OP_rr v18.4s, v2.4s, v8.4s[0] OP_rr v18.4s, v2.4s, v8.s[0]
OP_ii v18.4s, v3.4s, v9.4s[0] OP_ii v18.4s, v3.4s, v9.s[0]
OP_ri v19.4s, v2.4s, v9.4s[0] OP_ri v19.4s, v2.4s, v9.s[0]
OP_ir v19.4s, v3.4s, v8.4s[0] OP_ir v19.4s, v3.4s, v8.s[0]
ld2 {v6.4s, v7.4s} , [ppA] // for next round ld2 {v6.4s, v7.4s} , [ppA] // for next round
add ppA, ppA, #32 add ppA, ppA, #32
OP_rr v22.4s, v2.4s, v8.4s[1] OP_rr v22.4s, v2.4s, v8.s[1]
OP_ii v22.4s, v3.4s, v9.4s[1] OP_ii v22.4s, v3.4s, v9.s[1]
OP_ri v23.4s, v2.4s, v9.4s[1] OP_ri v23.4s, v2.4s, v9.s[1]
OP_ir v23.4s, v3.4s, v8.4s[1] OP_ir v23.4s, v3.4s, v8.s[1]
prfm PLDL1KEEP, [ppA, #512] prfm PLDL1KEEP, [ppA, #512]
OP_rr v26.4s, v2.4s, v8.4s[2] OP_rr v26.4s, v2.4s, v8.s[2]
OP_ii v26.4s, v3.4s, v9.4s[2] OP_ii v26.4s, v3.4s, v9.s[2]
OP_ri v27.4s, v2.4s, v9.4s[2] OP_ri v27.4s, v2.4s, v9.s[2]
OP_ir v27.4s, v3.4s, v8.4s[2] OP_ir v27.4s, v3.4s, v8.s[2]
OP_rr v30.4s, v2.4s, v8.4s[3] OP_rr v30.4s, v2.4s, v8.s[3]
OP_ii v30.4s, v3.4s, v9.4s[3] OP_ii v30.4s, v3.4s, v9.s[3]
OP_ri v31.4s, v2.4s, v9.4s[3] OP_ri v31.4s, v2.4s, v9.s[3]
OP_ir v31.4s, v3.4s, v8.4s[3] OP_ir v31.4s, v3.4s, v8.s[3]
.endm .endm
.macro KERNEL8x4_M2 .macro KERNEL8x4_M2
OP_rr v16.4s, v4.4s, v12.4s[0] OP_rr v16.4s, v4.4s, v12.s[0]
OP_ii v16.4s, v5.4s, v13.4s[0] OP_ii v16.4s, v5.4s, v13.s[0]
OP_ri v17.4s, v4.4s, v13.4s[0] OP_ri v17.4s, v4.4s, v13.s[0]
OP_ir v17.4s, v5.4s, v12.4s[0] OP_ir v17.4s, v5.4s, v12.s[0]
ld2 {v8.4s, v9.4s}, [pB] // for next round ld2 {v8.4s, v9.4s}, [pB] // for next round
add pB, pB, #32 add pB, pB, #32
OP_rr v20.4s, v4.4s, v12.4s[1] OP_rr v20.4s, v4.4s, v12.s[1]
OP_ii v20.4s, v5.4s, v13.4s[1] OP_ii v20.4s, v5.4s, v13.s[1]
OP_ri v21.4s, v4.4s, v13.4s[1] OP_ri v21.4s, v4.4s, v13.s[1]
OP_ir v21.4s, v5.4s, v12.4s[1] OP_ir v21.4s, v5.4s, v12.s[1]
prfm PLDL1KEEP, [pA, #512] prfm PLDL1KEEP, [pA, #512]
OP_rr v24.4s, v4.4s, v12.4s[2] OP_rr v24.4s, v4.4s, v12.s[2]
OP_ii v24.4s, v5.4s, v13.4s[2] OP_ii v24.4s, v5.4s, v13.s[2]
OP_ri v25.4s, v4.4s, v13.4s[2] OP_ri v25.4s, v4.4s, v13.s[2]
OP_ir v25.4s, v5.4s, v12.4s[2] OP_ir v25.4s, v5.4s, v12.s[2]
ld2 {v0.4s, v1.4s}, [pA] // for next round ld2 {v0.4s, v1.4s}, [pA] // for next round
add pA, pA, #32 add pA, pA, #32
OP_rr v28.4s, v4.4s, v12.4s[3] OP_rr v28.4s, v4.4s, v12.s[3]
OP_ii v28.4s, v5.4s, v13.4s[3] OP_ii v28.4s, v5.4s, v13.s[3]
OP_ri v29.4s, v4.4s, v13.4s[3] OP_ri v29.4s, v4.4s, v13.s[3]
OP_ir v29.4s, v5.4s, v12.4s[3] OP_ir v29.4s, v5.4s, v12.s[3]
prfm PLDL1KEEP, [ppA, #512] prfm PLDL1KEEP, [ppA, #512]
OP_rr v18.4s, v6.4s, v12.4s[0] OP_rr v18.4s, v6.4s, v12.s[0]
OP_ii v18.4s, v7.4s, v13.4s[0] OP_ii v18.4s, v7.4s, v13.s[0]
OP_ri v19.4s, v6.4s, v13.4s[0] OP_ri v19.4s, v6.4s, v13.s[0]
OP_ir v19.4s, v7.4s, v12.4s[0] OP_ir v19.4s, v7.4s, v12.s[0]
ld2 {v2.4s, v3.4s}, [ppA] // for next round ld2 {v2.4s, v3.4s}, [ppA] // for next round
add ppA, ppA, #32 add ppA, ppA, #32
OP_rr v22.4s, v6.4s, v12.4s[1] OP_rr v22.4s, v6.4s, v12.s[1]
OP_ii v22.4s, v7.4s, v13.4s[1] OP_ii v22.4s, v7.4s, v13.s[1]
OP_ri v23.4s, v6.4s, v13.4s[1] OP_ri v23.4s, v6.4s, v13.s[1]
OP_ir v23.4s, v7.4s, v12.4s[1] OP_ir v23.4s, v7.4s, v12.s[1]
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
OP_rr v26.4s, v6.4s, v12.4s[2] OP_rr v26.4s, v6.4s, v12.s[2]
OP_ii v26.4s, v7.4s, v13.4s[2] OP_ii v26.4s, v7.4s, v13.s[2]
OP_ri v27.4s, v6.4s, v13.4s[2] OP_ri v27.4s, v6.4s, v13.s[2]
OP_ir v27.4s, v7.4s, v12.4s[2] OP_ir v27.4s, v7.4s, v12.s[2]
OP_rr v30.4s, v6.4s, v12.4s[3] OP_rr v30.4s, v6.4s, v12.s[3]
OP_ii v30.4s, v7.4s, v13.4s[3] OP_ii v30.4s, v7.4s, v13.s[3]
OP_ri v31.4s, v6.4s, v13.4s[3] OP_ri v31.4s, v6.4s, v13.s[3]
OP_ir v31.4s, v7.4s, v12.4s[3] OP_ir v31.4s, v7.4s, v12.s[3]
.endm .endm
.macro KERNEL8x4_E .macro KERNEL8x4_E
OP_rr v16.4s, v4.4s, v12.4s[0] OP_rr v16.4s, v4.4s, v12.s[0]
OP_ii v16.4s, v5.4s, v13.4s[0] OP_ii v16.4s, v5.4s, v13.s[0]
OP_ri v17.4s, v4.4s, v13.4s[0] OP_ri v17.4s, v4.4s, v13.s[0]
OP_ir v17.4s, v5.4s, v12.4s[0] OP_ir v17.4s, v5.4s, v12.s[0]
OP_rr v20.4s, v4.4s, v12.4s[1] OP_rr v20.4s, v4.4s, v12.s[1]
OP_ii v20.4s, v5.4s, v13.4s[1] OP_ii v20.4s, v5.4s, v13.s[1]
OP_ri v21.4s, v4.4s, v13.4s[1] OP_ri v21.4s, v4.4s, v13.s[1]
OP_ir v21.4s, v5.4s, v12.4s[1] OP_ir v21.4s, v5.4s, v12.s[1]
OP_rr v24.4s, v4.4s, v12.4s[2] OP_rr v24.4s, v4.4s, v12.s[2]
OP_ii v24.4s, v5.4s, v13.4s[2] OP_ii v24.4s, v5.4s, v13.s[2]
OP_ri v25.4s, v4.4s, v13.4s[2] OP_ri v25.4s, v4.4s, v13.s[2]
OP_ir v25.4s, v5.4s, v12.4s[2] OP_ir v25.4s, v5.4s, v12.s[2]
OP_rr v28.4s, v4.4s, v12.4s[3] OP_rr v28.4s, v4.4s, v12.s[3]
OP_ii v28.4s, v5.4s, v13.4s[3] OP_ii v28.4s, v5.4s, v13.s[3]
OP_ri v29.4s, v4.4s, v13.4s[3] OP_ri v29.4s, v4.4s, v13.s[3]
OP_ir v29.4s, v5.4s, v12.4s[3] OP_ir v29.4s, v5.4s, v12.s[3]
OP_rr v18.4s, v6.4s, v12.4s[0] OP_rr v18.4s, v6.4s, v12.s[0]
OP_ii v18.4s, v7.4s, v13.4s[0] OP_ii v18.4s, v7.4s, v13.s[0]
OP_ri v19.4s, v6.4s, v13.4s[0] OP_ri v19.4s, v6.4s, v13.s[0]
OP_ir v19.4s, v7.4s, v12.4s[0] OP_ir v19.4s, v7.4s, v12.s[0]
OP_rr v22.4s, v6.4s, v12.4s[1] OP_rr v22.4s, v6.4s, v12.s[1]
OP_ii v22.4s, v7.4s, v13.4s[1] OP_ii v22.4s, v7.4s, v13.s[1]
OP_ri v23.4s, v6.4s, v13.4s[1] OP_ri v23.4s, v6.4s, v13.s[1]
OP_ir v23.4s, v7.4s, v12.4s[1] OP_ir v23.4s, v7.4s, v12.s[1]
OP_rr v26.4s, v6.4s, v12.4s[2] OP_rr v26.4s, v6.4s, v12.s[2]
OP_ii v26.4s, v7.4s, v13.4s[2] OP_ii v26.4s, v7.4s, v13.s[2]
OP_ri v27.4s, v6.4s, v13.4s[2] OP_ri v27.4s, v6.4s, v13.s[2]
OP_ir v27.4s, v7.4s, v12.4s[2] OP_ir v27.4s, v7.4s, v12.s[2]
OP_rr v30.4s, v6.4s, v12.4s[3] OP_rr v30.4s, v6.4s, v12.s[3]
OP_ii v30.4s, v7.4s, v13.4s[3] OP_ii v30.4s, v7.4s, v13.s[3]
OP_ri v31.4s, v6.4s, v13.4s[3] OP_ri v31.4s, v6.4s, v13.s[3]
OP_ir v31.4s, v7.4s, v12.4s[3] OP_ir v31.4s, v7.4s, v12.s[3]
.endm .endm
.macro KERNEL8x4_SUB .macro KERNEL8x4_SUB
@ -437,48 +437,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA] ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.4s, v0.4s, v8.4s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.4s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
OP_rr v20.4s, v0.4s, v8.4s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.4s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
ld2 {v2.4s, v3.4s}, [ppA] ld2 {v2.4s, v3.4s}, [ppA]
add ppA, ppA, #32 add ppA, ppA, #32
OP_rr v24.4s, v0.4s, v8.4s[2] OP_rr v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
OP_ri v25.4s, v0.4s, v9.4s[2] OP_ri v25.4s, v0.4s, v9.s[2]
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
OP_rr v28.4s, v0.4s, v8.4s[3] OP_rr v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
OP_ri v29.4s, v0.4s, v9.4s[3] OP_ri v29.4s, v0.4s, v9.s[3]
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
OP_rr v18.4s, v2.4s, v8.4s[0] OP_rr v18.4s, v2.4s, v8.s[0]
OP_ii v18.4s, v3.4s, v9.4s[0] OP_ii v18.4s, v3.4s, v9.s[0]
OP_ri v19.4s, v2.4s, v9.4s[0] OP_ri v19.4s, v2.4s, v9.s[0]
OP_ir v19.4s, v3.4s, v8.4s[0] OP_ir v19.4s, v3.4s, v8.s[0]
OP_rr v22.4s, v2.4s, v8.4s[1] OP_rr v22.4s, v2.4s, v8.s[1]
OP_ii v22.4s, v3.4s, v9.4s[1] OP_ii v22.4s, v3.4s, v9.s[1]
OP_ri v23.4s, v2.4s, v9.4s[1] OP_ri v23.4s, v2.4s, v9.s[1]
OP_ir v23.4s, v3.4s, v8.4s[1] OP_ir v23.4s, v3.4s, v8.s[1]
OP_rr v26.4s, v2.4s, v8.4s[2] OP_rr v26.4s, v2.4s, v8.s[2]
OP_ii v26.4s, v3.4s, v9.4s[2] OP_ii v26.4s, v3.4s, v9.s[2]
OP_ri v27.4s, v2.4s, v9.4s[2] OP_ri v27.4s, v2.4s, v9.s[2]
OP_ir v27.4s, v3.4s, v8.4s[2] OP_ir v27.4s, v3.4s, v8.s[2]
OP_rr v30.4s, v2.4s, v8.4s[3] OP_rr v30.4s, v2.4s, v8.s[3]
OP_ii v30.4s, v3.4s, v9.4s[3] OP_ii v30.4s, v3.4s, v9.s[3]
OP_ri v31.4s, v2.4s, v9.4s[3] OP_ri v31.4s, v2.4s, v9.s[3]
OP_ir v31.4s, v3.4s, v8.4s[3] OP_ir v31.4s, v3.4s, v8.s[3]
.endm .endm
.macro SAVE8x4 .macro SAVE8x4
@ -578,25 +578,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA] ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.4s, v0.4s, v8.4s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.4s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
OP_rr v20.4s, v0.4s, v8.4s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.4s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
OP_rr v24.4s, v0.4s, v8.4s[2] OP_rr v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
OP_ri v25.4s, v0.4s, v9.4s[2] OP_ri v25.4s, v0.4s, v9.s[2]
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
OP_rr v28.4s, v0.4s, v8.4s[3] OP_rr v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
OP_ri v29.4s, v0.4s, v9.4s[3] OP_ri v29.4s, v0.4s, v9.s[3]
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
@ -658,25 +658,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2s, v1.2s}, [pA] ld2 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
OP_rr v16.2s, v0.2s, v8.4s[0] OP_rr v16.2s, v0.2s, v8.s[0]
OP_ii v16.2s, v1.2s, v9.4s[0] OP_ii v16.2s, v1.2s, v9.s[0]
OP_ri v17.2s, v0.2s, v9.4s[0] OP_ri v17.2s, v0.2s, v9.s[0]
OP_ir v17.2s, v1.2s, v8.4s[0] OP_ir v17.2s, v1.2s, v8.s[0]
OP_rr v20.2s, v0.2s, v8.4s[1] OP_rr v20.2s, v0.2s, v8.s[1]
OP_ii v20.2s, v1.2s, v9.4s[1] OP_ii v20.2s, v1.2s, v9.s[1]
OP_ri v21.2s, v0.2s, v9.4s[1] OP_ri v21.2s, v0.2s, v9.s[1]
OP_ir v21.2s, v1.2s, v8.4s[1] OP_ir v21.2s, v1.2s, v8.s[1]
OP_rr v24.2s, v0.2s, v8.4s[2] OP_rr v24.2s, v0.2s, v8.s[2]
OP_ii v24.2s, v1.2s, v9.4s[2] OP_ii v24.2s, v1.2s, v9.s[2]
OP_ri v25.2s, v0.2s, v9.4s[2] OP_ri v25.2s, v0.2s, v9.s[2]
OP_ir v25.2s, v1.2s, v8.4s[2] OP_ir v25.2s, v1.2s, v8.s[2]
OP_rr v28.2s, v0.2s, v8.4s[3] OP_rr v28.2s, v0.2s, v8.s[3]
OP_ii v28.2s, v1.2s, v9.4s[3] OP_ii v28.2s, v1.2s, v9.s[3]
OP_ri v29.2s, v0.2s, v9.4s[3] OP_ri v29.2s, v0.2s, v9.s[3]
OP_ir v29.2s, v1.2s, v8.4s[3] OP_ir v29.2s, v1.2s, v8.s[3]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
@ -738,25 +738,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.s, v1.s}[0], [pA] ld2 {v0.s, v1.s}[0], [pA]
add pA, pA, #8 add pA, pA, #8
OP_rr s16, s0, v8.4s[0] OP_rr s16, s0, v8.s[0]
OP_ii s16, s1, v9.4s[0] OP_ii s16, s1, v9.s[0]
OP_ri s17, s0, v9.4s[0] OP_ri s17, s0, v9.s[0]
OP_ir s17, s1, v8.4s[0] OP_ir s17, s1, v8.s[0]
OP_rr s20, s0, v8.4s[1] OP_rr s20, s0, v8.s[1]
OP_ii s20, s1, v9.4s[1] OP_ii s20, s1, v9.s[1]
OP_ri s21, s0, v9.4s[1] OP_ri s21, s0, v9.s[1]
OP_ir s21, s1, v8.4s[1] OP_ir s21, s1, v8.s[1]
OP_rr s24, s0, v8.4s[2] OP_rr s24, s0, v8.s[2]
OP_ii s24, s1, v9.4s[2] OP_ii s24, s1, v9.s[2]
OP_ri s25, s0, v9.4s[2] OP_ri s25, s0, v9.s[2]
OP_ir s25, s1, v8.4s[2] OP_ir s25, s1, v8.s[2]
OP_rr s28, s0, v8.4s[3] OP_rr s28, s0, v8.s[3]
OP_ii s28, s1, v9.4s[3] OP_ii s28, s1, v9.s[3]
OP_ri s29, s0, v9.4s[3] OP_ri s29, s0, v9.s[3]
OP_ir s29, s1, v8.4s[3] OP_ir s29, s1, v8.s[3]
.endm .endm
.macro SAVE1x4 .macro SAVE1x4
@ -814,15 +814,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA] ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.4s, v0.4s, v8.2s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.2s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.2s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.2s[0] OP_ir v17.4s, v1.4s, v8.s[0]
OP_rr v20.4s, v0.4s, v8.2s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.2s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.2s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.2s[1] OP_ir v21.4s, v1.4s, v8.s[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
@ -862,15 +862,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2s, v1.2s}, [pA] ld2 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
OP_rr v16.2s, v0.2s, v8.2s[0] OP_rr v16.2s, v0.2s, v8.s[0]
OP_ii v16.2s, v1.2s, v9.2s[0] OP_ii v16.2s, v1.2s, v9.s[0]
OP_ri v17.2s, v0.2s, v9.2s[0] OP_ri v17.2s, v0.2s, v9.s[0]
OP_ir v17.2s, v1.2s, v8.2s[0] OP_ir v17.2s, v1.2s, v8.s[0]
OP_rr v20.2s, v0.2s, v8.2s[1] OP_rr v20.2s, v0.2s, v8.s[1]
OP_ii v20.2s, v1.2s, v9.2s[1] OP_ii v20.2s, v1.2s, v9.s[1]
OP_ri v21.2s, v0.2s, v9.2s[1] OP_ri v21.2s, v0.2s, v9.s[1]
OP_ir v21.2s, v1.2s, v8.2s[1] OP_ir v21.2s, v1.2s, v8.s[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
@ -910,15 +910,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.s, v1.s}[0], [pA] ld2 {v0.s, v1.s}[0], [pA]
add pA, pA, #8 add pA, pA, #8
OP_rr s16, s0, v8.2s[0] OP_rr s16, s0, v8.s[0]
OP_ii s16, s1, v9.2s[0] OP_ii s16, s1, v9.s[0]
OP_ri s17, s0, v9.2s[0] OP_ri s17, s0, v9.s[0]
OP_ir s17, s1, v8.2s[0] OP_ir s17, s1, v8.s[0]
OP_rr s20, s0, v8.2s[1] OP_rr s20, s0, v8.s[1]
OP_ii s20, s1, v9.2s[1] OP_ii s20, s1, v9.s[1]
OP_ri s21, s0, v9.2s[1] OP_ri s21, s0, v9.s[1]
OP_ir s21, s1, v8.2s[1] OP_ir s21, s1, v8.s[1]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2

664
kernel/arm64/cgemm_kernel_8x4.S Executable file → Normal file
View File

@ -178,93 +178,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.4s, v3.4s}, [pA] ld2 {v2.4s, v3.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
fmul v16.4s, v0.4s, v8.4s[0] fmul v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v17.16b, v17.16b, v17.16b eor v17.16b, v17.16b, v17.16b
fmls v17.4s, v0.4s, v9.4s[0] fmls v17.4s, v0.4s, v9.s[0]
#else #else
fmul v17.4s, v0.4s, v9.4s[0] fmul v17.4s, v0.4s, v9.s[0]
#endif #endif
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
fmul v18.4s, v2.4s, v8.4s[0] fmul v18.4s, v2.4s, v8.s[0]
OP_ii v18.4s, v3.4s, v9.4s[0] OP_ii v18.4s, v3.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b eor v19.16b, v19.16b, v19.16b
fmls v19.4s, v2.4s, v9.4s[0] fmls v19.4s, v2.4s, v9.s[0]
#else #else
fmul v19.4s, v2.4s, v9.4s[0] fmul v19.4s, v2.4s, v9.s[0]
#endif #endif
OP_ir v19.4s, v3.4s, v8.4s[0] OP_ir v19.4s, v3.4s, v8.s[0]
fmul v20.4s, v0.4s, v8.4s[1] fmul v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v21.16b, v21.16b, v21.16b eor v21.16b, v21.16b, v21.16b
fmls v21.4s, v0.4s, v9.4s[1] fmls v21.4s, v0.4s, v9.s[1]
#else #else
fmul v21.4s, v0.4s, v9.4s[1] fmul v21.4s, v0.4s, v9.s[1]
#endif #endif
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
fmul v22.4s, v2.4s, v8.4s[1] fmul v22.4s, v2.4s, v8.s[1]
OP_ii v22.4s, v3.4s, v9.4s[1] OP_ii v22.4s, v3.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v23.16b, v23.16b, v23.16b eor v23.16b, v23.16b, v23.16b
fmls v23.4s, v2.4s, v9.4s[1] fmls v23.4s, v2.4s, v9.s[1]
#else #else
fmul v23.4s, v2.4s, v9.4s[1] fmul v23.4s, v2.4s, v9.s[1]
#endif #endif
OP_ir v23.4s, v3.4s, v8.4s[1] OP_ir v23.4s, v3.4s, v8.s[1]
fmul v24.4s, v0.4s, v8.4s[2] fmul v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v25.16b, v25.16b, v25.16b eor v25.16b, v25.16b, v25.16b
fmls v25.4s, v0.4s, v9.4s[2] fmls v25.4s, v0.4s, v9.s[2]
#else #else
fmul v25.4s, v0.4s, v9.4s[2] fmul v25.4s, v0.4s, v9.s[2]
#endif #endif
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
fmul v26.4s, v2.4s, v8.4s[2] fmul v26.4s, v2.4s, v8.s[2]
OP_ii v26.4s, v3.4s, v9.4s[2] OP_ii v26.4s, v3.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v27.16b, v27.16b, v27.16b eor v27.16b, v27.16b, v27.16b
fmls v27.4s, v2.4s, v9.4s[2] fmls v27.4s, v2.4s, v9.s[2]
#else #else
fmul v27.4s, v2.4s, v9.4s[2] fmul v27.4s, v2.4s, v9.s[2]
#endif #endif
OP_ir v27.4s, v3.4s, v8.4s[2] OP_ir v27.4s, v3.4s, v8.s[2]
fmul v28.4s, v0.4s, v8.4s[3] fmul v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v29.16b, v29.16b, v29.16b eor v29.16b, v29.16b, v29.16b
fmls v29.4s, v0.4s, v9.4s[3] fmls v29.4s, v0.4s, v9.s[3]
#else #else
fmul v29.4s, v0.4s, v9.4s[3] fmul v29.4s, v0.4s, v9.s[3]
#endif #endif
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
fmul v30.4s, v2.4s, v8.4s[3] fmul v30.4s, v2.4s, v8.s[3]
OP_ii v30.4s, v3.4s, v9.4s[3] OP_ii v30.4s, v3.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v31.16b, v31.16b, v31.16b eor v31.16b, v31.16b, v31.16b
fmls v31.4s, v2.4s, v9.4s[3] fmls v31.4s, v2.4s, v9.s[3]
#else #else
fmul v31.4s, v2.4s, v9.4s[3] fmul v31.4s, v2.4s, v9.s[3]
#endif #endif
OP_ir v31.4s, v3.4s, v8.4s[3] OP_ir v31.4s, v3.4s, v8.s[3]
ld2 {v12.4s, v13.4s}, [pB] ld2 {v12.4s, v13.4s}, [pB]
add pB, pB, #32 add pB, pB, #32
@ -275,45 +275,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_M1 .macro KERNEL8x4_M1
OP_rr v16.4s, v0.4s, v8.4s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.4s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
OP_rr v18.4s, v2.4s, v8.4s[0] OP_rr v18.4s, v2.4s, v8.s[0]
OP_ii v18.4s, v3.4s, v9.4s[0] OP_ii v18.4s, v3.4s, v9.s[0]
OP_ri v19.4s, v2.4s, v9.4s[0] OP_ri v19.4s, v2.4s, v9.s[0]
OP_ir v19.4s, v3.4s, v8.4s[0] OP_ir v19.4s, v3.4s, v8.s[0]
OP_rr v20.4s, v0.4s, v8.4s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.4s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
OP_rr v22.4s, v2.4s, v8.4s[1] OP_rr v22.4s, v2.4s, v8.s[1]
OP_ii v22.4s, v3.4s, v9.4s[1] OP_ii v22.4s, v3.4s, v9.s[1]
OP_ri v23.4s, v2.4s, v9.4s[1] OP_ri v23.4s, v2.4s, v9.s[1]
OP_ir v23.4s, v3.4s, v8.4s[1] OP_ir v23.4s, v3.4s, v8.s[1]
OP_rr v24.4s, v0.4s, v8.4s[2] OP_rr v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
OP_ri v25.4s, v0.4s, v9.4s[2] OP_ri v25.4s, v0.4s, v9.s[2]
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
OP_rr v26.4s, v2.4s, v8.4s[2] OP_rr v26.4s, v2.4s, v8.s[2]
OP_ii v26.4s, v3.4s, v9.4s[2] OP_ii v26.4s, v3.4s, v9.s[2]
OP_ri v27.4s, v2.4s, v9.4s[2] OP_ri v27.4s, v2.4s, v9.s[2]
OP_ir v27.4s, v3.4s, v8.4s[2] OP_ir v27.4s, v3.4s, v8.s[2]
OP_rr v28.4s, v0.4s, v8.4s[3] OP_rr v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
OP_ri v29.4s, v0.4s, v9.4s[3] OP_ri v29.4s, v0.4s, v9.s[3]
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
OP_rr v30.4s, v2.4s, v8.4s[3] OP_rr v30.4s, v2.4s, v8.s[3]
OP_ii v30.4s, v3.4s, v9.4s[3] OP_ii v30.4s, v3.4s, v9.s[3]
OP_ri v31.4s, v2.4s, v9.4s[3] OP_ri v31.4s, v2.4s, v9.s[3]
OP_ir v31.4s, v3.4s, v8.4s[3] OP_ir v31.4s, v3.4s, v8.s[3]
ld2 {v12.4s, v13.4s}, [pB] // For next round ld2 {v12.4s, v13.4s}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
@ -324,45 +324,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_M2 .macro KERNEL8x4_M2
OP_rr v16.4s, v4.4s, v12.4s[0] OP_rr v16.4s, v4.4s, v12.s[0]
OP_ii v16.4s, v5.4s, v13.4s[0] OP_ii v16.4s, v5.4s, v13.s[0]
OP_ri v17.4s, v4.4s, v13.4s[0] OP_ri v17.4s, v4.4s, v13.s[0]
OP_ir v17.4s, v5.4s, v12.4s[0] OP_ir v17.4s, v5.4s, v12.s[0]
OP_rr v18.4s, v6.4s, v12.4s[0] OP_rr v18.4s, v6.4s, v12.s[0]
OP_ii v18.4s, v7.4s, v13.4s[0] OP_ii v18.4s, v7.4s, v13.s[0]
OP_ri v19.4s, v6.4s, v13.4s[0] OP_ri v19.4s, v6.4s, v13.s[0]
OP_ir v19.4s, v7.4s, v12.4s[0] OP_ir v19.4s, v7.4s, v12.s[0]
OP_rr v20.4s, v4.4s, v12.4s[1] OP_rr v20.4s, v4.4s, v12.s[1]
OP_ii v20.4s, v5.4s, v13.4s[1] OP_ii v20.4s, v5.4s, v13.s[1]
OP_ri v21.4s, v4.4s, v13.4s[1] OP_ri v21.4s, v4.4s, v13.s[1]
OP_ir v21.4s, v5.4s, v12.4s[1] OP_ir v21.4s, v5.4s, v12.s[1]
OP_rr v22.4s, v6.4s, v12.4s[1] OP_rr v22.4s, v6.4s, v12.s[1]
OP_ii v22.4s, v7.4s, v13.4s[1] OP_ii v22.4s, v7.4s, v13.s[1]
OP_ri v23.4s, v6.4s, v13.4s[1] OP_ri v23.4s, v6.4s, v13.s[1]
OP_ir v23.4s, v7.4s, v12.4s[1] OP_ir v23.4s, v7.4s, v12.s[1]
OP_rr v24.4s, v4.4s, v12.4s[2] OP_rr v24.4s, v4.4s, v12.s[2]
OP_ii v24.4s, v5.4s, v13.4s[2] OP_ii v24.4s, v5.4s, v13.s[2]
OP_ri v25.4s, v4.4s, v13.4s[2] OP_ri v25.4s, v4.4s, v13.s[2]
OP_ir v25.4s, v5.4s, v12.4s[2] OP_ir v25.4s, v5.4s, v12.s[2]
OP_rr v26.4s, v6.4s, v12.4s[2] OP_rr v26.4s, v6.4s, v12.s[2]
OP_ii v26.4s, v7.4s, v13.4s[2] OP_ii v26.4s, v7.4s, v13.s[2]
OP_ri v27.4s, v6.4s, v13.4s[2] OP_ri v27.4s, v6.4s, v13.s[2]
OP_ir v27.4s, v7.4s, v12.4s[2] OP_ir v27.4s, v7.4s, v12.s[2]
OP_rr v28.4s, v4.4s, v12.4s[3] OP_rr v28.4s, v4.4s, v12.s[3]
OP_ii v28.4s, v5.4s, v13.4s[3] OP_ii v28.4s, v5.4s, v13.s[3]
OP_ri v29.4s, v4.4s, v13.4s[3] OP_ri v29.4s, v4.4s, v13.s[3]
OP_ir v29.4s, v5.4s, v12.4s[3] OP_ir v29.4s, v5.4s, v12.s[3]
OP_rr v30.4s, v6.4s, v12.4s[3] OP_rr v30.4s, v6.4s, v12.s[3]
OP_ii v30.4s, v7.4s, v13.4s[3] OP_ii v30.4s, v7.4s, v13.s[3]
OP_ri v31.4s, v6.4s, v13.4s[3] OP_ri v31.4s, v6.4s, v13.s[3]
OP_ir v31.4s, v7.4s, v12.4s[3] OP_ir v31.4s, v7.4s, v12.s[3]
ld2 {v8.4s, v9.4s}, [pB] ld2 {v8.4s, v9.4s}, [pB]
add pB, pB, #32 add pB, pB, #32
@ -373,45 +373,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_E .macro KERNEL8x4_E
OP_rr v16.4s, v4.4s, v12.4s[0] OP_rr v16.4s, v4.4s, v12.s[0]
OP_ii v16.4s, v5.4s, v13.4s[0] OP_ii v16.4s, v5.4s, v13.s[0]
OP_ri v17.4s, v4.4s, v13.4s[0] OP_ri v17.4s, v4.4s, v13.s[0]
OP_ir v17.4s, v5.4s, v12.4s[0] OP_ir v17.4s, v5.4s, v12.s[0]
OP_rr v18.4s, v6.4s, v12.4s[0] OP_rr v18.4s, v6.4s, v12.s[0]
OP_ii v18.4s, v7.4s, v13.4s[0] OP_ii v18.4s, v7.4s, v13.s[0]
OP_ri v19.4s, v6.4s, v13.4s[0] OP_ri v19.4s, v6.4s, v13.s[0]
OP_ir v19.4s, v7.4s, v12.4s[0] OP_ir v19.4s, v7.4s, v12.s[0]
OP_rr v20.4s, v4.4s, v12.4s[1] OP_rr v20.4s, v4.4s, v12.s[1]
OP_ii v20.4s, v5.4s, v13.4s[1] OP_ii v20.4s, v5.4s, v13.s[1]
OP_ri v21.4s, v4.4s, v13.4s[1] OP_ri v21.4s, v4.4s, v13.s[1]
OP_ir v21.4s, v5.4s, v12.4s[1] OP_ir v21.4s, v5.4s, v12.s[1]
OP_rr v22.4s, v6.4s, v12.4s[1] OP_rr v22.4s, v6.4s, v12.s[1]
OP_ii v22.4s, v7.4s, v13.4s[1] OP_ii v22.4s, v7.4s, v13.s[1]
OP_ri v23.4s, v6.4s, v13.4s[1] OP_ri v23.4s, v6.4s, v13.s[1]
OP_ir v23.4s, v7.4s, v12.4s[1] OP_ir v23.4s, v7.4s, v12.s[1]
OP_rr v24.4s, v4.4s, v12.4s[2] OP_rr v24.4s, v4.4s, v12.s[2]
OP_ii v24.4s, v5.4s, v13.4s[2] OP_ii v24.4s, v5.4s, v13.s[2]
OP_ri v25.4s, v4.4s, v13.4s[2] OP_ri v25.4s, v4.4s, v13.s[2]
OP_ir v25.4s, v5.4s, v12.4s[2] OP_ir v25.4s, v5.4s, v12.s[2]
OP_rr v26.4s, v6.4s, v12.4s[2] OP_rr v26.4s, v6.4s, v12.s[2]
OP_ii v26.4s, v7.4s, v13.4s[2] OP_ii v26.4s, v7.4s, v13.s[2]
OP_ri v27.4s, v6.4s, v13.4s[2] OP_ri v27.4s, v6.4s, v13.s[2]
OP_ir v27.4s, v7.4s, v12.4s[2] OP_ir v27.4s, v7.4s, v12.s[2]
OP_rr v28.4s, v4.4s, v12.4s[3] OP_rr v28.4s, v4.4s, v12.s[3]
OP_ii v28.4s, v5.4s, v13.4s[3] OP_ii v28.4s, v5.4s, v13.s[3]
OP_ri v29.4s, v4.4s, v13.4s[3] OP_ri v29.4s, v4.4s, v13.s[3]
OP_ir v29.4s, v5.4s, v12.4s[3] OP_ir v29.4s, v5.4s, v12.s[3]
OP_rr v30.4s, v6.4s, v12.4s[3] OP_rr v30.4s, v6.4s, v12.s[3]
OP_ii v30.4s, v7.4s, v13.4s[3] OP_ii v30.4s, v7.4s, v13.s[3]
OP_ri v31.4s, v6.4s, v13.4s[3] OP_ri v31.4s, v6.4s, v13.s[3]
OP_ir v31.4s, v7.4s, v12.4s[3] OP_ir v31.4s, v7.4s, v12.s[3]
.endm .endm
@ -423,45 +423,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.4s, v3.4s}, [pA] ld2 {v2.4s, v3.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.4s, v0.4s, v8.4s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.4s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
OP_rr v18.4s, v2.4s, v8.4s[0] OP_rr v18.4s, v2.4s, v8.s[0]
OP_ii v18.4s, v3.4s, v9.4s[0] OP_ii v18.4s, v3.4s, v9.s[0]
OP_ri v19.4s, v2.4s, v9.4s[0] OP_ri v19.4s, v2.4s, v9.s[0]
OP_ir v19.4s, v3.4s, v8.4s[0] OP_ir v19.4s, v3.4s, v8.s[0]
OP_rr v20.4s, v0.4s, v8.4s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.4s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
OP_rr v22.4s, v2.4s, v8.4s[1] OP_rr v22.4s, v2.4s, v8.s[1]
OP_ii v22.4s, v3.4s, v9.4s[1] OP_ii v22.4s, v3.4s, v9.s[1]
OP_ri v23.4s, v2.4s, v9.4s[1] OP_ri v23.4s, v2.4s, v9.s[1]
OP_ir v23.4s, v3.4s, v8.4s[1] OP_ir v23.4s, v3.4s, v8.s[1]
OP_rr v24.4s, v0.4s, v8.4s[2] OP_rr v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
OP_ri v25.4s, v0.4s, v9.4s[2] OP_ri v25.4s, v0.4s, v9.s[2]
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
OP_rr v26.4s, v2.4s, v8.4s[2] OP_rr v26.4s, v2.4s, v8.s[2]
OP_ii v26.4s, v3.4s, v9.4s[2] OP_ii v26.4s, v3.4s, v9.s[2]
OP_ri v27.4s, v2.4s, v9.4s[2] OP_ri v27.4s, v2.4s, v9.s[2]
OP_ir v27.4s, v3.4s, v8.4s[2] OP_ir v27.4s, v3.4s, v8.s[2]
OP_rr v28.4s, v0.4s, v8.4s[3] OP_rr v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
OP_ri v29.4s, v0.4s, v9.4s[3] OP_ri v29.4s, v0.4s, v9.s[3]
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
OP_rr v30.4s, v2.4s, v8.4s[3] OP_rr v30.4s, v2.4s, v8.s[3]
OP_ii v30.4s, v3.4s, v9.4s[3] OP_ii v30.4s, v3.4s, v9.s[3]
OP_ri v31.4s, v2.4s, v9.4s[3] OP_ri v31.4s, v2.4s, v9.s[3]
OP_ir v31.4s, v3.4s, v8.4s[3] OP_ir v31.4s, v3.4s, v8.s[3]
.endm .endm
@ -560,49 +560,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA] ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
fmul v16.4s, v0.4s, v8.4s[0] fmul v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v17.16b, v17.16b, v17.16b eor v17.16b, v17.16b, v17.16b
fmls v17.4s, v0.4s, v9.4s[0] fmls v17.4s, v0.4s, v9.s[0]
#else #else
fmul v17.4s, v0.4s, v9.4s[0] fmul v17.4s, v0.4s, v9.s[0]
#endif #endif
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
fmul v20.4s, v0.4s, v8.4s[1] fmul v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v21.16b, v21.16b, v21.16b eor v21.16b, v21.16b, v21.16b
fmls v21.4s, v0.4s, v9.4s[1] fmls v21.4s, v0.4s, v9.s[1]
#else #else
fmul v21.4s, v0.4s, v9.4s[1] fmul v21.4s, v0.4s, v9.s[1]
#endif #endif
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
fmul v24.4s, v0.4s, v8.4s[2] fmul v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v25.16b, v25.16b, v25.16b eor v25.16b, v25.16b, v25.16b
fmls v25.4s, v0.4s, v9.4s[2] fmls v25.4s, v0.4s, v9.s[2]
#else #else
fmul v25.4s, v0.4s, v9.4s[2] fmul v25.4s, v0.4s, v9.s[2]
#endif #endif
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
fmul v28.4s, v0.4s, v8.4s[3] fmul v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v29.16b, v29.16b, v29.16b eor v29.16b, v29.16b, v29.16b
fmls v29.4s, v0.4s, v9.4s[3] fmls v29.4s, v0.4s, v9.s[3]
#else #else
fmul v29.4s, v0.4s, v9.4s[3] fmul v29.4s, v0.4s, v9.s[3]
#endif #endif
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
ld2 {v12.4s, v13.4s}, [pB] ld2 {v12.4s, v13.4s}, [pB]
add pB, pB, #32 add pB, pB, #32
@ -611,85 +611,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x4_M1 .macro KERNEL4x4_M1
OP_rr v16.4s, v0.4s, v8.4s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.4s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
ld2 {v12.4s, v13.4s}, [pB] // For next round ld2 {v12.4s, v13.4s}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
OP_rr v20.4s, v0.4s, v8.4s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.4s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
ld2 {v4.4s, v5.4s}, [pA] // For next round ld2 {v4.4s, v5.4s}, [pA] // For next round
add pA, pA, #32 add pA, pA, #32
OP_rr v24.4s, v0.4s, v8.4s[2] OP_rr v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
OP_ri v25.4s, v0.4s, v9.4s[2] OP_ri v25.4s, v0.4s, v9.s[2]
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
prfm PLDL1KEEP, [pA, #512] prfm PLDL1KEEP, [pA, #512]
OP_rr v28.4s, v0.4s, v8.4s[3] OP_rr v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
OP_ri v29.4s, v0.4s, v9.4s[3] OP_ri v29.4s, v0.4s, v9.s[3]
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
.endm .endm
.macro KERNEL4x4_M2 .macro KERNEL4x4_M2
OP_rr v16.4s, v4.4s, v12.4s[0] OP_rr v16.4s, v4.4s, v12.s[0]
OP_ii v16.4s, v5.4s, v13.4s[0] OP_ii v16.4s, v5.4s, v13.s[0]
OP_ri v17.4s, v4.4s, v13.4s[0] OP_ri v17.4s, v4.4s, v13.s[0]
OP_ir v17.4s, v5.4s, v12.4s[0] OP_ir v17.4s, v5.4s, v12.s[0]
ld2 {v8.4s, v9.4s}, [pB] // For next round ld2 {v8.4s, v9.4s}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
OP_rr v20.4s, v4.4s, v12.4s[1] OP_rr v20.4s, v4.4s, v12.s[1]
OP_ii v20.4s, v5.4s, v13.4s[1] OP_ii v20.4s, v5.4s, v13.s[1]
OP_ri v21.4s, v4.4s, v13.4s[1] OP_ri v21.4s, v4.4s, v13.s[1]
OP_ir v21.4s, v5.4s, v12.4s[1] OP_ir v21.4s, v5.4s, v12.s[1]
ld2 {v0.4s, v1.4s}, [pA] // For next round ld2 {v0.4s, v1.4s}, [pA] // For next round
add pA, pA, #32 add pA, pA, #32
OP_rr v24.4s, v4.4s, v12.4s[2] OP_rr v24.4s, v4.4s, v12.s[2]
OP_ii v24.4s, v5.4s, v13.4s[2] OP_ii v24.4s, v5.4s, v13.s[2]
OP_ri v25.4s, v4.4s, v13.4s[2] OP_ri v25.4s, v4.4s, v13.s[2]
OP_ir v25.4s, v5.4s, v12.4s[2] OP_ir v25.4s, v5.4s, v12.s[2]
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
OP_rr v28.4s, v4.4s, v12.4s[3] OP_rr v28.4s, v4.4s, v12.s[3]
OP_ii v28.4s, v5.4s, v13.4s[3] OP_ii v28.4s, v5.4s, v13.s[3]
OP_ri v29.4s, v4.4s, v13.4s[3] OP_ri v29.4s, v4.4s, v13.s[3]
OP_ir v29.4s, v5.4s, v12.4s[3] OP_ir v29.4s, v5.4s, v12.s[3]
.endm .endm
.macro KERNEL4x4_E .macro KERNEL4x4_E
OP_rr v16.4s, v4.4s, v12.4s[0] OP_rr v16.4s, v4.4s, v12.s[0]
OP_ii v16.4s, v5.4s, v13.4s[0] OP_ii v16.4s, v5.4s, v13.s[0]
OP_ri v17.4s, v4.4s, v13.4s[0] OP_ri v17.4s, v4.4s, v13.s[0]
OP_ir v17.4s, v5.4s, v12.4s[0] OP_ir v17.4s, v5.4s, v12.s[0]
OP_rr v20.4s, v4.4s, v12.4s[1] OP_rr v20.4s, v4.4s, v12.s[1]
OP_ii v20.4s, v5.4s, v13.4s[1] OP_ii v20.4s, v5.4s, v13.s[1]
OP_ri v21.4s, v4.4s, v13.4s[1] OP_ri v21.4s, v4.4s, v13.s[1]
OP_ir v21.4s, v5.4s, v12.4s[1] OP_ir v21.4s, v5.4s, v12.s[1]
OP_rr v24.4s, v4.4s, v12.4s[2] OP_rr v24.4s, v4.4s, v12.s[2]
OP_ii v24.4s, v5.4s, v13.4s[2] OP_ii v24.4s, v5.4s, v13.s[2]
OP_ri v25.4s, v4.4s, v13.4s[2] OP_ri v25.4s, v4.4s, v13.s[2]
OP_ir v25.4s, v5.4s, v12.4s[2] OP_ir v25.4s, v5.4s, v12.s[2]
OP_rr v28.4s, v4.4s, v12.4s[3] OP_rr v28.4s, v4.4s, v12.s[3]
OP_ii v28.4s, v5.4s, v13.4s[3] OP_ii v28.4s, v5.4s, v13.s[3]
OP_ri v29.4s, v4.4s, v13.4s[3] OP_ri v29.4s, v4.4s, v13.s[3]
OP_ir v29.4s, v5.4s, v12.4s[3] OP_ir v29.4s, v5.4s, v12.s[3]
.endm .endm
.macro KERNEL4x4_SUB .macro KERNEL4x4_SUB
@ -698,25 +698,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA] ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.4s, v0.4s, v8.4s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.4s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
OP_rr v20.4s, v0.4s, v8.4s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.4s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
OP_rr v24.4s, v0.4s, v8.4s[2] OP_rr v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
OP_ri v25.4s, v0.4s, v9.4s[2] OP_ri v25.4s, v0.4s, v9.s[2]
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
OP_rr v28.4s, v0.4s, v8.4s[3] OP_rr v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
OP_ri v29.4s, v0.4s, v9.4s[3] OP_ri v29.4s, v0.4s, v9.s[3]
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
@ -778,25 +778,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2s, v1.2s}, [pA] ld2 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
OP_rr v16.2s, v0.2s, v8.4s[0] OP_rr v16.2s, v0.2s, v8.s[0]
OP_ii v16.2s, v1.2s, v9.4s[0] OP_ii v16.2s, v1.2s, v9.s[0]
OP_ri v17.2s, v0.2s, v9.4s[0] OP_ri v17.2s, v0.2s, v9.s[0]
OP_ir v17.2s, v1.2s, v8.4s[0] OP_ir v17.2s, v1.2s, v8.s[0]
OP_rr v20.2s, v0.2s, v8.4s[1] OP_rr v20.2s, v0.2s, v8.s[1]
OP_ii v20.2s, v1.2s, v9.4s[1] OP_ii v20.2s, v1.2s, v9.s[1]
OP_ri v21.2s, v0.2s, v9.4s[1] OP_ri v21.2s, v0.2s, v9.s[1]
OP_ir v21.2s, v1.2s, v8.4s[1] OP_ir v21.2s, v1.2s, v8.s[1]
OP_rr v24.2s, v0.2s, v8.4s[2] OP_rr v24.2s, v0.2s, v8.s[2]
OP_ii v24.2s, v1.2s, v9.4s[2] OP_ii v24.2s, v1.2s, v9.s[2]
OP_ri v25.2s, v0.2s, v9.4s[2] OP_ri v25.2s, v0.2s, v9.s[2]
OP_ir v25.2s, v1.2s, v8.4s[2] OP_ir v25.2s, v1.2s, v8.s[2]
OP_rr v28.2s, v0.2s, v8.4s[3] OP_rr v28.2s, v0.2s, v8.s[3]
OP_ii v28.2s, v1.2s, v9.4s[3] OP_ii v28.2s, v1.2s, v9.s[3]
OP_ri v29.2s, v0.2s, v9.4s[3] OP_ri v29.2s, v0.2s, v9.s[3]
OP_ir v29.2s, v1.2s, v8.4s[3] OP_ir v29.2s, v1.2s, v8.s[3]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
@ -858,25 +858,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.s, v1.s}[0], [pA] ld2 {v0.s, v1.s}[0], [pA]
add pA, pA, #8 add pA, pA, #8
OP_rr s16, s0, v8.4s[0] OP_rr s16, s0, v8.s[0]
OP_ii s16, s1, v9.4s[0] OP_ii s16, s1, v9.s[0]
OP_ri s17, s0, v9.4s[0] OP_ri s17, s0, v9.s[0]
OP_ir s17, s1, v8.4s[0] OP_ir s17, s1, v8.s[0]
OP_rr s20, s0, v8.4s[1] OP_rr s20, s0, v8.s[1]
OP_ii s20, s1, v9.4s[1] OP_ii s20, s1, v9.s[1]
OP_ri s21, s0, v9.4s[1] OP_ri s21, s0, v9.s[1]
OP_ir s21, s1, v8.4s[1] OP_ir s21, s1, v8.s[1]
OP_rr s24, s0, v8.4s[2] OP_rr s24, s0, v8.s[2]
OP_ii s24, s1, v9.4s[2] OP_ii s24, s1, v9.s[2]
OP_ri s25, s0, v9.4s[2] OP_ri s25, s0, v9.s[2]
OP_ir s25, s1, v8.4s[2] OP_ir s25, s1, v8.s[2]
OP_rr s28, s0, v8.4s[3] OP_rr s28, s0, v8.s[3]
OP_ii s28, s1, v9.4s[3] OP_ii s28, s1, v9.s[3]
OP_ri s29, s0, v9.4s[3] OP_ri s29, s0, v9.s[3]
OP_ir s29, s1, v8.4s[3] OP_ir s29, s1, v8.s[3]
.endm .endm
.macro SAVE1x4 .macro SAVE1x4
@ -940,25 +940,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.4s, v3.4s}, [pA] ld2 {v2.4s, v3.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.4s, v0.4s, v8.2s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.2s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.2s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.2s[0] OP_ir v17.4s, v1.4s, v8.s[0]
OP_rr v18.4s, v2.4s, v8.2s[0] OP_rr v18.4s, v2.4s, v8.s[0]
OP_ii v18.4s, v3.4s, v9.2s[0] OP_ii v18.4s, v3.4s, v9.s[0]
OP_ri v19.4s, v2.4s, v9.2s[0] OP_ri v19.4s, v2.4s, v9.s[0]
OP_ir v19.4s, v3.4s, v8.2s[0] OP_ir v19.4s, v3.4s, v8.s[0]
OP_rr v20.4s, v0.4s, v8.2s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.2s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.2s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.2s[1] OP_ir v21.4s, v1.4s, v8.s[1]
OP_rr v22.4s, v2.4s, v8.2s[1] OP_rr v22.4s, v2.4s, v8.s[1]
OP_ii v22.4s, v3.4s, v9.2s[1] OP_ii v22.4s, v3.4s, v9.s[1]
OP_ri v23.4s, v2.4s, v9.2s[1] OP_ri v23.4s, v2.4s, v9.s[1]
OP_ir v23.4s, v3.4s, v8.2s[1] OP_ir v23.4s, v3.4s, v8.s[1]
.endm .endm
.macro SAVE8x2 .macro SAVE8x2
@ -1016,15 +1016,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA] ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.4s, v0.4s, v8.2s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.2s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.2s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.2s[0] OP_ir v17.4s, v1.4s, v8.s[0]
OP_rr v20.4s, v0.4s, v8.2s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.2s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.2s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.2s[1] OP_ir v21.4s, v1.4s, v8.s[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
@ -1064,15 +1064,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2s, v1.2s}, [pA] ld2 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
OP_rr v16.2s, v0.2s, v8.2s[0] OP_rr v16.2s, v0.2s, v8.s[0]
OP_ii v16.2s, v1.2s, v9.2s[0] OP_ii v16.2s, v1.2s, v9.s[0]
OP_ri v17.2s, v0.2s, v9.2s[0] OP_ri v17.2s, v0.2s, v9.s[0]
OP_ir v17.2s, v1.2s, v8.2s[0] OP_ir v17.2s, v1.2s, v8.s[0]
OP_rr v20.2s, v0.2s, v8.2s[1] OP_rr v20.2s, v0.2s, v8.s[1]
OP_ii v20.2s, v1.2s, v9.2s[1] OP_ii v20.2s, v1.2s, v9.s[1]
OP_ri v21.2s, v0.2s, v9.2s[1] OP_ri v21.2s, v0.2s, v9.s[1]
OP_ir v21.2s, v1.2s, v8.2s[1] OP_ir v21.2s, v1.2s, v8.s[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
@ -1112,15 +1112,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.s, v1.s}[0], [pA] ld2 {v0.s, v1.s}[0], [pA]
add pA, pA, #8 add pA, pA, #8
OP_rr s16, s0, v8.2s[0] OP_rr s16, s0, v8.s[0]
OP_ii s16, s1, v9.2s[0] OP_ii s16, s1, v9.s[0]
OP_ri s17, s0, v9.2s[0] OP_ri s17, s0, v9.s[0]
OP_ir s17, s1, v8.2s[0] OP_ir s17, s1, v8.s[0]
OP_rr s20, s0, v8.2s[1] OP_rr s20, s0, v8.s[1]
OP_ii s20, s1, v9.2s[1] OP_ii s20, s1, v9.s[1]
OP_ri s21, s0, v9.2s[1] OP_ri s21, s0, v9.s[1]
OP_ir s21, s1, v8.2s[1] OP_ir s21, s1, v8.s[1]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2
@ -1162,15 +1162,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.4s, v3.4s}, [pA] ld2 {v2.4s, v3.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.4s, v0.4s, v8.4s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v8.4s[1] OP_ii v16.4s, v1.4s, v8.s[1]
OP_ri v17.4s, v0.4s, v8.4s[1] OP_ri v17.4s, v0.4s, v8.s[1]
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
OP_rr v18.4s, v2.4s, v8.4s[0] OP_rr v18.4s, v2.4s, v8.s[0]
OP_ii v18.4s, v3.4s, v8.4s[1] OP_ii v18.4s, v3.4s, v8.s[1]
OP_ri v19.4s, v2.4s, v8.4s[1] OP_ri v19.4s, v2.4s, v8.s[1]
OP_ir v19.4s, v3.4s, v8.4s[0] OP_ir v19.4s, v3.4s, v8.s[0]
.endm .endm
.macro SAVE8x1 .macro SAVE8x1

View File

@ -170,49 +170,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA] ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
fmul v16.4s, v0.4s, v8.4s[0] fmul v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v17.16b, v17.16b, v17.16b eor v17.16b, v17.16b, v17.16b
fmls v17.4s, v0.4s, v9.4s[0] fmls v17.4s, v0.4s, v9.s[0]
#else #else
fmul v17.4s, v0.4s, v9.4s[0] fmul v17.4s, v0.4s, v9.s[0]
#endif #endif
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
fmul v20.4s, v0.4s, v8.4s[1] fmul v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v21.16b, v21.16b, v21.16b eor v21.16b, v21.16b, v21.16b
fmls v21.4s, v0.4s, v9.4s[1] fmls v21.4s, v0.4s, v9.s[1]
#else #else
fmul v21.4s, v0.4s, v9.4s[1] fmul v21.4s, v0.4s, v9.s[1]
#endif #endif
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
fmul v24.4s, v0.4s, v8.4s[2] fmul v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v25.16b, v25.16b, v25.16b eor v25.16b, v25.16b, v25.16b
fmls v25.4s, v0.4s, v9.4s[2] fmls v25.4s, v0.4s, v9.s[2]
#else #else
fmul v25.4s, v0.4s, v9.4s[2] fmul v25.4s, v0.4s, v9.s[2]
#endif #endif
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
fmul v28.4s, v0.4s, v8.4s[3] fmul v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v29.16b, v29.16b, v29.16b eor v29.16b, v29.16b, v29.16b
fmls v29.4s, v0.4s, v9.4s[3] fmls v29.4s, v0.4s, v9.s[3]
#else #else
fmul v29.4s, v0.4s, v9.4s[3] fmul v29.4s, v0.4s, v9.s[3]
#endif #endif
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
ld2 {v12.4s, v13.4s}, [pB] ld2 {v12.4s, v13.4s}, [pB]
add pB, pB, #32 add pB, pB, #32
@ -221,85 +221,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x4_M1 .macro KERNEL4x4_M1
OP_rr v16.4s, v0.4s, v8.4s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.4s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
ld2 {v12.4s, v13.4s}, [pB] // For next round ld2 {v12.4s, v13.4s}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
OP_rr v20.4s, v0.4s, v8.4s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.4s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
ld2 {v4.4s, v5.4s}, [pA] // For next round ld2 {v4.4s, v5.4s}, [pA] // For next round
add pA, pA, #32 add pA, pA, #32
OP_rr v24.4s, v0.4s, v8.4s[2] OP_rr v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
OP_ri v25.4s, v0.4s, v9.4s[2] OP_ri v25.4s, v0.4s, v9.s[2]
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
prfm PLDL1KEEP, [pA, #512] prfm PLDL1KEEP, [pA, #512]
OP_rr v28.4s, v0.4s, v8.4s[3] OP_rr v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
OP_ri v29.4s, v0.4s, v9.4s[3] OP_ri v29.4s, v0.4s, v9.s[3]
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
.endm .endm
.macro KERNEL4x4_M2 .macro KERNEL4x4_M2
OP_rr v16.4s, v4.4s, v12.4s[0] OP_rr v16.4s, v4.4s, v12.s[0]
OP_ii v16.4s, v5.4s, v13.4s[0] OP_ii v16.4s, v5.4s, v13.s[0]
OP_ri v17.4s, v4.4s, v13.4s[0] OP_ri v17.4s, v4.4s, v13.s[0]
OP_ir v17.4s, v5.4s, v12.4s[0] OP_ir v17.4s, v5.4s, v12.s[0]
ld2 {v8.4s, v9.4s}, [pB] // For next round ld2 {v8.4s, v9.4s}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
OP_rr v20.4s, v4.4s, v12.4s[1] OP_rr v20.4s, v4.4s, v12.s[1]
OP_ii v20.4s, v5.4s, v13.4s[1] OP_ii v20.4s, v5.4s, v13.s[1]
OP_ri v21.4s, v4.4s, v13.4s[1] OP_ri v21.4s, v4.4s, v13.s[1]
OP_ir v21.4s, v5.4s, v12.4s[1] OP_ir v21.4s, v5.4s, v12.s[1]
ld2 {v0.4s, v1.4s}, [pA] // For next round ld2 {v0.4s, v1.4s}, [pA] // For next round
add pA, pA, #32 add pA, pA, #32
OP_rr v24.4s, v4.4s, v12.4s[2] OP_rr v24.4s, v4.4s, v12.s[2]
OP_ii v24.4s, v5.4s, v13.4s[2] OP_ii v24.4s, v5.4s, v13.s[2]
OP_ri v25.4s, v4.4s, v13.4s[2] OP_ri v25.4s, v4.4s, v13.s[2]
OP_ir v25.4s, v5.4s, v12.4s[2] OP_ir v25.4s, v5.4s, v12.s[2]
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
OP_rr v28.4s, v4.4s, v12.4s[3] OP_rr v28.4s, v4.4s, v12.s[3]
OP_ii v28.4s, v5.4s, v13.4s[3] OP_ii v28.4s, v5.4s, v13.s[3]
OP_ri v29.4s, v4.4s, v13.4s[3] OP_ri v29.4s, v4.4s, v13.s[3]
OP_ir v29.4s, v5.4s, v12.4s[3] OP_ir v29.4s, v5.4s, v12.s[3]
.endm .endm
.macro KERNEL4x4_E .macro KERNEL4x4_E
OP_rr v16.4s, v4.4s, v12.4s[0] OP_rr v16.4s, v4.4s, v12.s[0]
OP_ii v16.4s, v5.4s, v13.4s[0] OP_ii v16.4s, v5.4s, v13.s[0]
OP_ri v17.4s, v4.4s, v13.4s[0] OP_ri v17.4s, v4.4s, v13.s[0]
OP_ir v17.4s, v5.4s, v12.4s[0] OP_ir v17.4s, v5.4s, v12.s[0]
OP_rr v20.4s, v4.4s, v12.4s[1] OP_rr v20.4s, v4.4s, v12.s[1]
OP_ii v20.4s, v5.4s, v13.4s[1] OP_ii v20.4s, v5.4s, v13.s[1]
OP_ri v21.4s, v4.4s, v13.4s[1] OP_ri v21.4s, v4.4s, v13.s[1]
OP_ir v21.4s, v5.4s, v12.4s[1] OP_ir v21.4s, v5.4s, v12.s[1]
OP_rr v24.4s, v4.4s, v12.4s[2] OP_rr v24.4s, v4.4s, v12.s[2]
OP_ii v24.4s, v5.4s, v13.4s[2] OP_ii v24.4s, v5.4s, v13.s[2]
OP_ri v25.4s, v4.4s, v13.4s[2] OP_ri v25.4s, v4.4s, v13.s[2]
OP_ir v25.4s, v5.4s, v12.4s[2] OP_ir v25.4s, v5.4s, v12.s[2]
OP_rr v28.4s, v4.4s, v12.4s[3] OP_rr v28.4s, v4.4s, v12.s[3]
OP_ii v28.4s, v5.4s, v13.4s[3] OP_ii v28.4s, v5.4s, v13.s[3]
OP_ri v29.4s, v4.4s, v13.4s[3] OP_ri v29.4s, v4.4s, v13.s[3]
OP_ir v29.4s, v5.4s, v12.4s[3] OP_ir v29.4s, v5.4s, v12.s[3]
.endm .endm
.macro KERNEL4x4_SUB .macro KERNEL4x4_SUB
@ -308,25 +308,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA] ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.4s, v0.4s, v8.4s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.4s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
OP_rr v20.4s, v0.4s, v8.4s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.4s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
OP_rr v24.4s, v0.4s, v8.4s[2] OP_rr v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
OP_ri v25.4s, v0.4s, v9.4s[2] OP_ri v25.4s, v0.4s, v9.s[2]
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
OP_rr v28.4s, v0.4s, v8.4s[3] OP_rr v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
OP_ri v29.4s, v0.4s, v9.4s[3] OP_ri v29.4s, v0.4s, v9.s[3]
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
@ -384,25 +384,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2s, v1.2s}, [pA] ld2 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
OP_rr v16.2s, v0.2s, v8.4s[0] OP_rr v16.2s, v0.2s, v8.s[0]
OP_ii v16.2s, v1.2s, v9.4s[0] OP_ii v16.2s, v1.2s, v9.s[0]
OP_ri v17.2s, v0.2s, v9.4s[0] OP_ri v17.2s, v0.2s, v9.s[0]
OP_ir v17.2s, v1.2s, v8.4s[0] OP_ir v17.2s, v1.2s, v8.s[0]
OP_rr v20.2s, v0.2s, v8.4s[1] OP_rr v20.2s, v0.2s, v8.s[1]
OP_ii v20.2s, v1.2s, v9.4s[1] OP_ii v20.2s, v1.2s, v9.s[1]
OP_ri v21.2s, v0.2s, v9.4s[1] OP_ri v21.2s, v0.2s, v9.s[1]
OP_ir v21.2s, v1.2s, v8.4s[1] OP_ir v21.2s, v1.2s, v8.s[1]
OP_rr v24.2s, v0.2s, v8.4s[2] OP_rr v24.2s, v0.2s, v8.s[2]
OP_ii v24.2s, v1.2s, v9.4s[2] OP_ii v24.2s, v1.2s, v9.s[2]
OP_ri v25.2s, v0.2s, v9.4s[2] OP_ri v25.2s, v0.2s, v9.s[2]
OP_ir v25.2s, v1.2s, v8.4s[2] OP_ir v25.2s, v1.2s, v8.s[2]
OP_rr v28.2s, v0.2s, v8.4s[3] OP_rr v28.2s, v0.2s, v8.s[3]
OP_ii v28.2s, v1.2s, v9.4s[3] OP_ii v28.2s, v1.2s, v9.s[3]
OP_ri v29.2s, v0.2s, v9.4s[3] OP_ri v29.2s, v0.2s, v9.s[3]
OP_ir v29.2s, v1.2s, v8.4s[3] OP_ir v29.2s, v1.2s, v8.s[3]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
@ -460,25 +460,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.s, v1.s}[0], [pA] ld2 {v0.s, v1.s}[0], [pA]
add pA, pA, #8 add pA, pA, #8
OP_rr s16, s0, v8.4s[0] OP_rr s16, s0, v8.s[0]
OP_ii s16, s1, v9.4s[0] OP_ii s16, s1, v9.s[0]
OP_ri s17, s0, v9.4s[0] OP_ri s17, s0, v9.s[0]
OP_ir s17, s1, v8.4s[0] OP_ir s17, s1, v8.s[0]
OP_rr s20, s0, v8.4s[1] OP_rr s20, s0, v8.s[1]
OP_ii s20, s1, v9.4s[1] OP_ii s20, s1, v9.s[1]
OP_ri s21, s0, v9.4s[1] OP_ri s21, s0, v9.s[1]
OP_ir s21, s1, v8.4s[1] OP_ir s21, s1, v8.s[1]
OP_rr s24, s0, v8.4s[2] OP_rr s24, s0, v8.s[2]
OP_ii s24, s1, v9.4s[2] OP_ii s24, s1, v9.s[2]
OP_ri s25, s0, v9.4s[2] OP_ri s25, s0, v9.s[2]
OP_ir s25, s1, v8.4s[2] OP_ir s25, s1, v8.s[2]
OP_rr s28, s0, v8.4s[3] OP_rr s28, s0, v8.s[3]
OP_ii s28, s1, v9.4s[3] OP_ii s28, s1, v9.s[3]
OP_ri s29, s0, v9.4s[3] OP_ri s29, s0, v9.s[3]
OP_ir s29, s1, v8.4s[3] OP_ir s29, s1, v8.s[3]
.endm .endm
.macro SAVE1x4 .macro SAVE1x4
@ -532,15 +532,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA] ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.4s, v0.4s, v8.2s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.2s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.2s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.2s[0] OP_ir v17.4s, v1.4s, v8.s[0]
OP_rr v20.4s, v0.4s, v8.2s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.2s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.2s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.2s[1] OP_ir v21.4s, v1.4s, v8.s[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
@ -578,15 +578,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2s, v1.2s}, [pA] ld2 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
OP_rr v16.2s, v0.2s, v8.2s[0] OP_rr v16.2s, v0.2s, v8.s[0]
OP_ii v16.2s, v1.2s, v9.2s[0] OP_ii v16.2s, v1.2s, v9.s[0]
OP_ri v17.2s, v0.2s, v9.2s[0] OP_ri v17.2s, v0.2s, v9.s[0]
OP_ir v17.2s, v1.2s, v8.2s[0] OP_ir v17.2s, v1.2s, v8.s[0]
OP_rr v20.2s, v0.2s, v8.2s[1] OP_rr v20.2s, v0.2s, v8.s[1]
OP_ii v20.2s, v1.2s, v9.2s[1] OP_ii v20.2s, v1.2s, v9.s[1]
OP_ri v21.2s, v0.2s, v9.2s[1] OP_ri v21.2s, v0.2s, v9.s[1]
OP_ir v21.2s, v1.2s, v8.2s[1] OP_ir v21.2s, v1.2s, v8.s[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
@ -624,15 +624,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.s, v1.s}[0], [pA] ld2 {v0.s, v1.s}[0], [pA]
add pA, pA, #8 add pA, pA, #8
OP_rr s16, s0, v8.2s[0] OP_rr s16, s0, v8.s[0]
OP_ii s16, s1, v9.2s[0] OP_ii s16, s1, v9.s[0]
OP_ri s17, s0, v9.2s[0] OP_ri s17, s0, v9.s[0]
OP_ir s17, s1, v8.2s[0] OP_ir s17, s1, v8.s[0]
OP_rr s20, s0, v8.2s[1] OP_rr s20, s0, v8.s[1]
OP_ii s20, s1, v9.2s[1] OP_ii s20, s1, v9.s[1]
OP_ri s21, s0, v9.2s[1] OP_ri s21, s0, v9.s[1]
OP_ir s21, s1, v8.2s[1] OP_ir s21, s1, v8.s[1]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2

664
kernel/arm64/ctrmm_kernel_8x4.S Executable file → Normal file
View File

@ -180,93 +180,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.4s, v3.4s}, [pA] ld2 {v2.4s, v3.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
fmul v16.4s, v0.4s, v8.4s[0] fmul v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v17.16b, v17.16b, v17.16b eor v17.16b, v17.16b, v17.16b
fmls v17.4s, v0.4s, v9.4s[0] fmls v17.4s, v0.4s, v9.s[0]
#else #else
fmul v17.4s, v0.4s, v9.4s[0] fmul v17.4s, v0.4s, v9.s[0]
#endif #endif
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
fmul v18.4s, v2.4s, v8.4s[0] fmul v18.4s, v2.4s, v8.s[0]
OP_ii v18.4s, v3.4s, v9.4s[0] OP_ii v18.4s, v3.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b eor v19.16b, v19.16b, v19.16b
fmls v19.4s, v2.4s, v9.4s[0] fmls v19.4s, v2.4s, v9.s[0]
#else #else
fmul v19.4s, v2.4s, v9.4s[0] fmul v19.4s, v2.4s, v9.s[0]
#endif #endif
OP_ir v19.4s, v3.4s, v8.4s[0] OP_ir v19.4s, v3.4s, v8.s[0]
fmul v20.4s, v0.4s, v8.4s[1] fmul v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v21.16b, v21.16b, v21.16b eor v21.16b, v21.16b, v21.16b
fmls v21.4s, v0.4s, v9.4s[1] fmls v21.4s, v0.4s, v9.s[1]
#else #else
fmul v21.4s, v0.4s, v9.4s[1] fmul v21.4s, v0.4s, v9.s[1]
#endif #endif
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
fmul v22.4s, v2.4s, v8.4s[1] fmul v22.4s, v2.4s, v8.s[1]
OP_ii v22.4s, v3.4s, v9.4s[1] OP_ii v22.4s, v3.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v23.16b, v23.16b, v23.16b eor v23.16b, v23.16b, v23.16b
fmls v23.4s, v2.4s, v9.4s[1] fmls v23.4s, v2.4s, v9.s[1]
#else #else
fmul v23.4s, v2.4s, v9.4s[1] fmul v23.4s, v2.4s, v9.s[1]
#endif #endif
OP_ir v23.4s, v3.4s, v8.4s[1] OP_ir v23.4s, v3.4s, v8.s[1]
fmul v24.4s, v0.4s, v8.4s[2] fmul v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v25.16b, v25.16b, v25.16b eor v25.16b, v25.16b, v25.16b
fmls v25.4s, v0.4s, v9.4s[2] fmls v25.4s, v0.4s, v9.s[2]
#else #else
fmul v25.4s, v0.4s, v9.4s[2] fmul v25.4s, v0.4s, v9.s[2]
#endif #endif
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
fmul v26.4s, v2.4s, v8.4s[2] fmul v26.4s, v2.4s, v8.s[2]
OP_ii v26.4s, v3.4s, v9.4s[2] OP_ii v26.4s, v3.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v27.16b, v27.16b, v27.16b eor v27.16b, v27.16b, v27.16b
fmls v27.4s, v2.4s, v9.4s[2] fmls v27.4s, v2.4s, v9.s[2]
#else #else
fmul v27.4s, v2.4s, v9.4s[2] fmul v27.4s, v2.4s, v9.s[2]
#endif #endif
OP_ir v27.4s, v3.4s, v8.4s[2] OP_ir v27.4s, v3.4s, v8.s[2]
fmul v28.4s, v0.4s, v8.4s[3] fmul v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v29.16b, v29.16b, v29.16b eor v29.16b, v29.16b, v29.16b
fmls v29.4s, v0.4s, v9.4s[3] fmls v29.4s, v0.4s, v9.s[3]
#else #else
fmul v29.4s, v0.4s, v9.4s[3] fmul v29.4s, v0.4s, v9.s[3]
#endif #endif
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
fmul v30.4s, v2.4s, v8.4s[3] fmul v30.4s, v2.4s, v8.s[3]
OP_ii v30.4s, v3.4s, v9.4s[3] OP_ii v30.4s, v3.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v31.16b, v31.16b, v31.16b eor v31.16b, v31.16b, v31.16b
fmls v31.4s, v2.4s, v9.4s[3] fmls v31.4s, v2.4s, v9.s[3]
#else #else
fmul v31.4s, v2.4s, v9.4s[3] fmul v31.4s, v2.4s, v9.s[3]
#endif #endif
OP_ir v31.4s, v3.4s, v8.4s[3] OP_ir v31.4s, v3.4s, v8.s[3]
ld2 {v12.4s, v13.4s}, [pB] ld2 {v12.4s, v13.4s}, [pB]
add pB, pB, #32 add pB, pB, #32
@ -277,45 +277,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_M1 .macro KERNEL8x4_M1
OP_rr v16.4s, v0.4s, v8.4s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.4s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
OP_rr v18.4s, v2.4s, v8.4s[0] OP_rr v18.4s, v2.4s, v8.s[0]
OP_ii v18.4s, v3.4s, v9.4s[0] OP_ii v18.4s, v3.4s, v9.s[0]
OP_ri v19.4s, v2.4s, v9.4s[0] OP_ri v19.4s, v2.4s, v9.s[0]
OP_ir v19.4s, v3.4s, v8.4s[0] OP_ir v19.4s, v3.4s, v8.s[0]
OP_rr v20.4s, v0.4s, v8.4s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.4s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
OP_rr v22.4s, v2.4s, v8.4s[1] OP_rr v22.4s, v2.4s, v8.s[1]
OP_ii v22.4s, v3.4s, v9.4s[1] OP_ii v22.4s, v3.4s, v9.s[1]
OP_ri v23.4s, v2.4s, v9.4s[1] OP_ri v23.4s, v2.4s, v9.s[1]
OP_ir v23.4s, v3.4s, v8.4s[1] OP_ir v23.4s, v3.4s, v8.s[1]
OP_rr v24.4s, v0.4s, v8.4s[2] OP_rr v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
OP_ri v25.4s, v0.4s, v9.4s[2] OP_ri v25.4s, v0.4s, v9.s[2]
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
OP_rr v26.4s, v2.4s, v8.4s[2] OP_rr v26.4s, v2.4s, v8.s[2]
OP_ii v26.4s, v3.4s, v9.4s[2] OP_ii v26.4s, v3.4s, v9.s[2]
OP_ri v27.4s, v2.4s, v9.4s[2] OP_ri v27.4s, v2.4s, v9.s[2]
OP_ir v27.4s, v3.4s, v8.4s[2] OP_ir v27.4s, v3.4s, v8.s[2]
OP_rr v28.4s, v0.4s, v8.4s[3] OP_rr v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
OP_ri v29.4s, v0.4s, v9.4s[3] OP_ri v29.4s, v0.4s, v9.s[3]
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
OP_rr v30.4s, v2.4s, v8.4s[3] OP_rr v30.4s, v2.4s, v8.s[3]
OP_ii v30.4s, v3.4s, v9.4s[3] OP_ii v30.4s, v3.4s, v9.s[3]
OP_ri v31.4s, v2.4s, v9.4s[3] OP_ri v31.4s, v2.4s, v9.s[3]
OP_ir v31.4s, v3.4s, v8.4s[3] OP_ir v31.4s, v3.4s, v8.s[3]
ld2 {v12.4s, v13.4s}, [pB] // For next round ld2 {v12.4s, v13.4s}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
@ -326,45 +326,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_M2 .macro KERNEL8x4_M2
OP_rr v16.4s, v4.4s, v12.4s[0] OP_rr v16.4s, v4.4s, v12.s[0]
OP_ii v16.4s, v5.4s, v13.4s[0] OP_ii v16.4s, v5.4s, v13.s[0]
OP_ri v17.4s, v4.4s, v13.4s[0] OP_ri v17.4s, v4.4s, v13.s[0]
OP_ir v17.4s, v5.4s, v12.4s[0] OP_ir v17.4s, v5.4s, v12.s[0]
OP_rr v18.4s, v6.4s, v12.4s[0] OP_rr v18.4s, v6.4s, v12.s[0]
OP_ii v18.4s, v7.4s, v13.4s[0] OP_ii v18.4s, v7.4s, v13.s[0]
OP_ri v19.4s, v6.4s, v13.4s[0] OP_ri v19.4s, v6.4s, v13.s[0]
OP_ir v19.4s, v7.4s, v12.4s[0] OP_ir v19.4s, v7.4s, v12.s[0]
OP_rr v20.4s, v4.4s, v12.4s[1] OP_rr v20.4s, v4.4s, v12.s[1]
OP_ii v20.4s, v5.4s, v13.4s[1] OP_ii v20.4s, v5.4s, v13.s[1]
OP_ri v21.4s, v4.4s, v13.4s[1] OP_ri v21.4s, v4.4s, v13.s[1]
OP_ir v21.4s, v5.4s, v12.4s[1] OP_ir v21.4s, v5.4s, v12.s[1]
OP_rr v22.4s, v6.4s, v12.4s[1] OP_rr v22.4s, v6.4s, v12.s[1]
OP_ii v22.4s, v7.4s, v13.4s[1] OP_ii v22.4s, v7.4s, v13.s[1]
OP_ri v23.4s, v6.4s, v13.4s[1] OP_ri v23.4s, v6.4s, v13.s[1]
OP_ir v23.4s, v7.4s, v12.4s[1] OP_ir v23.4s, v7.4s, v12.s[1]
OP_rr v24.4s, v4.4s, v12.4s[2] OP_rr v24.4s, v4.4s, v12.s[2]
OP_ii v24.4s, v5.4s, v13.4s[2] OP_ii v24.4s, v5.4s, v13.s[2]
OP_ri v25.4s, v4.4s, v13.4s[2] OP_ri v25.4s, v4.4s, v13.s[2]
OP_ir v25.4s, v5.4s, v12.4s[2] OP_ir v25.4s, v5.4s, v12.s[2]
OP_rr v26.4s, v6.4s, v12.4s[2] OP_rr v26.4s, v6.4s, v12.s[2]
OP_ii v26.4s, v7.4s, v13.4s[2] OP_ii v26.4s, v7.4s, v13.s[2]
OP_ri v27.4s, v6.4s, v13.4s[2] OP_ri v27.4s, v6.4s, v13.s[2]
OP_ir v27.4s, v7.4s, v12.4s[2] OP_ir v27.4s, v7.4s, v12.s[2]
OP_rr v28.4s, v4.4s, v12.4s[3] OP_rr v28.4s, v4.4s, v12.s[3]
OP_ii v28.4s, v5.4s, v13.4s[3] OP_ii v28.4s, v5.4s, v13.s[3]
OP_ri v29.4s, v4.4s, v13.4s[3] OP_ri v29.4s, v4.4s, v13.s[3]
OP_ir v29.4s, v5.4s, v12.4s[3] OP_ir v29.4s, v5.4s, v12.s[3]
OP_rr v30.4s, v6.4s, v12.4s[3] OP_rr v30.4s, v6.4s, v12.s[3]
OP_ii v30.4s, v7.4s, v13.4s[3] OP_ii v30.4s, v7.4s, v13.s[3]
OP_ri v31.4s, v6.4s, v13.4s[3] OP_ri v31.4s, v6.4s, v13.s[3]
OP_ir v31.4s, v7.4s, v12.4s[3] OP_ir v31.4s, v7.4s, v12.s[3]
ld2 {v8.4s, v9.4s}, [pB] ld2 {v8.4s, v9.4s}, [pB]
add pB, pB, #32 add pB, pB, #32
@ -375,45 +375,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_E .macro KERNEL8x4_E
OP_rr v16.4s, v4.4s, v12.4s[0] OP_rr v16.4s, v4.4s, v12.s[0]
OP_ii v16.4s, v5.4s, v13.4s[0] OP_ii v16.4s, v5.4s, v13.s[0]
OP_ri v17.4s, v4.4s, v13.4s[0] OP_ri v17.4s, v4.4s, v13.s[0]
OP_ir v17.4s, v5.4s, v12.4s[0] OP_ir v17.4s, v5.4s, v12.s[0]
OP_rr v18.4s, v6.4s, v12.4s[0] OP_rr v18.4s, v6.4s, v12.s[0]
OP_ii v18.4s, v7.4s, v13.4s[0] OP_ii v18.4s, v7.4s, v13.s[0]
OP_ri v19.4s, v6.4s, v13.4s[0] OP_ri v19.4s, v6.4s, v13.s[0]
OP_ir v19.4s, v7.4s, v12.4s[0] OP_ir v19.4s, v7.4s, v12.s[0]
OP_rr v20.4s, v4.4s, v12.4s[1] OP_rr v20.4s, v4.4s, v12.s[1]
OP_ii v20.4s, v5.4s, v13.4s[1] OP_ii v20.4s, v5.4s, v13.s[1]
OP_ri v21.4s, v4.4s, v13.4s[1] OP_ri v21.4s, v4.4s, v13.s[1]
OP_ir v21.4s, v5.4s, v12.4s[1] OP_ir v21.4s, v5.4s, v12.s[1]
OP_rr v22.4s, v6.4s, v12.4s[1] OP_rr v22.4s, v6.4s, v12.s[1]
OP_ii v22.4s, v7.4s, v13.4s[1] OP_ii v22.4s, v7.4s, v13.s[1]
OP_ri v23.4s, v6.4s, v13.4s[1] OP_ri v23.4s, v6.4s, v13.s[1]
OP_ir v23.4s, v7.4s, v12.4s[1] OP_ir v23.4s, v7.4s, v12.s[1]
OP_rr v24.4s, v4.4s, v12.4s[2] OP_rr v24.4s, v4.4s, v12.s[2]
OP_ii v24.4s, v5.4s, v13.4s[2] OP_ii v24.4s, v5.4s, v13.s[2]
OP_ri v25.4s, v4.4s, v13.4s[2] OP_ri v25.4s, v4.4s, v13.s[2]
OP_ir v25.4s, v5.4s, v12.4s[2] OP_ir v25.4s, v5.4s, v12.s[2]
OP_rr v26.4s, v6.4s, v12.4s[2] OP_rr v26.4s, v6.4s, v12.s[2]
OP_ii v26.4s, v7.4s, v13.4s[2] OP_ii v26.4s, v7.4s, v13.s[2]
OP_ri v27.4s, v6.4s, v13.4s[2] OP_ri v27.4s, v6.4s, v13.s[2]
OP_ir v27.4s, v7.4s, v12.4s[2] OP_ir v27.4s, v7.4s, v12.s[2]
OP_rr v28.4s, v4.4s, v12.4s[3] OP_rr v28.4s, v4.4s, v12.s[3]
OP_ii v28.4s, v5.4s, v13.4s[3] OP_ii v28.4s, v5.4s, v13.s[3]
OP_ri v29.4s, v4.4s, v13.4s[3] OP_ri v29.4s, v4.4s, v13.s[3]
OP_ir v29.4s, v5.4s, v12.4s[3] OP_ir v29.4s, v5.4s, v12.s[3]
OP_rr v30.4s, v6.4s, v12.4s[3] OP_rr v30.4s, v6.4s, v12.s[3]
OP_ii v30.4s, v7.4s, v13.4s[3] OP_ii v30.4s, v7.4s, v13.s[3]
OP_ri v31.4s, v6.4s, v13.4s[3] OP_ri v31.4s, v6.4s, v13.s[3]
OP_ir v31.4s, v7.4s, v12.4s[3] OP_ir v31.4s, v7.4s, v12.s[3]
.endm .endm
@ -425,45 +425,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.4s, v3.4s}, [pA] ld2 {v2.4s, v3.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.4s, v0.4s, v8.4s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.4s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
OP_rr v18.4s, v2.4s, v8.4s[0] OP_rr v18.4s, v2.4s, v8.s[0]
OP_ii v18.4s, v3.4s, v9.4s[0] OP_ii v18.4s, v3.4s, v9.s[0]
OP_ri v19.4s, v2.4s, v9.4s[0] OP_ri v19.4s, v2.4s, v9.s[0]
OP_ir v19.4s, v3.4s, v8.4s[0] OP_ir v19.4s, v3.4s, v8.s[0]
OP_rr v20.4s, v0.4s, v8.4s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.4s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
OP_rr v22.4s, v2.4s, v8.4s[1] OP_rr v22.4s, v2.4s, v8.s[1]
OP_ii v22.4s, v3.4s, v9.4s[1] OP_ii v22.4s, v3.4s, v9.s[1]
OP_ri v23.4s, v2.4s, v9.4s[1] OP_ri v23.4s, v2.4s, v9.s[1]
OP_ir v23.4s, v3.4s, v8.4s[1] OP_ir v23.4s, v3.4s, v8.s[1]
OP_rr v24.4s, v0.4s, v8.4s[2] OP_rr v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
OP_ri v25.4s, v0.4s, v9.4s[2] OP_ri v25.4s, v0.4s, v9.s[2]
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
OP_rr v26.4s, v2.4s, v8.4s[2] OP_rr v26.4s, v2.4s, v8.s[2]
OP_ii v26.4s, v3.4s, v9.4s[2] OP_ii v26.4s, v3.4s, v9.s[2]
OP_ri v27.4s, v2.4s, v9.4s[2] OP_ri v27.4s, v2.4s, v9.s[2]
OP_ir v27.4s, v3.4s, v8.4s[2] OP_ir v27.4s, v3.4s, v8.s[2]
OP_rr v28.4s, v0.4s, v8.4s[3] OP_rr v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
OP_ri v29.4s, v0.4s, v9.4s[3] OP_ri v29.4s, v0.4s, v9.s[3]
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
OP_rr v30.4s, v2.4s, v8.4s[3] OP_rr v30.4s, v2.4s, v8.s[3]
OP_ii v30.4s, v3.4s, v9.4s[3] OP_ii v30.4s, v3.4s, v9.s[3]
OP_ri v31.4s, v2.4s, v9.4s[3] OP_ri v31.4s, v2.4s, v9.s[3]
OP_ir v31.4s, v3.4s, v8.4s[3] OP_ir v31.4s, v3.4s, v8.s[3]
.endm .endm
@ -562,49 +562,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA] ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
fmul v16.4s, v0.4s, v8.4s[0] fmul v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v17.16b, v17.16b, v17.16b eor v17.16b, v17.16b, v17.16b
fmls v17.4s, v0.4s, v9.4s[0] fmls v17.4s, v0.4s, v9.s[0]
#else #else
fmul v17.4s, v0.4s, v9.4s[0] fmul v17.4s, v0.4s, v9.s[0]
#endif #endif
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
fmul v20.4s, v0.4s, v8.4s[1] fmul v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v21.16b, v21.16b, v21.16b eor v21.16b, v21.16b, v21.16b
fmls v21.4s, v0.4s, v9.4s[1] fmls v21.4s, v0.4s, v9.s[1]
#else #else
fmul v21.4s, v0.4s, v9.4s[1] fmul v21.4s, v0.4s, v9.s[1]
#endif #endif
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
fmul v24.4s, v0.4s, v8.4s[2] fmul v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v25.16b, v25.16b, v25.16b eor v25.16b, v25.16b, v25.16b
fmls v25.4s, v0.4s, v9.4s[2] fmls v25.4s, v0.4s, v9.s[2]
#else #else
fmul v25.4s, v0.4s, v9.4s[2] fmul v25.4s, v0.4s, v9.s[2]
#endif #endif
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
fmul v28.4s, v0.4s, v8.4s[3] fmul v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v29.16b, v29.16b, v29.16b eor v29.16b, v29.16b, v29.16b
fmls v29.4s, v0.4s, v9.4s[3] fmls v29.4s, v0.4s, v9.s[3]
#else #else
fmul v29.4s, v0.4s, v9.4s[3] fmul v29.4s, v0.4s, v9.s[3]
#endif #endif
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
ld2 {v12.4s, v13.4s}, [pB] ld2 {v12.4s, v13.4s}, [pB]
add pB, pB, #32 add pB, pB, #32
@ -613,85 +613,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x4_M1 .macro KERNEL4x4_M1
OP_rr v16.4s, v0.4s, v8.4s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.4s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
ld2 {v12.4s, v13.4s}, [pB] // For next round ld2 {v12.4s, v13.4s}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
OP_rr v20.4s, v0.4s, v8.4s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.4s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
ld2 {v4.4s, v5.4s}, [pA] // For next round ld2 {v4.4s, v5.4s}, [pA] // For next round
add pA, pA, #32 add pA, pA, #32
OP_rr v24.4s, v0.4s, v8.4s[2] OP_rr v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
OP_ri v25.4s, v0.4s, v9.4s[2] OP_ri v25.4s, v0.4s, v9.s[2]
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
prfm PLDL1KEEP, [pA, #512] prfm PLDL1KEEP, [pA, #512]
OP_rr v28.4s, v0.4s, v8.4s[3] OP_rr v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
OP_ri v29.4s, v0.4s, v9.4s[3] OP_ri v29.4s, v0.4s, v9.s[3]
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
.endm .endm
.macro KERNEL4x4_M2 .macro KERNEL4x4_M2
OP_rr v16.4s, v4.4s, v12.4s[0] OP_rr v16.4s, v4.4s, v12.s[0]
OP_ii v16.4s, v5.4s, v13.4s[0] OP_ii v16.4s, v5.4s, v13.s[0]
OP_ri v17.4s, v4.4s, v13.4s[0] OP_ri v17.4s, v4.4s, v13.s[0]
OP_ir v17.4s, v5.4s, v12.4s[0] OP_ir v17.4s, v5.4s, v12.s[0]
ld2 {v8.4s, v9.4s}, [pB] // For next round ld2 {v8.4s, v9.4s}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
OP_rr v20.4s, v4.4s, v12.4s[1] OP_rr v20.4s, v4.4s, v12.s[1]
OP_ii v20.4s, v5.4s, v13.4s[1] OP_ii v20.4s, v5.4s, v13.s[1]
OP_ri v21.4s, v4.4s, v13.4s[1] OP_ri v21.4s, v4.4s, v13.s[1]
OP_ir v21.4s, v5.4s, v12.4s[1] OP_ir v21.4s, v5.4s, v12.s[1]
ld2 {v0.4s, v1.4s}, [pA] // For next round ld2 {v0.4s, v1.4s}, [pA] // For next round
add pA, pA, #32 add pA, pA, #32
OP_rr v24.4s, v4.4s, v12.4s[2] OP_rr v24.4s, v4.4s, v12.s[2]
OP_ii v24.4s, v5.4s, v13.4s[2] OP_ii v24.4s, v5.4s, v13.s[2]
OP_ri v25.4s, v4.4s, v13.4s[2] OP_ri v25.4s, v4.4s, v13.s[2]
OP_ir v25.4s, v5.4s, v12.4s[2] OP_ir v25.4s, v5.4s, v12.s[2]
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
OP_rr v28.4s, v4.4s, v12.4s[3] OP_rr v28.4s, v4.4s, v12.s[3]
OP_ii v28.4s, v5.4s, v13.4s[3] OP_ii v28.4s, v5.4s, v13.s[3]
OP_ri v29.4s, v4.4s, v13.4s[3] OP_ri v29.4s, v4.4s, v13.s[3]
OP_ir v29.4s, v5.4s, v12.4s[3] OP_ir v29.4s, v5.4s, v12.s[3]
.endm .endm
.macro KERNEL4x4_E .macro KERNEL4x4_E
OP_rr v16.4s, v4.4s, v12.4s[0] OP_rr v16.4s, v4.4s, v12.s[0]
OP_ii v16.4s, v5.4s, v13.4s[0] OP_ii v16.4s, v5.4s, v13.s[0]
OP_ri v17.4s, v4.4s, v13.4s[0] OP_ri v17.4s, v4.4s, v13.s[0]
OP_ir v17.4s, v5.4s, v12.4s[0] OP_ir v17.4s, v5.4s, v12.s[0]
OP_rr v20.4s, v4.4s, v12.4s[1] OP_rr v20.4s, v4.4s, v12.s[1]
OP_ii v20.4s, v5.4s, v13.4s[1] OP_ii v20.4s, v5.4s, v13.s[1]
OP_ri v21.4s, v4.4s, v13.4s[1] OP_ri v21.4s, v4.4s, v13.s[1]
OP_ir v21.4s, v5.4s, v12.4s[1] OP_ir v21.4s, v5.4s, v12.s[1]
OP_rr v24.4s, v4.4s, v12.4s[2] OP_rr v24.4s, v4.4s, v12.s[2]
OP_ii v24.4s, v5.4s, v13.4s[2] OP_ii v24.4s, v5.4s, v13.s[2]
OP_ri v25.4s, v4.4s, v13.4s[2] OP_ri v25.4s, v4.4s, v13.s[2]
OP_ir v25.4s, v5.4s, v12.4s[2] OP_ir v25.4s, v5.4s, v12.s[2]
OP_rr v28.4s, v4.4s, v12.4s[3] OP_rr v28.4s, v4.4s, v12.s[3]
OP_ii v28.4s, v5.4s, v13.4s[3] OP_ii v28.4s, v5.4s, v13.s[3]
OP_ri v29.4s, v4.4s, v13.4s[3] OP_ri v29.4s, v4.4s, v13.s[3]
OP_ir v29.4s, v5.4s, v12.4s[3] OP_ir v29.4s, v5.4s, v12.s[3]
.endm .endm
.macro KERNEL4x4_SUB .macro KERNEL4x4_SUB
@ -700,25 +700,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA] ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.4s, v0.4s, v8.4s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.4s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.4s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
OP_rr v20.4s, v0.4s, v8.4s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.4s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.4s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.4s[1] OP_ir v21.4s, v1.4s, v8.s[1]
OP_rr v24.4s, v0.4s, v8.4s[2] OP_rr v24.4s, v0.4s, v8.s[2]
OP_ii v24.4s, v1.4s, v9.4s[2] OP_ii v24.4s, v1.4s, v9.s[2]
OP_ri v25.4s, v0.4s, v9.4s[2] OP_ri v25.4s, v0.4s, v9.s[2]
OP_ir v25.4s, v1.4s, v8.4s[2] OP_ir v25.4s, v1.4s, v8.s[2]
OP_rr v28.4s, v0.4s, v8.4s[3] OP_rr v28.4s, v0.4s, v8.s[3]
OP_ii v28.4s, v1.4s, v9.4s[3] OP_ii v28.4s, v1.4s, v9.s[3]
OP_ri v29.4s, v0.4s, v9.4s[3] OP_ri v29.4s, v0.4s, v9.s[3]
OP_ir v29.4s, v1.4s, v8.4s[3] OP_ir v29.4s, v1.4s, v8.s[3]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
@ -780,25 +780,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2s, v1.2s}, [pA] ld2 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
OP_rr v16.2s, v0.2s, v8.4s[0] OP_rr v16.2s, v0.2s, v8.s[0]
OP_ii v16.2s, v1.2s, v9.4s[0] OP_ii v16.2s, v1.2s, v9.s[0]
OP_ri v17.2s, v0.2s, v9.4s[0] OP_ri v17.2s, v0.2s, v9.s[0]
OP_ir v17.2s, v1.2s, v8.4s[0] OP_ir v17.2s, v1.2s, v8.s[0]
OP_rr v20.2s, v0.2s, v8.4s[1] OP_rr v20.2s, v0.2s, v8.s[1]
OP_ii v20.2s, v1.2s, v9.4s[1] OP_ii v20.2s, v1.2s, v9.s[1]
OP_ri v21.2s, v0.2s, v9.4s[1] OP_ri v21.2s, v0.2s, v9.s[1]
OP_ir v21.2s, v1.2s, v8.4s[1] OP_ir v21.2s, v1.2s, v8.s[1]
OP_rr v24.2s, v0.2s, v8.4s[2] OP_rr v24.2s, v0.2s, v8.s[2]
OP_ii v24.2s, v1.2s, v9.4s[2] OP_ii v24.2s, v1.2s, v9.s[2]
OP_ri v25.2s, v0.2s, v9.4s[2] OP_ri v25.2s, v0.2s, v9.s[2]
OP_ir v25.2s, v1.2s, v8.4s[2] OP_ir v25.2s, v1.2s, v8.s[2]
OP_rr v28.2s, v0.2s, v8.4s[3] OP_rr v28.2s, v0.2s, v8.s[3]
OP_ii v28.2s, v1.2s, v9.4s[3] OP_ii v28.2s, v1.2s, v9.s[3]
OP_ri v29.2s, v0.2s, v9.4s[3] OP_ri v29.2s, v0.2s, v9.s[3]
OP_ir v29.2s, v1.2s, v8.4s[3] OP_ir v29.2s, v1.2s, v8.s[3]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
@ -860,25 +860,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.s, v1.s}[0], [pA] ld2 {v0.s, v1.s}[0], [pA]
add pA, pA, #8 add pA, pA, #8
OP_rr s16, s0, v8.4s[0] OP_rr s16, s0, v8.s[0]
OP_ii s16, s1, v9.4s[0] OP_ii s16, s1, v9.s[0]
OP_ri s17, s0, v9.4s[0] OP_ri s17, s0, v9.s[0]
OP_ir s17, s1, v8.4s[0] OP_ir s17, s1, v8.s[0]
OP_rr s20, s0, v8.4s[1] OP_rr s20, s0, v8.s[1]
OP_ii s20, s1, v9.4s[1] OP_ii s20, s1, v9.s[1]
OP_ri s21, s0, v9.4s[1] OP_ri s21, s0, v9.s[1]
OP_ir s21, s1, v8.4s[1] OP_ir s21, s1, v8.s[1]
OP_rr s24, s0, v8.4s[2] OP_rr s24, s0, v8.s[2]
OP_ii s24, s1, v9.4s[2] OP_ii s24, s1, v9.s[2]
OP_ri s25, s0, v9.4s[2] OP_ri s25, s0, v9.s[2]
OP_ir s25, s1, v8.4s[2] OP_ir s25, s1, v8.s[2]
OP_rr s28, s0, v8.4s[3] OP_rr s28, s0, v8.s[3]
OP_ii s28, s1, v9.4s[3] OP_ii s28, s1, v9.s[3]
OP_ri s29, s0, v9.4s[3] OP_ri s29, s0, v9.s[3]
OP_ir s29, s1, v8.4s[3] OP_ir s29, s1, v8.s[3]
.endm .endm
.macro SAVE1x4 .macro SAVE1x4
@ -942,25 +942,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.4s, v3.4s}, [pA] ld2 {v2.4s, v3.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.4s, v0.4s, v8.2s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.2s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.2s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.2s[0] OP_ir v17.4s, v1.4s, v8.s[0]
OP_rr v18.4s, v2.4s, v8.2s[0] OP_rr v18.4s, v2.4s, v8.s[0]
OP_ii v18.4s, v3.4s, v9.2s[0] OP_ii v18.4s, v3.4s, v9.s[0]
OP_ri v19.4s, v2.4s, v9.2s[0] OP_ri v19.4s, v2.4s, v9.s[0]
OP_ir v19.4s, v3.4s, v8.2s[0] OP_ir v19.4s, v3.4s, v8.s[0]
OP_rr v20.4s, v0.4s, v8.2s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.2s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.2s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.2s[1] OP_ir v21.4s, v1.4s, v8.s[1]
OP_rr v22.4s, v2.4s, v8.2s[1] OP_rr v22.4s, v2.4s, v8.s[1]
OP_ii v22.4s, v3.4s, v9.2s[1] OP_ii v22.4s, v3.4s, v9.s[1]
OP_ri v23.4s, v2.4s, v9.2s[1] OP_ri v23.4s, v2.4s, v9.s[1]
OP_ir v23.4s, v3.4s, v8.2s[1] OP_ir v23.4s, v3.4s, v8.s[1]
.endm .endm
.macro SAVE8x2 .macro SAVE8x2
@ -1018,15 +1018,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.4s, v1.4s}, [pA] ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.4s, v0.4s, v8.2s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v9.2s[0] OP_ii v16.4s, v1.4s, v9.s[0]
OP_ri v17.4s, v0.4s, v9.2s[0] OP_ri v17.4s, v0.4s, v9.s[0]
OP_ir v17.4s, v1.4s, v8.2s[0] OP_ir v17.4s, v1.4s, v8.s[0]
OP_rr v20.4s, v0.4s, v8.2s[1] OP_rr v20.4s, v0.4s, v8.s[1]
OP_ii v20.4s, v1.4s, v9.2s[1] OP_ii v20.4s, v1.4s, v9.s[1]
OP_ri v21.4s, v0.4s, v9.2s[1] OP_ri v21.4s, v0.4s, v9.s[1]
OP_ir v21.4s, v1.4s, v8.2s[1] OP_ir v21.4s, v1.4s, v8.s[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
@ -1066,15 +1066,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2s, v1.2s}, [pA] ld2 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
OP_rr v16.2s, v0.2s, v8.2s[0] OP_rr v16.2s, v0.2s, v8.s[0]
OP_ii v16.2s, v1.2s, v9.2s[0] OP_ii v16.2s, v1.2s, v9.s[0]
OP_ri v17.2s, v0.2s, v9.2s[0] OP_ri v17.2s, v0.2s, v9.s[0]
OP_ir v17.2s, v1.2s, v8.2s[0] OP_ir v17.2s, v1.2s, v8.s[0]
OP_rr v20.2s, v0.2s, v8.2s[1] OP_rr v20.2s, v0.2s, v8.s[1]
OP_ii v20.2s, v1.2s, v9.2s[1] OP_ii v20.2s, v1.2s, v9.s[1]
OP_ri v21.2s, v0.2s, v9.2s[1] OP_ri v21.2s, v0.2s, v9.s[1]
OP_ir v21.2s, v1.2s, v8.2s[1] OP_ir v21.2s, v1.2s, v8.s[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
@ -1114,15 +1114,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.s, v1.s}[0], [pA] ld2 {v0.s, v1.s}[0], [pA]
add pA, pA, #8 add pA, pA, #8
OP_rr s16, s0, v8.2s[0] OP_rr s16, s0, v8.s[0]
OP_ii s16, s1, v9.2s[0] OP_ii s16, s1, v9.s[0]
OP_ri s17, s0, v9.2s[0] OP_ri s17, s0, v9.s[0]
OP_ir s17, s1, v8.2s[0] OP_ir s17, s1, v8.s[0]
OP_rr s20, s0, v8.2s[1] OP_rr s20, s0, v8.s[1]
OP_ii s20, s1, v9.2s[1] OP_ii s20, s1, v9.s[1]
OP_ri s21, s0, v9.2s[1] OP_ri s21, s0, v9.s[1]
OP_ir s21, s1, v8.2s[1] OP_ir s21, s1, v8.s[1]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2
@ -1164,15 +1164,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.4s, v3.4s}, [pA] ld2 {v2.4s, v3.4s}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.4s, v0.4s, v8.4s[0] OP_rr v16.4s, v0.4s, v8.s[0]
OP_ii v16.4s, v1.4s, v8.4s[1] OP_ii v16.4s, v1.4s, v8.s[1]
OP_ri v17.4s, v0.4s, v8.4s[1] OP_ri v17.4s, v0.4s, v8.s[1]
OP_ir v17.4s, v1.4s, v8.4s[0] OP_ir v17.4s, v1.4s, v8.s[0]
OP_rr v18.4s, v2.4s, v8.4s[0] OP_rr v18.4s, v2.4s, v8.s[0]
OP_ii v18.4s, v3.4s, v8.4s[1] OP_ii v18.4s, v3.4s, v8.s[1]
OP_ri v19.4s, v2.4s, v8.4s[1] OP_ri v19.4s, v2.4s, v8.s[1]
OP_ir v19.4s, v3.4s, v8.4s[0] OP_ir v19.4s, v3.4s, v8.s[0]
.endm .endm
.macro SAVE8x1 .macro SAVE8x1

View File

@ -161,150 +161,150 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldp q0, q1, [pA] ldp q0, q1, [pA]
add pA, pA, #32 add pA, pA, #32
fmul v16.2d, v0.2d, v8.2d[0] fmul v16.2d, v0.2d, v8.d[0]
fmul v29.2d, v1.2d, v11.2d[0] fmul v29.2d, v1.2d, v11.d[0]
ldp q2, q3, [ppA] ldp q2, q3, [ppA]
add ppA, ppA, #32 add ppA, ppA, #32
fmul v20.2d, v0.2d, v9.2d[0] fmul v20.2d, v0.2d, v9.d[0]
fmul v25.2d, v1.2d, v10.2d[0] fmul v25.2d, v1.2d, v10.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmul v18.2d, v2.2d, v8.2d[0] fmul v18.2d, v2.2d, v8.d[0]
fmul v31.2d, v3.2d, v11.2d[0] fmul v31.2d, v3.2d, v11.d[0]
prfm PLDL1KEEP, [ppA, #A_PRE_SIZE] prfm PLDL1KEEP, [ppA, #A_PRE_SIZE]
fmul v22.2d, v2.2d, v9.2d[0] fmul v22.2d, v2.2d, v9.d[0]
fmul v27.2d, v3.2d, v10.2d[0] fmul v27.2d, v3.2d, v10.d[0]
ldp d12, d13, [pB] ldp d12, d13, [pB]
add pB, pB, #16 add pB, pB, #16
fmul v24.2d, v0.2d, v10.2d[0] fmul v24.2d, v0.2d, v10.d[0]
fmul v21.2d, v1.2d, v9.2d[0] fmul v21.2d, v1.2d, v9.d[0]
ldp q4, q5, [pA] // for next round ldp q4, q5, [pA] // for next round
add pA, pA, #32 add pA, pA, #32
fmul v26.2d, v2.2d, v10.2d[0] fmul v26.2d, v2.2d, v10.d[0]
fmul v23.2d, v3.2d, v9.2d[0] fmul v23.2d, v3.2d, v9.d[0]
ldp q6, q7, [ppA] // for next round ldp q6, q7, [ppA] // for next round
add ppA, ppA, #32 add ppA, ppA, #32
fmul v28.2d, v0.2d, v11.2d[0] fmul v28.2d, v0.2d, v11.d[0]
fmul v17.2d, v1.2d, v8.2d[0] fmul v17.2d, v1.2d, v8.d[0]
ldp d14, d15, [pB] ldp d14, d15, [pB]
add pB, pB, #16 add pB, pB, #16
fmul v30.2d, v2.2d, v11.2d[0] fmul v30.2d, v2.2d, v11.d[0]
fmul v19.2d, v3.2d, v8.2d[0] fmul v19.2d, v3.2d, v8.d[0]
.endm .endm
.macro KERNEL8x4_M2 .macro KERNEL8x4_M2
fmla v16.2d, v4.2d, v12.2d[0] fmla v16.2d, v4.2d, v12.d[0]
fmla v29.2d, v5.2d, v15.2d[0] fmla v29.2d, v5.2d, v15.d[0]
ldp d8, d9, [pB] ldp d8, d9, [pB]
add pB, pB, #16 add pB, pB, #16
fmla v18.2d, v6.2d, v12.2d[0] fmla v18.2d, v6.2d, v12.d[0]
fmla v31.2d, v7.2d, v15.2d[0] fmla v31.2d, v7.2d, v15.d[0]
ldp d10, d11, [pB] ldp d10, d11, [pB]
add pB, pB, #16 add pB, pB, #16
fmla v20.2d, v4.2d, v13.2d[0] fmla v20.2d, v4.2d, v13.d[0]
fmla v25.2d, v5.2d, v14.2d[0] fmla v25.2d, v5.2d, v14.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla v22.2d, v6.2d, v13.2d[0] fmla v22.2d, v6.2d, v13.d[0]
fmla v27.2d, v7.2d, v14.2d[0] fmla v27.2d, v7.2d, v14.d[0]
fmla v24.2d, v4.2d, v14.2d[0] fmla v24.2d, v4.2d, v14.d[0]
fmla v21.2d, v5.2d, v13.2d[0] fmla v21.2d, v5.2d, v13.d[0]
ldp q0, q1, [pA] ldp q0, q1, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v26.2d, v6.2d, v14.2d[0] fmla v26.2d, v6.2d, v14.d[0]
fmla v23.2d, v7.2d, v13.2d[0] fmla v23.2d, v7.2d, v13.d[0]
fmla v28.2d, v4.2d, v15.2d[0] fmla v28.2d, v4.2d, v15.d[0]
fmla v17.2d, v5.2d, v12.2d[0] fmla v17.2d, v5.2d, v12.d[0]
ldp q2, q3, [ppA] ldp q2, q3, [ppA]
add ppA, ppA, #32 add ppA, ppA, #32
fmla v30.2d, v6.2d, v15.2d[0] fmla v30.2d, v6.2d, v15.d[0]
fmla v19.2d, v7.2d, v12.2d[0] fmla v19.2d, v7.2d, v12.d[0]
.endm .endm
.macro KERNEL8x4_M1 .macro KERNEL8x4_M1
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v29.2d, v1.2d, v11.2d[0] fmla v29.2d, v1.2d, v11.d[0]
ldp d12, d13, [pB] ldp d12, d13, [pB]
add pB, pB, #16 add pB, pB, #16
fmla v18.2d, v2.2d, v8.2d[0] fmla v18.2d, v2.2d, v8.d[0]
fmla v31.2d, v3.2d, v11.2d[0] fmla v31.2d, v3.2d, v11.d[0]
ldp d14, d15, [pB] ldp d14, d15, [pB]
add pB, pB, #16 add pB, pB, #16
fmla v20.2d, v0.2d, v9.2d[0] fmla v20.2d, v0.2d, v9.d[0]
fmla v25.2d, v1.2d, v10.2d[0] fmla v25.2d, v1.2d, v10.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla v22.2d, v2.2d, v9.2d[0] fmla v22.2d, v2.2d, v9.d[0]
fmla v27.2d, v3.2d, v10.2d[0] fmla v27.2d, v3.2d, v10.d[0]
prfm PLDL1KEEP, [ppA, #A_PRE_SIZE] prfm PLDL1KEEP, [ppA, #A_PRE_SIZE]
fmla v24.2d, v0.2d, v10.2d[0] fmla v24.2d, v0.2d, v10.d[0]
fmla v21.2d, v1.2d, v9.2d[0] fmla v21.2d, v1.2d, v9.d[0]
ldp q4, q5, [pA] ldp q4, q5, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v26.2d, v2.2d, v10.2d[0] fmla v26.2d, v2.2d, v10.d[0]
fmla v23.2d, v3.2d, v9.2d[0] fmla v23.2d, v3.2d, v9.d[0]
fmla v28.2d, v0.2d, v11.2d[0] fmla v28.2d, v0.2d, v11.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
ldp q6, q7, [ppA] ldp q6, q7, [ppA]
add ppA, ppA, #32 add ppA, ppA, #32
fmla v30.2d, v2.2d, v11.2d[0] fmla v30.2d, v2.2d, v11.d[0]
fmla v19.2d, v3.2d, v8.2d[0] fmla v19.2d, v3.2d, v8.d[0]
.endm .endm
.macro KERNEL8x4_E .macro KERNEL8x4_E
fmla v16.2d, v4.2d, v12.2d[0] fmla v16.2d, v4.2d, v12.d[0]
fmla v25.2d, v5.2d, v14.2d[0] fmla v25.2d, v5.2d, v14.d[0]
fmla v18.2d, v6.2d, v12.2d[0] fmla v18.2d, v6.2d, v12.d[0]
fmla v27.2d, v7.2d, v14.2d[0] fmla v27.2d, v7.2d, v14.d[0]
fmla v20.2d, v4.2d, v13.2d[0] fmla v20.2d, v4.2d, v13.d[0]
fmla v29.2d, v5.2d, v15.2d[0] fmla v29.2d, v5.2d, v15.d[0]
fmla v22.2d, v6.2d, v13.2d[0] fmla v22.2d, v6.2d, v13.d[0]
fmla v31.2d, v7.2d, v15.2d[0] fmla v31.2d, v7.2d, v15.d[0]
fmla v24.2d, v4.2d, v14.2d[0] fmla v24.2d, v4.2d, v14.d[0]
fmla v17.2d, v5.2d, v12.2d[0] fmla v17.2d, v5.2d, v12.d[0]
fmla v26.2d, v6.2d, v14.2d[0] fmla v26.2d, v6.2d, v14.d[0]
fmla v19.2d, v7.2d, v12.2d[0] fmla v19.2d, v7.2d, v12.d[0]
fmla v28.2d, v4.2d, v15.2d[0] fmla v28.2d, v4.2d, v15.d[0]
fmla v21.2d, v5.2d, v13.2d[0] fmla v21.2d, v5.2d, v13.d[0]
fmla v30.2d, v6.2d, v15.2d[0] fmla v30.2d, v6.2d, v15.d[0]
fmla v23.2d, v7.2d, v13.2d[0] fmla v23.2d, v7.2d, v13.d[0]
.endm .endm
.macro KERNEL8x4_SUB .macro KERNEL8x4_SUB
@ -315,28 +315,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldp q0, q1, [pA] ldp q0, q1, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v29.2d, v1.2d, v11.2d[0] fmla v29.2d, v1.2d, v11.d[0]
fmla v20.2d, v0.2d, v9.2d[0] fmla v20.2d, v0.2d, v9.d[0]
fmla v25.2d, v1.2d, v10.2d[0] fmla v25.2d, v1.2d, v10.d[0]
ldp q2, q3, [ppA] ldp q2, q3, [ppA]
add ppA, ppA, #32 add ppA, ppA, #32
fmla v24.2d, v0.2d, v10.2d[0] fmla v24.2d, v0.2d, v10.d[0]
fmla v21.2d, v1.2d, v9.2d[0] fmla v21.2d, v1.2d, v9.d[0]
fmla v28.2d, v0.2d, v11.2d[0] fmla v28.2d, v0.2d, v11.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v18.2d, v2.2d, v8.2d[0] fmla v18.2d, v2.2d, v8.d[0]
fmla v31.2d, v3.2d, v11.2d[0] fmla v31.2d, v3.2d, v11.d[0]
fmla v22.2d, v2.2d, v9.2d[0] fmla v22.2d, v2.2d, v9.d[0]
fmla v27.2d, v3.2d, v10.2d[0] fmla v27.2d, v3.2d, v10.d[0]
fmla v26.2d, v2.2d, v10.2d[0] fmla v26.2d, v2.2d, v10.d[0]
fmla v23.2d, v3.2d, v9.2d[0] fmla v23.2d, v3.2d, v9.d[0]
fmla v30.2d, v2.2d, v11.2d[0] fmla v30.2d, v2.2d, v11.d[0]
fmla v19.2d, v3.2d, v8.2d[0] fmla v19.2d, v3.2d, v8.d[0]
.endm .endm
.macro SAVE8x4 .macro SAVE8x4
@ -422,17 +422,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v29.2d, v1.2d, v9.2d[1] fmla v29.2d, v1.2d, v9.d[1]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v25.2d, v1.2d, v9.2d[0] fmla v25.2d, v1.2d, v9.d[0]
fmla v24.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v9.d[0]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
fmla v28.2d, v0.2d, v9.2d[1] fmla v28.2d, v0.2d, v9.d[1]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
@ -482,10 +482,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v24.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v9.d[0]
fmla v28.2d, v0.2d, v9.2d[1] fmla v28.2d, v0.2d, v9.d[1]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
@ -572,10 +572,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
@ -610,8 +610,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
@ -643,7 +643,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr d0 , [pA] ldr d0 , [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2d, v8.2d, v0.2d[0] fmla v16.2d, v8.2d, v0.d[0]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2
@ -674,8 +674,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA , pA, #32 add pA , pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
.endm .endm
.macro SAVE4x1 .macro SAVE4x1
@ -705,7 +705,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA , pA, #16 add pA , pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
.endm .endm
.macro SAVE2x1 .macro SAVE2x1

284
kernel/arm64/dgemm_kernel_4x8.S Executable file → Normal file
View File

@ -154,25 +154,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v10.2d, v11.2d}, [pB] ld1 {v10.2d, v11.2d}, [pB]
add pB, pB, #32 add pB, pB, #32
fmul v16.2d, v0.2d, v8.2d[0] fmul v16.2d, v0.2d, v8.d[0]
fmul v17.2d, v1.2d, v8.2d[0] fmul v17.2d, v1.2d, v8.d[0]
fmul v18.2d, v0.2d, v8.2d[1] fmul v18.2d, v0.2d, v8.d[1]
fmul v19.2d, v1.2d, v8.2d[1] fmul v19.2d, v1.2d, v8.d[1]
fmul v20.2d, v0.2d, v9.2d[0] fmul v20.2d, v0.2d, v9.d[0]
fmul v21.2d, v1.2d, v9.2d[0] fmul v21.2d, v1.2d, v9.d[0]
fmul v22.2d, v0.2d, v9.2d[1] fmul v22.2d, v0.2d, v9.d[1]
fmul v23.2d, v1.2d, v9.2d[1] fmul v23.2d, v1.2d, v9.d[1]
fmul v24.2d, v0.2d, v10.2d[0] fmul v24.2d, v0.2d, v10.d[0]
fmul v25.2d, v1.2d, v10.2d[0] fmul v25.2d, v1.2d, v10.d[0]
fmul v26.2d, v0.2d, v10.2d[1] fmul v26.2d, v0.2d, v10.d[1]
fmul v27.2d, v1.2d, v10.2d[1] fmul v27.2d, v1.2d, v10.d[1]
fmul v28.2d, v0.2d, v11.2d[0] fmul v28.2d, v0.2d, v11.d[0]
fmul v29.2d, v1.2d, v11.2d[0] fmul v29.2d, v1.2d, v11.d[0]
fmul v30.2d, v0.2d, v11.2d[1] fmul v30.2d, v0.2d, v11.d[1]
fmul v31.2d, v1.2d, v11.2d[1] fmul v31.2d, v1.2d, v11.d[1]
ld1 {v12.2d, v13.2d}, [pB] ld1 {v12.2d, v13.2d}, [pB]
add pB, pB, #32 add pB, pB, #32
@ -183,25 +183,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x8_M1 .macro KERNEL4x8_M1
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v18.2d, v0.2d, v8.2d[1] fmla v18.2d, v0.2d, v8.d[1]
fmla v19.2d, v1.2d, v8.2d[1] fmla v19.2d, v1.2d, v8.d[1]
fmla v20.2d, v0.2d, v9.2d[0] fmla v20.2d, v0.2d, v9.d[0]
fmla v21.2d, v1.2d, v9.2d[0] fmla v21.2d, v1.2d, v9.d[0]
fmla v22.2d, v0.2d, v9.2d[1] fmla v22.2d, v0.2d, v9.d[1]
fmla v23.2d, v1.2d, v9.2d[1] fmla v23.2d, v1.2d, v9.d[1]
fmla v24.2d, v0.2d, v10.2d[0] fmla v24.2d, v0.2d, v10.d[0]
fmla v25.2d, v1.2d, v10.2d[0] fmla v25.2d, v1.2d, v10.d[0]
fmla v26.2d, v0.2d, v10.2d[1] fmla v26.2d, v0.2d, v10.d[1]
fmla v27.2d, v1.2d, v10.2d[1] fmla v27.2d, v1.2d, v10.d[1]
fmla v28.2d, v0.2d, v11.2d[0] fmla v28.2d, v0.2d, v11.d[0]
fmla v29.2d, v1.2d, v11.2d[0] fmla v29.2d, v1.2d, v11.d[0]
fmla v30.2d, v0.2d, v11.2d[1] fmla v30.2d, v0.2d, v11.d[1]
fmla v31.2d, v1.2d, v11.2d[1] fmla v31.2d, v1.2d, v11.d[1]
ld1 {v12.2d, v13.2d}, [pB] // For next round ld1 {v12.2d, v13.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
@ -214,25 +214,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x8_M2 .macro KERNEL4x8_M2
fmla v16.2d, v4.2d, v12.2d[0] fmla v16.2d, v4.2d, v12.d[0]
fmla v17.2d, v5.2d, v12.2d[0] fmla v17.2d, v5.2d, v12.d[0]
fmla v18.2d, v4.2d, v12.2d[1] fmla v18.2d, v4.2d, v12.d[1]
fmla v19.2d, v5.2d, v12.2d[1] fmla v19.2d, v5.2d, v12.d[1]
fmla v20.2d, v4.2d, v13.2d[0] fmla v20.2d, v4.2d, v13.d[0]
fmla v21.2d, v5.2d, v13.2d[0] fmla v21.2d, v5.2d, v13.d[0]
fmla v22.2d, v4.2d, v13.2d[1] fmla v22.2d, v4.2d, v13.d[1]
fmla v23.2d, v5.2d, v13.2d[1] fmla v23.2d, v5.2d, v13.d[1]
fmla v24.2d, v4.2d, v14.2d[0] fmla v24.2d, v4.2d, v14.d[0]
fmla v25.2d, v5.2d, v14.2d[0] fmla v25.2d, v5.2d, v14.d[0]
fmla v26.2d, v4.2d, v14.2d[1] fmla v26.2d, v4.2d, v14.d[1]
fmla v27.2d, v5.2d, v14.2d[1] fmla v27.2d, v5.2d, v14.d[1]
fmla v28.2d, v4.2d, v15.2d[0] fmla v28.2d, v4.2d, v15.d[0]
fmla v29.2d, v5.2d, v15.2d[0] fmla v29.2d, v5.2d, v15.d[0]
fmla v30.2d, v4.2d, v15.2d[1] fmla v30.2d, v4.2d, v15.d[1]
fmla v31.2d, v5.2d, v15.2d[1] fmla v31.2d, v5.2d, v15.d[1]
ld1 {v8.2d, v9.2d}, [pB] // For next round ld1 {v8.2d, v9.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
@ -245,25 +245,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x8_E .macro KERNEL4x8_E
fmla v16.2d, v4.2d, v12.2d[0] fmla v16.2d, v4.2d, v12.d[0]
fmla v17.2d, v5.2d, v12.2d[0] fmla v17.2d, v5.2d, v12.d[0]
fmla v18.2d, v4.2d, v12.2d[1] fmla v18.2d, v4.2d, v12.d[1]
fmla v19.2d, v5.2d, v12.2d[1] fmla v19.2d, v5.2d, v12.d[1]
fmla v20.2d, v4.2d, v13.2d[0] fmla v20.2d, v4.2d, v13.d[0]
fmla v21.2d, v5.2d, v13.2d[0] fmla v21.2d, v5.2d, v13.d[0]
fmla v22.2d, v4.2d, v13.2d[1] fmla v22.2d, v4.2d, v13.d[1]
fmla v23.2d, v5.2d, v13.2d[1] fmla v23.2d, v5.2d, v13.d[1]
fmla v24.2d, v4.2d, v14.2d[0] fmla v24.2d, v4.2d, v14.d[0]
fmla v25.2d, v5.2d, v14.2d[0] fmla v25.2d, v5.2d, v14.d[0]
fmla v26.2d, v4.2d, v14.2d[1] fmla v26.2d, v4.2d, v14.d[1]
fmla v27.2d, v5.2d, v14.2d[1] fmla v27.2d, v5.2d, v14.d[1]
fmla v28.2d, v4.2d, v15.2d[0] fmla v28.2d, v4.2d, v15.d[0]
fmla v29.2d, v5.2d, v15.2d[0] fmla v29.2d, v5.2d, v15.d[0]
fmla v30.2d, v4.2d, v15.2d[1] fmla v30.2d, v4.2d, v15.d[1]
fmla v31.2d, v5.2d, v15.2d[1] fmla v31.2d, v5.2d, v15.d[1]
.endm .endm
.macro KERNEL4x8_SUB .macro KERNEL4x8_SUB
@ -274,25 +274,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v10.2d, v11.2d}, [pB] ld1 {v10.2d, v11.2d}, [pB]
add pB, pB, #32 add pB, pB, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v18.2d, v0.2d, v8.2d[1] fmla v18.2d, v0.2d, v8.d[1]
fmla v19.2d, v1.2d, v8.2d[1] fmla v19.2d, v1.2d, v8.d[1]
fmla v20.2d, v0.2d, v9.2d[0] fmla v20.2d, v0.2d, v9.d[0]
fmla v21.2d, v1.2d, v9.2d[0] fmla v21.2d, v1.2d, v9.d[0]
fmla v22.2d, v0.2d, v9.2d[1] fmla v22.2d, v0.2d, v9.d[1]
fmla v23.2d, v1.2d, v9.2d[1] fmla v23.2d, v1.2d, v9.d[1]
fmla v24.2d, v0.2d, v10.2d[0] fmla v24.2d, v0.2d, v10.d[0]
fmla v25.2d, v1.2d, v10.2d[0] fmla v25.2d, v1.2d, v10.d[0]
fmla v26.2d, v0.2d, v10.2d[1] fmla v26.2d, v0.2d, v10.d[1]
fmla v27.2d, v1.2d, v10.2d[1] fmla v27.2d, v1.2d, v10.d[1]
fmla v28.2d, v0.2d, v11.2d[0] fmla v28.2d, v0.2d, v11.d[0]
fmla v29.2d, v1.2d, v11.2d[0] fmla v29.2d, v1.2d, v11.d[0]
fmla v30.2d, v0.2d, v11.2d[1] fmla v30.2d, v0.2d, v11.d[1]
fmla v31.2d, v1.2d, v11.2d[1] fmla v31.2d, v1.2d, v11.d[1]
.endm .endm
.macro SAVE4x8 .macro SAVE4x8
@ -374,17 +374,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v10.2d, v11.2d}, [pB] ld1 {v10.2d, v11.2d}, [pB]
add pB, pB, #32 add pB, pB, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v18.2d, v0.2d, v8.2d[1] fmla v18.2d, v0.2d, v8.d[1]
fmla v20.2d, v0.2d, v9.2d[0] fmla v20.2d, v0.2d, v9.d[0]
fmla v22.2d, v0.2d, v9.2d[1] fmla v22.2d, v0.2d, v9.d[1]
fmla v24.2d, v0.2d, v10.2d[0] fmla v24.2d, v0.2d, v10.d[0]
fmla v26.2d, v0.2d, v10.2d[1] fmla v26.2d, v0.2d, v10.d[1]
fmla v28.2d, v0.2d, v11.2d[0] fmla v28.2d, v0.2d, v11.d[0]
fmla v30.2d, v0.2d, v11.2d[1] fmla v30.2d, v0.2d, v11.d[1]
.endm .endm
.macro SAVE2x8 .macro SAVE2x8
@ -520,17 +520,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmul v16.2d, v0.2d, v8.2d[0] fmul v16.2d, v0.2d, v8.d[0]
fmul v29.2d, v1.2d, v9.2d[1] fmul v29.2d, v1.2d, v9.d[1]
fmul v20.2d, v0.2d, v8.2d[1] fmul v20.2d, v0.2d, v8.d[1]
fmul v25.2d, v1.2d, v9.2d[0] fmul v25.2d, v1.2d, v9.d[0]
fmul v24.2d, v0.2d, v9.2d[0] fmul v24.2d, v0.2d, v9.d[0]
fmul v21.2d, v1.2d, v8.2d[1] fmul v21.2d, v1.2d, v8.d[1]
fmul v28.2d, v0.2d, v9.2d[1] fmul v28.2d, v0.2d, v9.d[1]
fmul v17.2d, v1.2d, v8.2d[0] fmul v17.2d, v1.2d, v8.d[0]
ld1 {v12.2d, v13.2d}, [pB] ld1 {v12.2d, v13.2d}, [pB]
add pB, pB, #32 add pB, pB, #32
@ -539,61 +539,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x4_M1 .macro KERNEL4x4_M1
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v29.2d, v1.2d, v9.2d[1] fmla v29.2d, v1.2d, v9.d[1]
ld1 {v12.2d, v13.2d}, [pB] // For next round ld1 {v12.2d, v13.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v25.2d, v1.2d, v9.2d[0] fmla v25.2d, v1.2d, v9.d[0]
ld1 {v4.2d, v5.2d}, [pA] // For next round ld1 {v4.2d, v5.2d}, [pA] // For next round
add pA, pA, #32 add pA, pA, #32
fmla v24.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v9.d[0]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
prfm PLDL1KEEP, [pA, #512] prfm PLDL1KEEP, [pA, #512]
fmla v28.2d, v0.2d, v9.2d[1] fmla v28.2d, v0.2d, v9.d[1]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
.endm .endm
.macro KERNEL4x4_M2 .macro KERNEL4x4_M2
fmla v16.2d, v4.2d, v12.2d[0] fmla v16.2d, v4.2d, v12.d[0]
fmla v29.2d, v5.2d, v13.2d[1] fmla v29.2d, v5.2d, v13.d[1]
ld1 {v8.2d, v9.2d}, [pB] // For next round ld1 {v8.2d, v9.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
fmla v20.2d, v4.2d, v12.2d[1] fmla v20.2d, v4.2d, v12.d[1]
fmla v25.2d, v5.2d, v13.2d[0] fmla v25.2d, v5.2d, v13.d[0]
ld1 {v0.2d, v1.2d}, [pA] // For next round ld1 {v0.2d, v1.2d}, [pA] // For next round
add pA, pA, #32 add pA, pA, #32
fmla v24.2d, v4.2d, v13.2d[0] fmla v24.2d, v4.2d, v13.d[0]
fmla v21.2d, v5.2d, v12.2d[1] fmla v21.2d, v5.2d, v12.d[1]
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
fmla v28.2d, v4.2d, v13.2d[1] fmla v28.2d, v4.2d, v13.d[1]
fmla v17.2d, v5.2d, v12.2d[0] fmla v17.2d, v5.2d, v12.d[0]
.endm .endm
.macro KERNEL4x4_E .macro KERNEL4x4_E
fmla v16.2d, v4.2d, v12.2d[0] fmla v16.2d, v4.2d, v12.d[0]
fmla v29.2d, v5.2d, v13.2d[1] fmla v29.2d, v5.2d, v13.d[1]
fmla v20.2d, v4.2d, v12.2d[1] fmla v20.2d, v4.2d, v12.d[1]
fmla v25.2d, v5.2d, v13.2d[0] fmla v25.2d, v5.2d, v13.d[0]
fmla v24.2d, v4.2d, v13.2d[0] fmla v24.2d, v4.2d, v13.d[0]
fmla v21.2d, v5.2d, v12.2d[1] fmla v21.2d, v5.2d, v12.d[1]
fmla v28.2d, v4.2d, v13.2d[1] fmla v28.2d, v4.2d, v13.d[1]
fmla v17.2d, v5.2d, v12.2d[0] fmla v17.2d, v5.2d, v12.d[0]
.endm .endm
.macro KERNEL4x4_SUB .macro KERNEL4x4_SUB
@ -602,17 +602,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v29.2d, v1.2d, v9.2d[1] fmla v29.2d, v1.2d, v9.d[1]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v25.2d, v1.2d, v9.2d[0] fmla v25.2d, v1.2d, v9.d[0]
fmla v24.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v9.d[0]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
fmla v28.2d, v0.2d, v9.2d[1] fmla v28.2d, v0.2d, v9.d[1]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
@ -660,10 +660,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v24.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v9.d[0]
fmla v28.2d, v0.2d, v9.2d[1] fmla v28.2d, v0.2d, v9.d[1]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
@ -746,10 +746,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
@ -782,8 +782,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
@ -813,7 +813,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr d0 , [pA] ldr d0 , [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2d, v8.2d, v0.2d[0] fmla v16.2d, v8.2d, v0.d[0]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2
@ -842,8 +842,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA , pA, #32 add pA , pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
.endm .endm
.macro SAVE4x1 .macro SAVE4x1
@ -871,7 +871,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA , pA, #16 add pA , pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
.endm .endm
.macro SAVE2x1 .macro SAVE2x1

478
kernel/arm64/dgemm_kernel_8x4.S Executable file → Normal file
View File

@ -52,12 +52,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define alpha0 d10 #define alpha0 d10
#define alphaV0 v10.d[0] #define alphaV0 v10.d[0]
#define alpha1 d11
#define alphaV1 v11.d[0] #define A_PRE_SIZE 2560
#define alpha2 d14 #define B_PRE_SIZE 448
#define alphaV2 v14.d[0] #define C_PRE_SIZE 128
#define alpha3 d15
#define alphaV3 v15.d[0]
// 00 origM // 00 origM
// 01 origN // 01 origN
@ -74,8 +72,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 12 pCRow0 // 12 pCRow0
// 13 pCRow1 // 13 pCRow1
// 14 pCRow2 // 14 pCRow2
// 15 pA // 15 pCRow3
// 16 // 16 pA
// 17 // 17
// 18 must save // 18 must save
// 19 must save // 19 must save
@ -100,14 +98,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//v05 pA1_2, pA1_3 //v05 pA1_2, pA1_3
//v06 pA1_4, pA1_5 //v06 pA1_4, pA1_5
//v07 pA1_6, pA1_7 //v07 pA1_6, pA1_7
//v08 must save pB0_0, pB0_1 //v08 must save pB0_0
//v09 must save pB0_2, pB0_3 //v09 must save pB0_1
//v10 must save ALPHA0 //v10 must save pB0_2 --> ALPHA0
//v11 must save ALPHA1 //v11 must save pB0_3
//v12 must save pB1_0, pB1_1 //v12 must save pB1_0
//v13 must save pB1_2, pB1_3 //v13 must save pB1_1
//v14 must save ALPHA2 //v14 must save pB1_2
//v15 must save ALPHA3 //v15 must save pB1_3
//v16 must save C00, C01 //v16 must save C00, C01
//v17 must save C02, C03 //v17 must save C02, C03
//v18 C04, C05 //v18 C04, C05
@ -149,244 +147,257 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_I .macro KERNEL8x4_I
ld1 {v0.2d, v1.2d}, [pA] ldp q0, q1, [pA], #32
add pA, pA, #32
ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
ldp d8, d9, [pB]
add pB, pB, #16
ldp d10, d11, [pB]
add pB, pB, #16
fmul v16.2d, v0.2d, v8.2d[0] ldp d8, d9, [pB], #16
fmul v17.2d, v1.2d, v8.2d[0]
fmul v18.2d, v2.2d, v8.2d[0] fmul v16.2d, v0.2d, v8.d[0]
fmul v19.2d, v3.2d, v8.2d[0] fmul v20.2d, v0.2d, v9.d[0]
fmul v20.2d, v0.2d, v9.2d[0] ldp d10, d11, [pB], #16
fmul v21.2d, v1.2d, v9.2d[0]
fmul v22.2d, v2.2d, v9.2d[0] fmul v17.2d, v1.2d, v8.d[0]
fmul v23.2d, v3.2d, v9.2d[0] fmul v21.2d, v1.2d, v9.d[0]
fmul v24.2d, v0.2d, v10.2d[0] ldp q2, q3, [pA], #32
fmul v25.2d, v1.2d, v10.2d[0]
fmul v26.2d, v2.2d, v10.2d[0] fmul v24.2d, v0.2d, v10.d[0]
fmul v27.2d, v3.2d, v10.2d[0] fmul v28.2d, v0.2d, v11.d[0]
fmul v28.2d, v0.2d, v11.2d[0] ldp q4, q5, [pA], #32
fmul v29.2d, v1.2d, v11.2d[0]
fmul v30.2d, v2.2d, v11.2d[0] fmul v25.2d, v1.2d, v10.d[0]
fmul v31.2d, v3.2d, v11.2d[0] fmul v29.2d, v1.2d, v11.d[0]
ld1 {v4.2d, v5.2d}, [pA] ldp d12, d13, [pB], #16
add pA, pA, #32
ld1 {v6.2d, v7.2d}, [pA] fmul v18.2d, v2.2d, v8.d[0]
add pA, pA, #32 fmul v22.2d, v2.2d, v9.d[0]
ldp d12, d13, [pB]
add pB, pB, #16 ldp d14, d15, [pB], #16
ldp d14, d15, [pB]
add pB, pB, #16 fmul v26.2d, v2.2d, v10.d[0]
fmul v30.2d, v2.2d, v11.d[0]
ldp q6, q7, [pA], #32
fmul v19.2d, v3.2d, v8.d[0]
fmul v27.2d, v3.2d, v10.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmul v31.2d, v3.2d, v11.d[0]
fmul v23.2d, v3.2d, v9.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm .endm
.macro KERNEL8x4_M1 .macro KERNEL8x4_M1
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v21.2d, v1.2d, v9.2d[0] fmla v20.2d, v0.2d, v9.d[0]
fmla v26.2d, v2.2d, v10.2d[0]
fmla v31.2d, v3.2d, v11.2d[0]
ld1 {v4.2d}, [pA], #16 ldp q4, q5, [pA], #32
fmla v20.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v10.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v28.2d, v0.2d, v11.d[0]
ld1 {v5.2d}, [pA], #16 ldp d12, d13, [pB], #16
fmla v30.2d, v2.2d, v11.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v27.2d, v3.2d, v10.2d[0] fmla v25.2d, v1.2d, v10.d[0]
ldp d12, d13, [pB] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
add pB, pB, #16
fmla v28.2d, v0.2d, v11.2d[0] fmla v21.2d, v1.2d, v9.d[0]
fmla v25.2d, v1.2d, v10.2d[0] fmla v29.2d, v1.2d, v11.d[0]
ldp d14, d15, [pB] ldp d14, d15, [pB], #16
add pB, pB, #16
fmla v18.2d, v2.2d, v8.2d[0] fmla v18.2d, v2.2d, v8.d[0]
fmla v23.2d, v3.2d, v9.2d[0] fmla v22.2d, v2.2d, v9.d[0]
ld1 {v6.2d}, [pA], #16 prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla v24.2d, v0.2d, v10.2d[0] fmla v26.2d, v2.2d, v10.d[0]
fmla v29.2d, v1.2d, v11.2d[0] fmla v30.2d, v2.2d, v11.d[0]
fmla v19.2d, v3.2d, v8.d[0]
fmla v23.2d, v3.2d, v9.d[0]
ld1 {v7.2d}, [pA], #16 ldp q6, q7, [pA], #32
fmla v22.2d, v2.2d, v9.2d[0] fmla v27.2d, v3.2d, v10.d[0]
fmla v19.2d, v3.2d, v8.2d[0] fmla v31.2d, v3.2d, v11.d[0]
prfm PLDL1KEEP, [pA, #224]
prfm PLDL1KEEP, [pA, #224+64]
.endm .endm
.macro KERNEL8x4_M2 .macro KERNEL8x4_M2
fmla v16.2d, v4.2d, v12.2d[0] fmla v16.2d, v4.2d, v12.d[0]
fmla v21.2d, v5.2d, v13.2d[0] fmla v20.2d, v4.2d, v13.d[0]
fmla v26.2d, v6.2d, v14.2d[0] fmla v24.2d, v4.2d, v14.d[0]
fmla v31.2d, v7.2d, v15.2d[0] fmla v28.2d, v4.2d, v15.d[0]
ld1 {v0.2d}, [pA], #16 ldp q0, q1, [pA], #32
fmla v20.2d, v4.2d, v13.2d[0] fmla v17.2d, v5.2d, v12.d[0]
fmla v17.2d, v5.2d, v12.2d[0] fmla v25.2d, v5.2d, v14.d[0]
ld1 {v1.2d}, [pA], #16 ldp d8, d9, [pB], #16
fmla v30.2d, v6.2d, v15.2d[0] fmla v21.2d, v5.2d, v13.d[0]
fmla v27.2d, v7.2d, v14.2d[0] fmla v29.2d, v5.2d, v15.d[0]
ldp d8, d9, [pB] ldp d10, d11, [pB], #16
add pB, pB, #16
fmla v28.2d, v4.2d, v15.2d[0] fmla v18.2d, v6.2d, v12.d[0]
fmla v25.2d, v5.2d, v14.2d[0] fmla v22.2d, v6.2d, v13.d[0]
ldp d10, d11, [pB] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
add pB, pB, #16
fmla v22.2d, v6.2d, v13.2d[0] fmla v26.2d, v6.2d, v14.d[0]
fmla v19.2d, v7.2d, v12.2d[0] fmla v30.2d, v6.2d, v15.d[0]
ld1 {v2.2d}, [pA], #16 fmla v19.2d, v7.2d, v12.d[0]
fmla v23.2d, v7.2d, v13.d[0]
fmla v24.2d, v4.2d, v14.2d[0] ldp q2, q3, [pA], #32
fmla v29.2d, v5.2d, v15.2d[0]
ld1 {v3.2d}, [pA], #16 fmla v27.2d, v7.2d, v14.d[0]
fmla v31.2d, v7.2d, v15.d[0]
fmla v18.2d, v6.2d, v12.2d[0]
fmla v23.2d, v7.2d, v13.2d[0]
prfm PLDL1KEEP, [pB, #640]
.endm .endm
.macro KERNEL8x4_E .macro KERNEL8x4_E
fmla v16.2d, v4.2d, v12.2d[0] fmla v16.2d, v4.2d, v12.d[0]
fmla v17.2d, v5.2d, v12.2d[0] fmla v20.2d, v4.2d, v13.d[0]
fmla v18.2d, v6.2d, v12.2d[0] fmla v24.2d, v4.2d, v14.d[0]
fmla v19.2d, v7.2d, v12.2d[0] fmla v28.2d, v4.2d, v15.d[0]
fmla v20.2d, v4.2d, v13.2d[0]
fmla v21.2d, v5.2d, v13.2d[0] fmla v17.2d, v5.2d, v12.d[0]
fmla v22.2d, v6.2d, v13.2d[0] fmla v25.2d, v5.2d, v14.d[0]
fmla v23.2d, v7.2d, v13.2d[0] fmla v21.2d, v5.2d, v13.d[0]
fmla v24.2d, v4.2d, v14.2d[0] fmla v29.2d, v5.2d, v15.d[0]
fmla v25.2d, v5.2d, v14.2d[0]
fmla v26.2d, v6.2d, v14.2d[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla v27.2d, v7.2d, v14.2d[0]
fmla v28.2d, v4.2d, v15.2d[0] fmla v18.2d, v6.2d, v12.d[0]
fmla v29.2d, v5.2d, v15.2d[0] fmla v22.2d, v6.2d, v13.d[0]
fmla v30.2d, v6.2d, v15.2d[0] fmla v26.2d, v6.2d, v14.d[0]
fmla v31.2d, v7.2d, v15.2d[0] fmla v30.2d, v6.2d, v15.d[0]
fmla v19.2d, v7.2d, v12.d[0]
fmla v23.2d, v7.2d, v13.d[0]
fmla v27.2d, v7.2d, v14.d[0]
fmla v31.2d, v7.2d, v15.d[0]
.endm .endm
.macro KERNEL8x4_SUB .macro KERNEL8x4_SUB
ld1 {v0.2d, v1.2d}, [pA] ldp q0, q1, [pA], #32
add pA, pA, #32
ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
ldp d8, d9, [pB]
add pB, pB, #16
ldp d10, d11, [pB]
add pB, pB, #16
fmla v16.2d, v0.2d, v8.2d[0] ldp d8, d9, [pB], #16
fmla v17.2d, v1.2d, v8.2d[0]
fmla v18.2d, v2.2d, v8.2d[0]
fmla v19.2d, v3.2d, v8.2d[0]
fmla v20.2d, v0.2d, v9.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v21.2d, v1.2d, v9.2d[0] fmla v20.2d, v0.2d, v9.d[0]
fmla v22.2d, v2.2d, v9.2d[0]
fmla v23.2d, v3.2d, v9.2d[0]
fmla v24.2d, v0.2d, v10.2d[0] ldp d10, d11, [pB], #16
fmla v25.2d, v1.2d, v10.2d[0]
fmla v26.2d, v2.2d, v10.2d[0]
fmla v27.2d, v3.2d, v10.2d[0]
fmla v28.2d, v0.2d, v11.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v29.2d, v1.2d, v11.2d[0] fmla v21.2d, v1.2d, v9.d[0]
fmla v30.2d, v2.2d, v11.2d[0]
fmla v31.2d, v3.2d, v11.2d[0] ldp q2, q3, [pA], #32
fmla v24.2d, v0.2d, v10.d[0]
fmla v28.2d, v0.2d, v11.d[0]
fmla v25.2d, v1.2d, v10.d[0]
fmla v29.2d, v1.2d, v11.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla v18.2d, v2.2d, v8.d[0]
fmla v22.2d, v2.2d, v9.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
fmla v26.2d, v2.2d, v10.d[0]
fmla v30.2d, v2.2d, v11.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla v19.2d, v3.2d, v8.d[0]
fmla v27.2d, v3.2d, v10.d[0]
fmla v31.2d, v3.2d, v11.d[0]
fmla v23.2d, v3.2d, v9.d[0]
.endm .endm
.macro SAVE8x4 .macro SAVE8x4
fmov alpha0, alpha fmov alpha0, alpha
ld1 {v0.2d, v1.2d}, [pCRow0] prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
ldp q0, q1, [pCRow0]
fmla v0.2d, v16.2d, alphaV0 fmla v0.2d, v16.2d, alphaV0
fmla v1.2d, v17.2d, alphaV0 fmla v1.2d, v17.2d, alphaV0
st1 {v0.2d, v1.2d}, [pCRow0] stp q0, q1, [pCRow0]
add pCRow0, pCRow0, #32 add pCRow0, pCRow0, #32
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
ld1 {v2.2d, v3.2d}, [pCRow0] ldp q2, q3, [pCRow0]
fmla v2.2d, v18.2d, alphaV0 fmla v2.2d, v18.2d, alphaV0
fmla v3.2d, v19.2d, alphaV0 fmla v3.2d, v19.2d, alphaV0
st1 {v2.2d, v3.2d}, [pCRow0] stp q2, q3, [pCRow0]
add pCRow0, pCRow0, #32 add pCRow0, pCRow0, #32
ld1 {v4.2d, v5.2d}, [pCRow1] prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
ldp q4, q5, [pCRow1]
fmla v4.2d, v20.2d, alphaV0 fmla v4.2d, v20.2d, alphaV0
fmla v5.2d, v21.2d, alphaV0 fmla v5.2d, v21.2d, alphaV0
st1 {v4.2d, v5.2d}, [pCRow1] stp q4, q5, [pCRow1]
add pCRow1, pCRow1, #32 add pCRow1, pCRow1, #32
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
ld1 {v6.2d, v7.2d}, [pCRow1] ldp q6, q7, [pCRow1]
fmla v6.2d, v22.2d, alphaV0 fmla v6.2d, v22.2d, alphaV0
fmla v7.2d, v23.2d, alphaV0 fmla v7.2d, v23.2d, alphaV0
st1 {v6.2d, v7.2d}, [pCRow1] stp q6, q7, [pCRow1]
add pCRow1, pCRow1, #32 add pCRow1, pCRow1, #32
ld1 {v0.2d, v1.2d}, [pCRow2] prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
ldp q0, q1, [pCRow2]
fmla v0.2d, v24.2d, alphaV0 fmla v0.2d, v24.2d, alphaV0
fmla v1.2d, v25.2d, alphaV0 fmla v1.2d, v25.2d, alphaV0
st1 {v0.2d, v1.2d}, [pCRow2] stp q0, q1, [pCRow2]
add pCRow2, pCRow2, #32 add pCRow2, pCRow2, #32
ld1 {v2.2d, v3.2d}, [pCRow2] prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
ldp q2, q3, [pCRow2]
fmla v2.2d, v26.2d, alphaV0 fmla v2.2d, v26.2d, alphaV0
fmla v3.2d, v27.2d, alphaV0 fmla v3.2d, v27.2d, alphaV0
st1 {v2.2d, v3.2d}, [pCRow2] stp q2, q3, [pCRow2]
add pCRow2, pCRow2, #32 add pCRow2, pCRow2, #32
ld1 {v4.2d, v5.2d}, [pCRow3] prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
ldp q4, q5, [pCRow3]
fmla v4.2d, v28.2d, alphaV0 fmla v4.2d, v28.2d, alphaV0
fmla v5.2d, v29.2d, alphaV0 fmla v5.2d, v29.2d, alphaV0
st1 {v4.2d, v5.2d}, [pCRow3] stp q4, q5, [pCRow3]
add pCRow3, pCRow3, #32 add pCRow3, pCRow3, #32
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
ld1 {v6.2d, v7.2d}, [pCRow3] ldp q6, q7, [pCRow3]
fmla v6.2d, v30.2d, alphaV0 fmla v6.2d, v30.2d, alphaV0
fmla v7.2d, v31.2d, alphaV0 fmla v7.2d, v31.2d, alphaV0
st1 {v6.2d, v7.2d}, [pCRow3] stp q6, q7, [pCRow3]
add pCRow3, pCRow3, #32 add pCRow3, pCRow3, #32
prfm PLDL2KEEP, [pCRow0, #128]
prfm PLDL2KEEP, [pCRow1, #128]
prfm PLDL2KEEP, [pCRow2, #128]
prfm PLDL2KEEP, [pCRow3, #128]
.endm .endm
/******************************************************************************/ /******************************************************************************/
@ -408,44 +419,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v29.2d, v1.2d, v9.2d[1] fmla v29.2d, v1.2d, v9.d[1]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v25.2d, v1.2d, v9.2d[0] fmla v25.2d, v1.2d, v9.d[0]
fmla v24.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v9.d[0]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
fmla v28.2d, v0.2d, v9.2d[1] fmla v28.2d, v0.2d, v9.d[1]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
fmov alpha0, alpha
ld1 {v8.2d, v9.2d}, [pCRow0] ld1 {v8.2d, v9.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0 fmla v8.2d, v16.2d, alphaV0
fmla v9.2d, v17.2d, alphaV1 fmla v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0] st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
ld1 {v12.2d, v13.2d}, [pCRow1] ld1 {v12.2d, v13.2d}, [pCRow1]
fmla v12.2d, v20.2d, alphaV2 fmla v12.2d, v20.2d, alphaV0
fmla v13.2d, v21.2d, alphaV3 fmla v13.2d, v21.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1] st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
ld1 {v8.2d, v9.2d}, [pCRow2] ld1 {v8.2d, v9.2d}, [pCRow2]
fmla v8.2d, v24.2d, alphaV0 fmla v8.2d, v24.2d, alphaV0
fmla v9.2d, v25.2d, alphaV1 fmla v9.2d, v25.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow2] st1 {v8.2d, v9.2d}, [pCRow2]
add pCRow1, pCRow2, LDC add pCRow1, pCRow2, LDC
ld1 {v12.2d, v13.2d}, [pCRow1] ld1 {v12.2d, v13.2d}, [pCRow1]
fmla v12.2d, v28.2d, alphaV2 fmla v12.2d, v28.2d, alphaV0
fmla v13.2d, v29.2d, alphaV3 fmla v13.2d, v29.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1] st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow0, pCRow0, #32 add pCRow0, pCRow0, #32
@ -467,13 +479,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v24.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v9.d[0]
fmla v28.2d, v0.2d, v9.2d[1] fmla v28.2d, v0.2d, v9.d[1]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
fmov alpha0, alpha
ld1 {v8.2d}, [pCRow0] ld1 {v8.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0 fmla v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0] st1 {v8.2d}, [pCRow0]
@ -481,19 +494,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
ld1 {v12.2d}, [pCRow1] ld1 {v12.2d}, [pCRow1]
fmla v12.2d, v20.2d, alphaV1 fmla v12.2d, v20.2d, alphaV0
st1 {v12.2d}, [pCRow1] st1 {v12.2d}, [pCRow1]
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
ld1 {v8.2d}, [pCRow2] ld1 {v8.2d}, [pCRow2]
fmla v8.2d, v24.2d, alphaV2 fmla v8.2d, v24.2d, alphaV0
st1 {v8.2d}, [pCRow2] st1 {v8.2d}, [pCRow2]
add pCRow1, pCRow2, LDC add pCRow1, pCRow2, LDC
ld1 {v12.2d}, [pCRow1] ld1 {v12.2d}, [pCRow1]
fmla v12.2d, v28.2d, alphaV3 fmla v12.2d, v28.2d, alphaV0
st1 {v12.2d}, [pCRow1] st1 {v12.2d}, [pCRow1]
add pCRow0, pCRow0, #16 add pCRow0, pCRow0, #16
@ -518,6 +531,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro SAVE1x4 .macro SAVE1x4
fmov alpha0, alpha
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
ld1 {v8.d}[0], [pCRow0] ld1 {v8.d}[0], [pCRow0]
@ -531,7 +545,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v12.d}[0], [pCRow2] ld1 {v12.d}[0], [pCRow2]
ld1 {v12.d}[1], [pCRow1] ld1 {v12.d}[1], [pCRow1]
fmla v12.2d, v20.2d, alphaV1 fmla v12.2d, v20.2d, alphaV0
st1 {v12.d}[0], [pCRow2] st1 {v12.d}[0], [pCRow2]
st1 {v12.d}[1], [pCRow1] st1 {v12.d}[1], [pCRow1]
@ -559,32 +573,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v2.2d, v3.2d}, [pA] ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v18.2d, v2.2d, v8.2d[0] fmla v18.2d, v2.2d, v8.d[0]
fmla v19.2d, v3.2d, v8.2d[0] fmla v19.2d, v3.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
fmla v22.2d, v2.2d, v8.2d[1] fmla v22.2d, v2.2d, v8.d[1]
fmla v23.2d, v3.2d, v8.2d[1] fmla v23.2d, v3.2d, v8.d[1]
.endm .endm
.macro SAVE8x2 .macro SAVE8x2
fmov alpha0, alpha
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
fmla v0.2d, v16.2d, alphaV0 fmla v0.2d, v16.2d, alphaV0
fmla v1.2d, v17.2d, alphaV1 fmla v1.2d, v17.2d, alphaV0
fmla v2.2d, v18.2d, alphaV2 fmla v2.2d, v18.2d, alphaV0
fmla v3.2d, v19.2d, alphaV3 fmla v3.2d, v19.2d, alphaV0
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
fmla v4.2d, v20.2d, alphaV0 fmla v4.2d, v20.2d, alphaV0
fmla v5.2d, v21.2d, alphaV1 fmla v5.2d, v21.2d, alphaV0
fmla v6.2d, v22.2d, alphaV2 fmla v6.2d, v22.2d, alphaV0
fmla v7.2d, v23.2d, alphaV3 fmla v7.2d, v23.2d, alphaV0
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
add pCRow0, pCRow0, #64 add pCRow0, pCRow0, #64
@ -605,23 +620,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
fmov alpha0, alpha
ld1 {v8.2d, v9.2d}, [pCRow0] ld1 {v8.2d, v9.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0 fmla v8.2d, v16.2d, alphaV0
fmla v9.2d, v17.2d, alphaV1 fmla v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0] st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
ld1 {v12.2d, v13.2d}, [pCRow1] ld1 {v12.2d, v13.2d}, [pCRow1]
fmla v12.2d, v20.2d, alphaV2 fmla v12.2d, v20.2d, alphaV0
fmla v13.2d, v21.2d, alphaV3 fmla v13.2d, v21.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1] st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow0, pCRow0, #32 add pCRow0, pCRow0, #32
@ -641,11 +657,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
fmov alpha0, alpha
ld1 {v8.2d}, [pCRow0] ld1 {v8.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0 fmla v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0] st1 {v8.2d}, [pCRow0]
@ -653,7 +670,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add pCRow1 , pCRow0, LDC add pCRow1 , pCRow0, LDC
ld1 {v12.2d}, [pCRow1] ld1 {v12.2d}, [pCRow1]
fmla v12.2d, v20.2d, alphaV1 fmla v12.2d, v20.2d, alphaV0
st1 {v12.2d}, [pCRow1] st1 {v12.2d}, [pCRow1]
add pCRow0, pCRow0, #16 add pCRow0, pCRow0, #16
@ -672,10 +689,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr d0 , [pA] ldr d0 , [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2d, v8.2d, v0.2d[0] fmla v16.2d, v8.2d, v0.d[0]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2
fmov alpha0, alpha
add pCRow1 , pCRow0, LDC add pCRow1 , pCRow0, LDC
ld1 {v8.d}[0], [pCRow0] ld1 {v8.d}[0], [pCRow0]
@ -706,18 +724,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v2.2d, v3.2d}, [pA] ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v18.2d, v2.2d, v8.2d[0] fmla v18.2d, v2.2d, v8.d[0]
fmla v19.2d, v3.2d, v8.2d[0] fmla v19.2d, v3.2d, v8.d[0]
.endm .endm
.macro SAVE8x1 .macro SAVE8x1
fmov alpha0, alpha
ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
fmla v0.2d, v16.2d, alphaV0 fmla v0.2d, v16.2d, alphaV0
fmla v1.2d, v17.2d, alphaV1 fmla v1.2d, v17.2d, alphaV0
fmla v2.2d, v18.2d, alphaV2 fmla v2.2d, v18.2d, alphaV0
fmla v3.2d, v19.2d, alphaV3 fmla v3.2d, v19.2d, alphaV0
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
add pCRow0, pCRow0, #64 add pCRow0, pCRow0, #64
@ -738,14 +757,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA , pA, #32 add pA , pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
.endm .endm
.macro SAVE4x1 .macro SAVE4x1
fmov alpha0, alpha
ld1 {v8.2d, v9.2d}, [pCRow0] ld1 {v8.2d, v9.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0 fmla v8.2d, v16.2d, alphaV0
fmla v9.2d, v17.2d, alphaV1 fmla v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0] st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow0, pCRow0, #32 add pCRow0, pCRow0, #32
@ -765,10 +785,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA , pA, #16 add pA , pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
.endm .endm
.macro SAVE2x1 .macro SAVE2x1
fmov alpha0, alpha
ld1 {v8.2d}, [pCRow0] ld1 {v8.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0 fmla v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0] st1 {v8.2d}, [pCRow0]
@ -793,6 +814,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro SAVE1x1 .macro SAVE1x1
fmov alpha0, alpha
ldr d8, [pCRow0] ldr d8, [pCRow0]
fmadd d8, d16, alpha0, d8 fmadd d8, d16, alpha0, d8
str d8, [pCRow0] str d8, [pCRow0]
@ -820,6 +842,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp x26, x27, [sp, #(9 * 16)] stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)] str x28, [sp, #(10 * 16)]
prfm PLDL1KEEP, [origPB]
prfm PLDL1KEEP, [origPA]
fmov alpha, d0 fmov alpha, d0
lsl LDC, LDC, #3 // ldc = ldc * 8 lsl LDC, LDC, #3 // ldc = ldc * 8
@ -838,6 +863,7 @@ dgemm_kernel_L4_BEGIN:
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
add pCRow3, pCRow2, LDC add pCRow3, pCRow2, LDC
add pC, pCRow3, LDC add pC, pCRow3, LDC
mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array
@ -849,6 +875,7 @@ dgemm_kernel_L4_M8_BEGIN:
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L4_M4_BEGIN ble dgemm_kernel_L4_M4_BEGIN
.align 5
dgemm_kernel_L4_M8_20: dgemm_kernel_L4_M8_20:
mov pB, origPB mov pB, origPB
@ -868,8 +895,8 @@ dgemm_kernel_L4_M8_20:
subs counterL, counterL, #2 // subtract 2 subs counterL, counterL, #2 // subtract 2
ble dgemm_kernel_L4_M8_22a ble dgemm_kernel_L4_M8_22a
.align 5
.align 5
dgemm_kernel_L4_M8_22: dgemm_kernel_L4_M8_22:
KERNEL8x4_M1 KERNEL8x4_M1
@ -884,7 +911,7 @@ dgemm_kernel_L4_M8_22:
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M8_22 bgt dgemm_kernel_L4_M8_22
.align 5
dgemm_kernel_L4_M8_22a: dgemm_kernel_L4_M8_22a:
KERNEL8x4_M1 KERNEL8x4_M1
@ -898,6 +925,7 @@ dgemm_kernel_L4_M8_22a:
b dgemm_kernel_L4_M8_44 b dgemm_kernel_L4_M8_44
.align 5
dgemm_kernel_L4_M8_32: dgemm_kernel_L4_M8_32:
tst counterL, #1 tst counterL, #1
@ -923,6 +951,7 @@ dgemm_kernel_L4_M8_44:
ands counterL , origK, #7 ands counterL , origK, #7
ble dgemm_kernel_L4_M8_100 ble dgemm_kernel_L4_M8_100
.align 5
dgemm_kernel_L4_M8_46: dgemm_kernel_L4_M8_46:
KERNEL8x4_SUB KERNEL8x4_SUB
@ -931,6 +960,9 @@ dgemm_kernel_L4_M8_46:
bne dgemm_kernel_L4_M8_46 bne dgemm_kernel_L4_M8_46
dgemm_kernel_L4_M8_100: dgemm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVE8x4 SAVE8x4

View File

@ -147,17 +147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmul v16.2d, v0.2d, v8.2d[0] fmul v16.2d, v0.2d, v8.d[0]
fmul v29.2d, v1.2d, v9.2d[1] fmul v29.2d, v1.2d, v9.d[1]
fmul v20.2d, v0.2d, v8.2d[1] fmul v20.2d, v0.2d, v8.d[1]
fmul v25.2d, v1.2d, v9.2d[0] fmul v25.2d, v1.2d, v9.d[0]
fmul v24.2d, v0.2d, v9.2d[0] fmul v24.2d, v0.2d, v9.d[0]
fmul v21.2d, v1.2d, v8.2d[1] fmul v21.2d, v1.2d, v8.d[1]
fmul v28.2d, v0.2d, v9.2d[1] fmul v28.2d, v0.2d, v9.d[1]
fmul v17.2d, v1.2d, v8.2d[0] fmul v17.2d, v1.2d, v8.d[0]
ld1 {v12.2d, v13.2d}, [pB] ld1 {v12.2d, v13.2d}, [pB]
add pB, pB, #32 add pB, pB, #32
@ -166,61 +166,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x4_M1 .macro KERNEL4x4_M1
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v29.2d, v1.2d, v9.2d[1] fmla v29.2d, v1.2d, v9.d[1]
ld1 {v12.2d, v13.2d}, [pB] // For next round ld1 {v12.2d, v13.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v25.2d, v1.2d, v9.2d[0] fmla v25.2d, v1.2d, v9.d[0]
ld1 {v4.2d, v5.2d}, [pA] // For next round ld1 {v4.2d, v5.2d}, [pA] // For next round
add pA, pA, #32 add pA, pA, #32
fmla v24.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v9.d[0]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
prfm PLDL1KEEP, [pA, #512] prfm PLDL1KEEP, [pA, #512]
fmla v28.2d, v0.2d, v9.2d[1] fmla v28.2d, v0.2d, v9.d[1]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
.endm .endm
.macro KERNEL4x4_M2 .macro KERNEL4x4_M2
fmla v16.2d, v4.2d, v12.2d[0] fmla v16.2d, v4.2d, v12.d[0]
fmla v29.2d, v5.2d, v13.2d[1] fmla v29.2d, v5.2d, v13.d[1]
ld1 {v8.2d, v9.2d}, [pB] // For next round ld1 {v8.2d, v9.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
fmla v20.2d, v4.2d, v12.2d[1] fmla v20.2d, v4.2d, v12.d[1]
fmla v25.2d, v5.2d, v13.2d[0] fmla v25.2d, v5.2d, v13.d[0]
ld1 {v0.2d, v1.2d}, [pA] // For next round ld1 {v0.2d, v1.2d}, [pA] // For next round
add pA, pA, #32 add pA, pA, #32
fmla v24.2d, v4.2d, v13.2d[0] fmla v24.2d, v4.2d, v13.d[0]
fmla v21.2d, v5.2d, v12.2d[1] fmla v21.2d, v5.2d, v12.d[1]
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
fmla v28.2d, v4.2d, v13.2d[1] fmla v28.2d, v4.2d, v13.d[1]
fmla v17.2d, v5.2d, v12.2d[0] fmla v17.2d, v5.2d, v12.d[0]
.endm .endm
.macro KERNEL4x4_E .macro KERNEL4x4_E
fmla v16.2d, v4.2d, v12.2d[0] fmla v16.2d, v4.2d, v12.d[0]
fmla v29.2d, v5.2d, v13.2d[1] fmla v29.2d, v5.2d, v13.d[1]
fmla v20.2d, v4.2d, v12.2d[1] fmla v20.2d, v4.2d, v12.d[1]
fmla v25.2d, v5.2d, v13.2d[0] fmla v25.2d, v5.2d, v13.d[0]
fmla v24.2d, v4.2d, v13.2d[0] fmla v24.2d, v4.2d, v13.d[0]
fmla v21.2d, v5.2d, v12.2d[1] fmla v21.2d, v5.2d, v12.d[1]
fmla v28.2d, v4.2d, v13.2d[1] fmla v28.2d, v4.2d, v13.d[1]
fmla v17.2d, v5.2d, v12.2d[0] fmla v17.2d, v5.2d, v12.d[0]
.endm .endm
.macro KERNEL4x4_SUB .macro KERNEL4x4_SUB
@ -229,17 +229,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v29.2d, v1.2d, v9.2d[1] fmla v29.2d, v1.2d, v9.d[1]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v25.2d, v1.2d, v9.2d[0] fmla v25.2d, v1.2d, v9.d[0]
fmla v24.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v9.d[0]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
fmla v28.2d, v0.2d, v9.2d[1] fmla v28.2d, v0.2d, v9.d[1]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
@ -283,10 +283,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v24.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v9.d[0]
fmla v28.2d, v0.2d, v9.2d[1] fmla v28.2d, v0.2d, v9.d[1]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
@ -361,10 +361,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
@ -395,8 +395,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
@ -424,7 +424,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr d0 , [pA] ldr d0 , [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2d, v8.2d, v0.2d[0] fmla v16.2d, v8.2d, v0.d[0]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2
@ -451,8 +451,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA , pA, #32 add pA , pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
.endm .endm
.macro SAVE4x1 .macro SAVE4x1
@ -479,7 +479,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA , pA, #16 add pA , pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
.endm .endm
.macro SAVE2x1 .macro SAVE2x1

284
kernel/arm64/dtrmm_kernel_4x8.S Executable file → Normal file
View File

@ -157,25 +157,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v10.2d, v11.2d}, [pB] ld1 {v10.2d, v11.2d}, [pB]
add pB, pB, #32 add pB, pB, #32
fmul v16.2d, v0.2d, v8.2d[0] fmul v16.2d, v0.2d, v8.d[0]
fmul v17.2d, v1.2d, v8.2d[0] fmul v17.2d, v1.2d, v8.d[0]
fmul v18.2d, v0.2d, v8.2d[1] fmul v18.2d, v0.2d, v8.d[1]
fmul v19.2d, v1.2d, v8.2d[1] fmul v19.2d, v1.2d, v8.d[1]
fmul v20.2d, v0.2d, v9.2d[0] fmul v20.2d, v0.2d, v9.d[0]
fmul v21.2d, v1.2d, v9.2d[0] fmul v21.2d, v1.2d, v9.d[0]
fmul v22.2d, v0.2d, v9.2d[1] fmul v22.2d, v0.2d, v9.d[1]
fmul v23.2d, v1.2d, v9.2d[1] fmul v23.2d, v1.2d, v9.d[1]
fmul v24.2d, v0.2d, v10.2d[0] fmul v24.2d, v0.2d, v10.d[0]
fmul v25.2d, v1.2d, v10.2d[0] fmul v25.2d, v1.2d, v10.d[0]
fmul v26.2d, v0.2d, v10.2d[1] fmul v26.2d, v0.2d, v10.d[1]
fmul v27.2d, v1.2d, v10.2d[1] fmul v27.2d, v1.2d, v10.d[1]
fmul v28.2d, v0.2d, v11.2d[0] fmul v28.2d, v0.2d, v11.d[0]
fmul v29.2d, v1.2d, v11.2d[0] fmul v29.2d, v1.2d, v11.d[0]
fmul v30.2d, v0.2d, v11.2d[1] fmul v30.2d, v0.2d, v11.d[1]
fmul v31.2d, v1.2d, v11.2d[1] fmul v31.2d, v1.2d, v11.d[1]
ld1 {v12.2d, v13.2d}, [pB] ld1 {v12.2d, v13.2d}, [pB]
add pB, pB, #32 add pB, pB, #32
@ -186,25 +186,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x8_M1 .macro KERNEL4x8_M1
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v18.2d, v0.2d, v8.2d[1] fmla v18.2d, v0.2d, v8.d[1]
fmla v19.2d, v1.2d, v8.2d[1] fmla v19.2d, v1.2d, v8.d[1]
fmla v20.2d, v0.2d, v9.2d[0] fmla v20.2d, v0.2d, v9.d[0]
fmla v21.2d, v1.2d, v9.2d[0] fmla v21.2d, v1.2d, v9.d[0]
fmla v22.2d, v0.2d, v9.2d[1] fmla v22.2d, v0.2d, v9.d[1]
fmla v23.2d, v1.2d, v9.2d[1] fmla v23.2d, v1.2d, v9.d[1]
fmla v24.2d, v0.2d, v10.2d[0] fmla v24.2d, v0.2d, v10.d[0]
fmla v25.2d, v1.2d, v10.2d[0] fmla v25.2d, v1.2d, v10.d[0]
fmla v26.2d, v0.2d, v10.2d[1] fmla v26.2d, v0.2d, v10.d[1]
fmla v27.2d, v1.2d, v10.2d[1] fmla v27.2d, v1.2d, v10.d[1]
fmla v28.2d, v0.2d, v11.2d[0] fmla v28.2d, v0.2d, v11.d[0]
fmla v29.2d, v1.2d, v11.2d[0] fmla v29.2d, v1.2d, v11.d[0]
fmla v30.2d, v0.2d, v11.2d[1] fmla v30.2d, v0.2d, v11.d[1]
fmla v31.2d, v1.2d, v11.2d[1] fmla v31.2d, v1.2d, v11.d[1]
ld1 {v12.2d, v13.2d}, [pB] // For next round ld1 {v12.2d, v13.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
@ -217,25 +217,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x8_M2 .macro KERNEL4x8_M2
fmla v16.2d, v4.2d, v12.2d[0] fmla v16.2d, v4.2d, v12.d[0]
fmla v17.2d, v5.2d, v12.2d[0] fmla v17.2d, v5.2d, v12.d[0]
fmla v18.2d, v4.2d, v12.2d[1] fmla v18.2d, v4.2d, v12.d[1]
fmla v19.2d, v5.2d, v12.2d[1] fmla v19.2d, v5.2d, v12.d[1]
fmla v20.2d, v4.2d, v13.2d[0] fmla v20.2d, v4.2d, v13.d[0]
fmla v21.2d, v5.2d, v13.2d[0] fmla v21.2d, v5.2d, v13.d[0]
fmla v22.2d, v4.2d, v13.2d[1] fmla v22.2d, v4.2d, v13.d[1]
fmla v23.2d, v5.2d, v13.2d[1] fmla v23.2d, v5.2d, v13.d[1]
fmla v24.2d, v4.2d, v14.2d[0] fmla v24.2d, v4.2d, v14.d[0]
fmla v25.2d, v5.2d, v14.2d[0] fmla v25.2d, v5.2d, v14.d[0]
fmla v26.2d, v4.2d, v14.2d[1] fmla v26.2d, v4.2d, v14.d[1]
fmla v27.2d, v5.2d, v14.2d[1] fmla v27.2d, v5.2d, v14.d[1]
fmla v28.2d, v4.2d, v15.2d[0] fmla v28.2d, v4.2d, v15.d[0]
fmla v29.2d, v5.2d, v15.2d[0] fmla v29.2d, v5.2d, v15.d[0]
fmla v30.2d, v4.2d, v15.2d[1] fmla v30.2d, v4.2d, v15.d[1]
fmla v31.2d, v5.2d, v15.2d[1] fmla v31.2d, v5.2d, v15.d[1]
ld1 {v8.2d, v9.2d}, [pB] // For next round ld1 {v8.2d, v9.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
@ -248,25 +248,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x8_E .macro KERNEL4x8_E
fmla v16.2d, v4.2d, v12.2d[0] fmla v16.2d, v4.2d, v12.d[0]
fmla v17.2d, v5.2d, v12.2d[0] fmla v17.2d, v5.2d, v12.d[0]
fmla v18.2d, v4.2d, v12.2d[1] fmla v18.2d, v4.2d, v12.d[1]
fmla v19.2d, v5.2d, v12.2d[1] fmla v19.2d, v5.2d, v12.d[1]
fmla v20.2d, v4.2d, v13.2d[0] fmla v20.2d, v4.2d, v13.d[0]
fmla v21.2d, v5.2d, v13.2d[0] fmla v21.2d, v5.2d, v13.d[0]
fmla v22.2d, v4.2d, v13.2d[1] fmla v22.2d, v4.2d, v13.d[1]
fmla v23.2d, v5.2d, v13.2d[1] fmla v23.2d, v5.2d, v13.d[1]
fmla v24.2d, v4.2d, v14.2d[0] fmla v24.2d, v4.2d, v14.d[0]
fmla v25.2d, v5.2d, v14.2d[0] fmla v25.2d, v5.2d, v14.d[0]
fmla v26.2d, v4.2d, v14.2d[1] fmla v26.2d, v4.2d, v14.d[1]
fmla v27.2d, v5.2d, v14.2d[1] fmla v27.2d, v5.2d, v14.d[1]
fmla v28.2d, v4.2d, v15.2d[0] fmla v28.2d, v4.2d, v15.d[0]
fmla v29.2d, v5.2d, v15.2d[0] fmla v29.2d, v5.2d, v15.d[0]
fmla v30.2d, v4.2d, v15.2d[1] fmla v30.2d, v4.2d, v15.d[1]
fmla v31.2d, v5.2d, v15.2d[1] fmla v31.2d, v5.2d, v15.d[1]
.endm .endm
.macro KERNEL4x8_SUB .macro KERNEL4x8_SUB
@ -277,25 +277,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v10.2d, v11.2d}, [pB] ld1 {v10.2d, v11.2d}, [pB]
add pB, pB, #32 add pB, pB, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v18.2d, v0.2d, v8.2d[1] fmla v18.2d, v0.2d, v8.d[1]
fmla v19.2d, v1.2d, v8.2d[1] fmla v19.2d, v1.2d, v8.d[1]
fmla v20.2d, v0.2d, v9.2d[0] fmla v20.2d, v0.2d, v9.d[0]
fmla v21.2d, v1.2d, v9.2d[0] fmla v21.2d, v1.2d, v9.d[0]
fmla v22.2d, v0.2d, v9.2d[1] fmla v22.2d, v0.2d, v9.d[1]
fmla v23.2d, v1.2d, v9.2d[1] fmla v23.2d, v1.2d, v9.d[1]
fmla v24.2d, v0.2d, v10.2d[0] fmla v24.2d, v0.2d, v10.d[0]
fmla v25.2d, v1.2d, v10.2d[0] fmla v25.2d, v1.2d, v10.d[0]
fmla v26.2d, v0.2d, v10.2d[1] fmla v26.2d, v0.2d, v10.d[1]
fmla v27.2d, v1.2d, v10.2d[1] fmla v27.2d, v1.2d, v10.d[1]
fmla v28.2d, v0.2d, v11.2d[0] fmla v28.2d, v0.2d, v11.d[0]
fmla v29.2d, v1.2d, v11.2d[0] fmla v29.2d, v1.2d, v11.d[0]
fmla v30.2d, v0.2d, v11.2d[1] fmla v30.2d, v0.2d, v11.d[1]
fmla v31.2d, v1.2d, v11.2d[1] fmla v31.2d, v1.2d, v11.d[1]
.endm .endm
.macro SAVE4x8 .macro SAVE4x8
@ -369,17 +369,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v10.2d, v11.2d}, [pB] ld1 {v10.2d, v11.2d}, [pB]
add pB, pB, #32 add pB, pB, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v18.2d, v0.2d, v8.2d[1] fmla v18.2d, v0.2d, v8.d[1]
fmla v20.2d, v0.2d, v9.2d[0] fmla v20.2d, v0.2d, v9.d[0]
fmla v22.2d, v0.2d, v9.2d[1] fmla v22.2d, v0.2d, v9.d[1]
fmla v24.2d, v0.2d, v10.2d[0] fmla v24.2d, v0.2d, v10.d[0]
fmla v26.2d, v0.2d, v10.2d[1] fmla v26.2d, v0.2d, v10.d[1]
fmla v28.2d, v0.2d, v11.2d[0] fmla v28.2d, v0.2d, v11.d[0]
fmla v30.2d, v0.2d, v11.2d[1] fmla v30.2d, v0.2d, v11.d[1]
.endm .endm
.macro SAVE2x8 .macro SAVE2x8
@ -499,17 +499,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmul v16.2d, v0.2d, v8.2d[0] fmul v16.2d, v0.2d, v8.d[0]
fmul v29.2d, v1.2d, v9.2d[1] fmul v29.2d, v1.2d, v9.d[1]
fmul v20.2d, v0.2d, v8.2d[1] fmul v20.2d, v0.2d, v8.d[1]
fmul v25.2d, v1.2d, v9.2d[0] fmul v25.2d, v1.2d, v9.d[0]
fmul v24.2d, v0.2d, v9.2d[0] fmul v24.2d, v0.2d, v9.d[0]
fmul v21.2d, v1.2d, v8.2d[1] fmul v21.2d, v1.2d, v8.d[1]
fmul v28.2d, v0.2d, v9.2d[1] fmul v28.2d, v0.2d, v9.d[1]
fmul v17.2d, v1.2d, v8.2d[0] fmul v17.2d, v1.2d, v8.d[0]
ld1 {v12.2d, v13.2d}, [pB] ld1 {v12.2d, v13.2d}, [pB]
add pB, pB, #32 add pB, pB, #32
@ -518,61 +518,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x4_M1 .macro KERNEL4x4_M1
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v29.2d, v1.2d, v9.2d[1] fmla v29.2d, v1.2d, v9.d[1]
ld1 {v12.2d, v13.2d}, [pB] // For next round ld1 {v12.2d, v13.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v25.2d, v1.2d, v9.2d[0] fmla v25.2d, v1.2d, v9.d[0]
ld1 {v4.2d, v5.2d}, [pA] // For next round ld1 {v4.2d, v5.2d}, [pA] // For next round
add pA, pA, #32 add pA, pA, #32
fmla v24.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v9.d[0]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
prfm PLDL1KEEP, [pA, #512] prfm PLDL1KEEP, [pA, #512]
fmla v28.2d, v0.2d, v9.2d[1] fmla v28.2d, v0.2d, v9.d[1]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
.endm .endm
.macro KERNEL4x4_M2 .macro KERNEL4x4_M2
fmla v16.2d, v4.2d, v12.2d[0] fmla v16.2d, v4.2d, v12.d[0]
fmla v29.2d, v5.2d, v13.2d[1] fmla v29.2d, v5.2d, v13.d[1]
ld1 {v8.2d, v9.2d}, [pB] // For next round ld1 {v8.2d, v9.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
fmla v20.2d, v4.2d, v12.2d[1] fmla v20.2d, v4.2d, v12.d[1]
fmla v25.2d, v5.2d, v13.2d[0] fmla v25.2d, v5.2d, v13.d[0]
ld1 {v0.2d, v1.2d}, [pA] // For next round ld1 {v0.2d, v1.2d}, [pA] // For next round
add pA, pA, #32 add pA, pA, #32
fmla v24.2d, v4.2d, v13.2d[0] fmla v24.2d, v4.2d, v13.d[0]
fmla v21.2d, v5.2d, v12.2d[1] fmla v21.2d, v5.2d, v12.d[1]
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
fmla v28.2d, v4.2d, v13.2d[1] fmla v28.2d, v4.2d, v13.d[1]
fmla v17.2d, v5.2d, v12.2d[0] fmla v17.2d, v5.2d, v12.d[0]
.endm .endm
.macro KERNEL4x4_E .macro KERNEL4x4_E
fmla v16.2d, v4.2d, v12.2d[0] fmla v16.2d, v4.2d, v12.d[0]
fmla v29.2d, v5.2d, v13.2d[1] fmla v29.2d, v5.2d, v13.d[1]
fmla v20.2d, v4.2d, v12.2d[1] fmla v20.2d, v4.2d, v12.d[1]
fmla v25.2d, v5.2d, v13.2d[0] fmla v25.2d, v5.2d, v13.d[0]
fmla v24.2d, v4.2d, v13.2d[0] fmla v24.2d, v4.2d, v13.d[0]
fmla v21.2d, v5.2d, v12.2d[1] fmla v21.2d, v5.2d, v12.d[1]
fmla v28.2d, v4.2d, v13.2d[1] fmla v28.2d, v4.2d, v13.d[1]
fmla v17.2d, v5.2d, v12.2d[0] fmla v17.2d, v5.2d, v12.d[0]
.endm .endm
.macro KERNEL4x4_SUB .macro KERNEL4x4_SUB
@ -581,17 +581,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v29.2d, v1.2d, v9.2d[1] fmla v29.2d, v1.2d, v9.d[1]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v25.2d, v1.2d, v9.2d[0] fmla v25.2d, v1.2d, v9.d[0]
fmla v24.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v9.d[0]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
fmla v28.2d, v0.2d, v9.2d[1] fmla v28.2d, v0.2d, v9.d[1]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
@ -635,10 +635,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v24.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v9.d[0]
fmla v28.2d, v0.2d, v9.2d[1] fmla v28.2d, v0.2d, v9.d[1]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
@ -713,10 +713,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
@ -747,8 +747,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
@ -776,7 +776,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr d0 , [pA] ldr d0 , [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2d, v8.2d, v0.2d[0] fmla v16.2d, v8.2d, v0.d[0]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2
@ -803,8 +803,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA , pA, #32 add pA , pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
.endm .endm
.macro SAVE4x1 .macro SAVE4x1
@ -831,7 +831,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA , pA, #16 add pA , pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
.endm .endm
.macro SAVE2x1 .macro SAVE2x1

228
kernel/arm64/dtrmm_kernel_8x4.S Executable file → Normal file
View File

@ -157,25 +157,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v2.2d, v3.2d}, [pA] ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmul v16.2d, v0.2d, v8.2d[0] fmul v16.2d, v0.2d, v8.d[0]
fmul v17.2d, v1.2d, v8.2d[0] fmul v17.2d, v1.2d, v8.d[0]
fmul v18.2d, v2.2d, v8.2d[0] fmul v18.2d, v2.2d, v8.d[0]
fmul v19.2d, v3.2d, v8.2d[0] fmul v19.2d, v3.2d, v8.d[0]
fmul v20.2d, v0.2d, v8.2d[1] fmul v20.2d, v0.2d, v8.d[1]
fmul v21.2d, v1.2d, v8.2d[1] fmul v21.2d, v1.2d, v8.d[1]
fmul v22.2d, v2.2d, v8.2d[1] fmul v22.2d, v2.2d, v8.d[1]
fmul v23.2d, v3.2d, v8.2d[1] fmul v23.2d, v3.2d, v8.d[1]
fmul v24.2d, v0.2d, v9.2d[0] fmul v24.2d, v0.2d, v9.d[0]
fmul v25.2d, v1.2d, v9.2d[0] fmul v25.2d, v1.2d, v9.d[0]
fmul v26.2d, v2.2d, v9.2d[0] fmul v26.2d, v2.2d, v9.d[0]
fmul v27.2d, v3.2d, v9.2d[0] fmul v27.2d, v3.2d, v9.d[0]
fmul v28.2d, v0.2d, v9.2d[1] fmul v28.2d, v0.2d, v9.d[1]
fmul v29.2d, v1.2d, v9.2d[1] fmul v29.2d, v1.2d, v9.d[1]
fmul v30.2d, v2.2d, v9.2d[1] fmul v30.2d, v2.2d, v9.d[1]
fmul v31.2d, v3.2d, v9.2d[1] fmul v31.2d, v3.2d, v9.d[1]
ld1 {v4.2d, v5.2d}, [pA] ld1 {v4.2d, v5.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
@ -186,25 +186,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_M1 .macro KERNEL8x4_M1
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v18.2d, v2.2d, v8.2d[0] fmla v18.2d, v2.2d, v8.d[0]
fmla v19.2d, v3.2d, v8.2d[0] fmla v19.2d, v3.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
fmla v22.2d, v2.2d, v8.2d[1] fmla v22.2d, v2.2d, v8.d[1]
fmla v23.2d, v3.2d, v8.2d[1] fmla v23.2d, v3.2d, v8.d[1]
fmla v24.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v9.d[0]
fmla v25.2d, v1.2d, v9.2d[0] fmla v25.2d, v1.2d, v9.d[0]
fmla v26.2d, v2.2d, v9.2d[0] fmla v26.2d, v2.2d, v9.d[0]
fmla v27.2d, v3.2d, v9.2d[0] fmla v27.2d, v3.2d, v9.d[0]
fmla v28.2d, v0.2d, v9.2d[1] fmla v28.2d, v0.2d, v9.d[1]
fmla v29.2d, v1.2d, v9.2d[1] fmla v29.2d, v1.2d, v9.d[1]
fmla v30.2d, v2.2d, v9.2d[1] fmla v30.2d, v2.2d, v9.d[1]
fmla v31.2d, v3.2d, v9.2d[1] fmla v31.2d, v3.2d, v9.d[1]
ld1 {v4.2d, v5.2d}, [pA] ld1 {v4.2d, v5.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
@ -217,25 +217,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_M2 .macro KERNEL8x4_M2
fmla v16.2d, v4.2d, v12.2d[0] fmla v16.2d, v4.2d, v12.d[0]
fmla v17.2d, v5.2d, v12.2d[0] fmla v17.2d, v5.2d, v12.d[0]
fmla v18.2d, v6.2d, v12.2d[0] fmla v18.2d, v6.2d, v12.d[0]
fmla v19.2d, v7.2d, v12.2d[0] fmla v19.2d, v7.2d, v12.d[0]
fmla v20.2d, v4.2d, v12.2d[1] fmla v20.2d, v4.2d, v12.d[1]
fmla v21.2d, v5.2d, v12.2d[1] fmla v21.2d, v5.2d, v12.d[1]
fmla v22.2d, v6.2d, v12.2d[1] fmla v22.2d, v6.2d, v12.d[1]
fmla v23.2d, v7.2d, v12.2d[1] fmla v23.2d, v7.2d, v12.d[1]
fmla v24.2d, v4.2d, v13.2d[0] fmla v24.2d, v4.2d, v13.d[0]
fmla v25.2d, v5.2d, v13.2d[0] fmla v25.2d, v5.2d, v13.d[0]
fmla v26.2d, v6.2d, v13.2d[0] fmla v26.2d, v6.2d, v13.d[0]
fmla v27.2d, v7.2d, v13.2d[0] fmla v27.2d, v7.2d, v13.d[0]
fmla v28.2d, v4.2d, v13.2d[1] fmla v28.2d, v4.2d, v13.d[1]
fmla v29.2d, v5.2d, v13.2d[1] fmla v29.2d, v5.2d, v13.d[1]
fmla v30.2d, v6.2d, v13.2d[1] fmla v30.2d, v6.2d, v13.d[1]
fmla v31.2d, v7.2d, v13.2d[1] fmla v31.2d, v7.2d, v13.d[1]
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
@ -248,25 +248,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_E .macro KERNEL8x4_E
fmla v16.2d, v4.2d, v12.2d[0] fmla v16.2d, v4.2d, v12.d[0]
fmla v17.2d, v5.2d, v12.2d[0] fmla v17.2d, v5.2d, v12.d[0]
fmla v18.2d, v6.2d, v12.2d[0] fmla v18.2d, v6.2d, v12.d[0]
fmla v19.2d, v7.2d, v12.2d[0] fmla v19.2d, v7.2d, v12.d[0]
fmla v20.2d, v4.2d, v12.2d[1] fmla v20.2d, v4.2d, v12.d[1]
fmla v21.2d, v5.2d, v12.2d[1] fmla v21.2d, v5.2d, v12.d[1]
fmla v22.2d, v6.2d, v12.2d[1] fmla v22.2d, v6.2d, v12.d[1]
fmla v23.2d, v7.2d, v12.2d[1] fmla v23.2d, v7.2d, v12.d[1]
fmla v24.2d, v4.2d, v13.2d[0] fmla v24.2d, v4.2d, v13.d[0]
fmla v25.2d, v5.2d, v13.2d[0] fmla v25.2d, v5.2d, v13.d[0]
fmla v26.2d, v6.2d, v13.2d[0] fmla v26.2d, v6.2d, v13.d[0]
fmla v27.2d, v7.2d, v13.2d[0] fmla v27.2d, v7.2d, v13.d[0]
fmla v28.2d, v4.2d, v13.2d[1] fmla v28.2d, v4.2d, v13.d[1]
fmla v29.2d, v5.2d, v13.2d[1] fmla v29.2d, v5.2d, v13.d[1]
fmla v30.2d, v6.2d, v13.2d[1] fmla v30.2d, v6.2d, v13.d[1]
fmla v31.2d, v7.2d, v13.2d[1] fmla v31.2d, v7.2d, v13.d[1]
.endm .endm
.macro KERNEL8x4_SUB .macro KERNEL8x4_SUB
@ -277,25 +277,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v2.2d, v3.2d}, [pA] ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v18.2d, v2.2d, v8.2d[0] fmla v18.2d, v2.2d, v8.d[0]
fmla v19.2d, v3.2d, v8.2d[0] fmla v19.2d, v3.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
fmla v22.2d, v2.2d, v8.2d[1] fmla v22.2d, v2.2d, v8.d[1]
fmla v23.2d, v3.2d, v8.2d[1] fmla v23.2d, v3.2d, v8.d[1]
fmla v24.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v9.d[0]
fmla v25.2d, v1.2d, v9.2d[0] fmla v25.2d, v1.2d, v9.d[0]
fmla v26.2d, v2.2d, v9.2d[0] fmla v26.2d, v2.2d, v9.d[0]
fmla v27.2d, v3.2d, v9.2d[0] fmla v27.2d, v3.2d, v9.d[0]
fmla v28.2d, v0.2d, v9.2d[1] fmla v28.2d, v0.2d, v9.d[1]
fmla v29.2d, v1.2d, v9.2d[1] fmla v29.2d, v1.2d, v9.d[1]
fmla v30.2d, v2.2d, v9.2d[1] fmla v30.2d, v2.2d, v9.d[1]
fmla v31.2d, v3.2d, v9.2d[1] fmla v31.2d, v3.2d, v9.d[1]
.endm .endm
.macro SAVE8x4 .macro SAVE8x4
@ -351,17 +351,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v29.2d, v1.2d, v9.2d[1] fmla v29.2d, v1.2d, v9.d[1]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v25.2d, v1.2d, v9.2d[0] fmla v25.2d, v1.2d, v9.d[0]
fmla v24.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v9.d[0]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
fmla v28.2d, v0.2d, v9.2d[1] fmla v28.2d, v0.2d, v9.d[1]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
@ -406,10 +406,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v24.2d, v0.2d, v9.2d[0] fmla v24.2d, v0.2d, v9.d[0]
fmla v28.2d, v0.2d, v9.2d[1] fmla v28.2d, v0.2d, v9.d[1]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
@ -490,15 +490,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v2.2d, v3.2d}, [pA] ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v18.2d, v2.2d, v8.2d[0] fmla v18.2d, v2.2d, v8.d[0]
fmla v19.2d, v3.2d, v8.2d[0] fmla v19.2d, v3.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
fmla v22.2d, v2.2d, v8.2d[1] fmla v22.2d, v2.2d, v8.d[1]
fmla v23.2d, v3.2d, v8.2d[1] fmla v23.2d, v3.2d, v8.d[1]
.endm .endm
.macro SAVE8x2 .macro SAVE8x2
@ -534,10 +534,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
fmla v21.2d, v1.2d, v8.2d[1] fmla v21.2d, v1.2d, v8.d[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
@ -568,8 +568,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.2d[1] fmla v20.2d, v0.2d, v8.d[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
@ -597,7 +597,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr d0 , [pA] ldr d0 , [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2d, v8.2d, v0.2d[0] fmla v16.2d, v8.2d, v0.d[0]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2
@ -629,10 +629,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v2.2d, v3.2d}, [pA] ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
fmla v18.2d, v2.2d, v8.2d[0] fmla v18.2d, v2.2d, v8.d[0]
fmla v19.2d, v3.2d, v8.2d[0] fmla v19.2d, v3.2d, v8.d[0]
.endm .endm
.macro SAVE8x1 .macro SAVE8x1
@ -660,8 +660,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d, v1.2d}, [pA] ld1 {v0.2d, v1.2d}, [pA]
add pA , pA, #32 add pA , pA, #32
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.2d[0] fmla v17.2d, v1.2d, v8.d[0]
.endm .endm
.macro SAVE4x1 .macro SAVE4x1
@ -686,7 +686,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2d}, [pA] ld1 {v0.2d}, [pA]
add pA , pA, #16 add pA , pA, #16
fmla v16.2d, v0.2d, v8.2d[0] fmla v16.2d, v0.2d, v8.d[0]
.endm .endm
.macro SAVE2x1 .macro SAVE2x1

View File

@ -158,25 +158,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v3.4s}, [pA] ld1 {v3.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmul v16.4s, v0.4s, v8.2s[0] fmul v16.4s, v0.4s, v8.s[0]
fmul v17.4s, v1.4s, v8.2s[0] fmul v17.4s, v1.4s, v8.s[0]
fmul v18.4s, v2.4s, v8.2s[0] fmul v18.4s, v2.4s, v8.s[0]
fmul v19.4s, v3.4s, v8.2s[0] fmul v19.4s, v3.4s, v8.s[0]
fmul v20.4s, v0.4s, v8.2s[1] fmul v20.4s, v0.4s, v8.s[1]
fmul v21.4s, v1.4s, v8.2s[1] fmul v21.4s, v1.4s, v8.s[1]
fmul v22.4s, v2.4s, v8.2s[1] fmul v22.4s, v2.4s, v8.s[1]
fmul v23.4s, v3.4s, v8.2s[1] fmul v23.4s, v3.4s, v8.s[1]
fmul v24.4s, v0.4s, v9.2s[0] fmul v24.4s, v0.4s, v9.s[0]
fmul v25.4s, v1.4s, v9.2s[0] fmul v25.4s, v1.4s, v9.s[0]
fmul v26.4s, v2.4s, v9.2s[0] fmul v26.4s, v2.4s, v9.s[0]
fmul v27.4s, v3.4s, v9.2s[0] fmul v27.4s, v3.4s, v9.s[0]
fmul v28.4s, v0.4s, v9.2s[1] fmul v28.4s, v0.4s, v9.s[1]
fmul v29.4s, v1.4s, v9.2s[1] fmul v29.4s, v1.4s, v9.s[1]
fmul v30.4s, v2.4s, v9.2s[1] fmul v30.4s, v2.4s, v9.s[1]
fmul v31.4s, v3.4s, v9.2s[1] fmul v31.4s, v3.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -191,25 +191,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL16x4_M1 .macro KERNEL16x4_M1
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v18.4s, v2.4s, v8.2s[0] fmla v18.4s, v2.4s, v8.s[0]
fmla v19.4s, v3.4s, v8.2s[0] fmla v19.4s, v3.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
fmla v22.4s, v2.4s, v8.2s[1] fmla v22.4s, v2.4s, v8.s[1]
fmla v23.4s, v3.4s, v8.2s[1] fmla v23.4s, v3.4s, v8.s[1]
fmla v24.4s, v0.4s, v9.2s[0] fmla v24.4s, v0.4s, v9.s[0]
fmla v25.4s, v1.4s, v9.2s[0] fmla v25.4s, v1.4s, v9.s[0]
fmla v26.4s, v2.4s, v9.2s[0] fmla v26.4s, v2.4s, v9.s[0]
fmla v27.4s, v3.4s, v9.2s[0] fmla v27.4s, v3.4s, v9.s[0]
fmla v28.4s, v0.4s, v9.2s[1] fmla v28.4s, v0.4s, v9.s[1]
fmla v29.4s, v1.4s, v9.2s[1] fmla v29.4s, v1.4s, v9.s[1]
fmla v30.4s, v2.4s, v9.2s[1] fmla v30.4s, v2.4s, v9.s[1]
fmla v31.4s, v3.4s, v9.2s[1] fmla v31.4s, v3.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -224,25 +224,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL16x4_M2 .macro KERNEL16x4_M2
fmla v16.4s, v4.4s, v12.2s[0] fmla v16.4s, v4.4s, v12.s[0]
fmla v17.4s, v5.4s, v12.2s[0] fmla v17.4s, v5.4s, v12.s[0]
fmla v18.4s, v6.4s, v12.2s[0] fmla v18.4s, v6.4s, v12.s[0]
fmla v19.4s, v7.4s, v12.2s[0] fmla v19.4s, v7.4s, v12.s[0]
fmla v20.4s, v4.4s, v12.2s[1] fmla v20.4s, v4.4s, v12.s[1]
fmla v21.4s, v5.4s, v12.2s[1] fmla v21.4s, v5.4s, v12.s[1]
fmla v22.4s, v6.4s, v12.2s[1] fmla v22.4s, v6.4s, v12.s[1]
fmla v23.4s, v7.4s, v12.2s[1] fmla v23.4s, v7.4s, v12.s[1]
fmla v24.4s, v4.4s, v13.2s[0] fmla v24.4s, v4.4s, v13.s[0]
fmla v25.4s, v5.4s, v13.2s[0] fmla v25.4s, v5.4s, v13.s[0]
fmla v26.4s, v6.4s, v13.2s[0] fmla v26.4s, v6.4s, v13.s[0]
fmla v27.4s, v7.4s, v13.2s[0] fmla v27.4s, v7.4s, v13.s[0]
fmla v28.4s, v4.4s, v13.2s[1] fmla v28.4s, v4.4s, v13.s[1]
fmla v29.4s, v5.4s, v13.2s[1] fmla v29.4s, v5.4s, v13.s[1]
fmla v30.4s, v6.4s, v13.2s[1] fmla v30.4s, v6.4s, v13.s[1]
fmla v31.4s, v7.4s, v13.2s[1] fmla v31.4s, v7.4s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB] ld1 {v8.2s, v9.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -257,25 +257,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL16x4_E .macro KERNEL16x4_E
fmla v16.4s, v4.4s, v12.2s[0] fmla v16.4s, v4.4s, v12.s[0]
fmla v17.4s, v5.4s, v12.2s[0] fmla v17.4s, v5.4s, v12.s[0]
fmla v18.4s, v6.4s, v12.2s[0] fmla v18.4s, v6.4s, v12.s[0]
fmla v19.4s, v7.4s, v12.2s[0] fmla v19.4s, v7.4s, v12.s[0]
fmla v20.4s, v4.4s, v12.2s[1] fmla v20.4s, v4.4s, v12.s[1]
fmla v21.4s, v5.4s, v12.2s[1] fmla v21.4s, v5.4s, v12.s[1]
fmla v22.4s, v6.4s, v12.2s[1] fmla v22.4s, v6.4s, v12.s[1]
fmla v23.4s, v7.4s, v12.2s[1] fmla v23.4s, v7.4s, v12.s[1]
fmla v24.4s, v4.4s, v13.2s[0] fmla v24.4s, v4.4s, v13.s[0]
fmla v25.4s, v5.4s, v13.2s[0] fmla v25.4s, v5.4s, v13.s[0]
fmla v26.4s, v6.4s, v13.2s[0] fmla v26.4s, v6.4s, v13.s[0]
fmla v27.4s, v7.4s, v13.2s[0] fmla v27.4s, v7.4s, v13.s[0]
fmla v28.4s, v4.4s, v13.2s[1] fmla v28.4s, v4.4s, v13.s[1]
fmla v29.4s, v5.4s, v13.2s[1] fmla v29.4s, v5.4s, v13.s[1]
fmla v30.4s, v6.4s, v13.2s[1] fmla v30.4s, v6.4s, v13.s[1]
fmla v31.4s, v7.4s, v13.2s[1] fmla v31.4s, v7.4s, v13.s[1]
.endm .endm
.macro KERNEL16x4_SUB .macro KERNEL16x4_SUB
@ -290,25 +290,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v3.4s}, [pA] ld1 {v3.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v18.4s, v2.4s, v8.2s[0] fmla v18.4s, v2.4s, v8.s[0]
fmla v19.4s, v3.4s, v8.2s[0] fmla v19.4s, v3.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
fmla v22.4s, v2.4s, v8.2s[1] fmla v22.4s, v2.4s, v8.s[1]
fmla v23.4s, v3.4s, v8.2s[1] fmla v23.4s, v3.4s, v8.s[1]
fmla v24.4s, v0.4s, v9.2s[0] fmla v24.4s, v0.4s, v9.s[0]
fmla v25.4s, v1.4s, v9.2s[0] fmla v25.4s, v1.4s, v9.s[0]
fmla v26.4s, v2.4s, v9.2s[0] fmla v26.4s, v2.4s, v9.s[0]
fmla v27.4s, v3.4s, v9.2s[0] fmla v27.4s, v3.4s, v9.s[0]
fmla v28.4s, v0.4s, v9.2s[1] fmla v28.4s, v0.4s, v9.s[1]
fmla v29.4s, v1.4s, v9.2s[1] fmla v29.4s, v1.4s, v9.s[1]
fmla v30.4s, v2.4s, v9.2s[1] fmla v30.4s, v2.4s, v9.s[1]
fmla v31.4s, v3.4s, v9.2s[1] fmla v31.4s, v3.4s, v9.s[1]
.endm .endm
.macro SAVE16x4 .macro SAVE16x4
@ -370,14 +370,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmul v16.4s, v0.4s, v8.2s[0] fmul v16.4s, v0.4s, v8.s[0]
fmul v17.4s, v1.4s, v8.2s[0] fmul v17.4s, v1.4s, v8.s[0]
fmul v20.4s, v0.4s, v8.2s[1] fmul v20.4s, v0.4s, v8.s[1]
fmul v21.4s, v1.4s, v8.2s[1] fmul v21.4s, v1.4s, v8.s[1]
fmul v24.4s, v0.4s, v9.2s[0] fmul v24.4s, v0.4s, v9.s[0]
fmul v25.4s, v1.4s, v9.2s[0] fmul v25.4s, v1.4s, v9.s[0]
fmul v28.4s, v0.4s, v9.2s[1] fmul v28.4s, v0.4s, v9.s[1]
fmul v29.4s, v1.4s, v9.2s[1] fmul v29.4s, v1.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -388,14 +388,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_M1 .macro KERNEL8x4_M1
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
fmla v24.4s, v0.4s, v9.2s[0] fmla v24.4s, v0.4s, v9.s[0]
fmla v25.4s, v1.4s, v9.2s[0] fmla v25.4s, v1.4s, v9.s[0]
fmla v28.4s, v0.4s, v9.2s[1] fmla v28.4s, v0.4s, v9.s[1]
fmla v29.4s, v1.4s, v9.2s[1] fmla v29.4s, v1.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -406,14 +406,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_M2 .macro KERNEL8x4_M2
fmla v16.4s, v4.4s, v12.2s[0] fmla v16.4s, v4.4s, v12.s[0]
fmla v17.4s, v5.4s, v12.2s[0] fmla v17.4s, v5.4s, v12.s[0]
fmla v20.4s, v4.4s, v12.2s[1] fmla v20.4s, v4.4s, v12.s[1]
fmla v21.4s, v5.4s, v12.2s[1] fmla v21.4s, v5.4s, v12.s[1]
fmla v24.4s, v4.4s, v13.2s[0] fmla v24.4s, v4.4s, v13.s[0]
fmla v25.4s, v5.4s, v13.2s[0] fmla v25.4s, v5.4s, v13.s[0]
fmla v28.4s, v4.4s, v13.2s[1] fmla v28.4s, v4.4s, v13.s[1]
fmla v29.4s, v5.4s, v13.2s[1] fmla v29.4s, v5.4s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB] ld1 {v8.2s, v9.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -424,14 +424,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_E .macro KERNEL8x4_E
fmla v16.4s, v4.4s, v12.2s[0] fmla v16.4s, v4.4s, v12.s[0]
fmla v17.4s, v5.4s, v12.2s[0] fmla v17.4s, v5.4s, v12.s[0]
fmla v20.4s, v4.4s, v12.2s[1] fmla v20.4s, v4.4s, v12.s[1]
fmla v21.4s, v5.4s, v12.2s[1] fmla v21.4s, v5.4s, v12.s[1]
fmla v24.4s, v4.4s, v13.2s[0] fmla v24.4s, v4.4s, v13.s[0]
fmla v25.4s, v5.4s, v13.2s[0] fmla v25.4s, v5.4s, v13.s[0]
fmla v28.4s, v4.4s, v13.2s[1] fmla v28.4s, v4.4s, v13.s[1]
fmla v29.4s, v5.4s, v13.2s[1] fmla v29.4s, v5.4s, v13.s[1]
.endm .endm
.macro KERNEL8x4_SUB .macro KERNEL8x4_SUB
@ -442,14 +442,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
fmla v24.4s, v0.4s, v9.2s[0] fmla v24.4s, v0.4s, v9.s[0]
fmla v25.4s, v1.4s, v9.2s[0] fmla v25.4s, v1.4s, v9.s[0]
fmla v28.4s, v0.4s, v9.2s[1] fmla v28.4s, v0.4s, v9.s[1]
fmla v29.4s, v1.4s, v9.2s[1] fmla v29.4s, v1.4s, v9.s[1]
.endm .endm
.macro SAVE8x4 .macro SAVE8x4
@ -501,17 +501,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmul v16.2s, v0.2s, v8.2s[0] fmul v16.2s, v0.2s, v8.s[0]
fmul v29.2s, v1.2s, v9.2s[1] fmul v29.2s, v1.2s, v9.s[1]
fmul v20.2s, v0.2s, v8.2s[1] fmul v20.2s, v0.2s, v8.s[1]
fmul v25.2s, v1.2s, v9.2s[0] fmul v25.2s, v1.2s, v9.s[0]
fmul v24.2s, v0.2s, v9.2s[0] fmul v24.2s, v0.2s, v9.s[0]
fmul v21.2s, v1.2s, v8.2s[1] fmul v21.2s, v1.2s, v8.s[1]
fmul v28.2s, v0.2s, v9.2s[1] fmul v28.2s, v0.2s, v9.s[1]
fmul v17.2s, v1.2s, v8.2s[0] fmul v17.2s, v1.2s, v8.s[0]
ld1 {v12.2s, v13.2s}, [pB] ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -520,61 +520,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x4_M1 .macro KERNEL4x4_M1
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v29.2s, v1.2s, v9.2s[1] fmla v29.2s, v1.2s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] // For next round ld1 {v12.2s, v13.2s}, [pB] // For next round
add pB, pB, #16 add pB, pB, #16
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v25.2s, v1.2s, v9.2s[0] fmla v25.2s, v1.2s, v9.s[0]
ld1 {v4.2s, v5.2s}, [pA] // For next round ld1 {v4.2s, v5.2s}, [pA] // For next round
add pA, pA, #16 add pA, pA, #16
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
.endm .endm
.macro KERNEL4x4_M2 .macro KERNEL4x4_M2
fmla v16.2s, v4.2s, v12.2s[0] fmla v16.2s, v4.2s, v12.s[0]
fmla v29.2s, v5.2s, v13.2s[1] fmla v29.2s, v5.2s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB] // For next round ld1 {v8.2s, v9.2s}, [pB] // For next round
add pB, pB, #16 add pB, pB, #16
fmla v20.2s, v4.2s, v12.2s[1] fmla v20.2s, v4.2s, v12.s[1]
fmla v25.2s, v5.2s, v13.2s[0] fmla v25.2s, v5.2s, v13.s[0]
ld1 {v0.2s, v1.2s}, [pA] // For next round ld1 {v0.2s, v1.2s}, [pA] // For next round
add pA, pA, #16 add pA, pA, #16
fmla v24.2s, v4.2s, v13.2s[0] fmla v24.2s, v4.2s, v13.s[0]
fmla v21.2s, v5.2s, v12.2s[1] fmla v21.2s, v5.2s, v12.s[1]
prfm PLDL1KEEP, [pA, #512] prfm PLDL1KEEP, [pA, #512]
fmla v28.2s, v4.2s, v13.2s[1] fmla v28.2s, v4.2s, v13.s[1]
fmla v17.2s, v5.2s, v12.2s[0] fmla v17.2s, v5.2s, v12.s[0]
.endm .endm
.macro KERNEL4x4_E .macro KERNEL4x4_E
fmla v16.2s, v4.2s, v12.2s[0] fmla v16.2s, v4.2s, v12.s[0]
fmla v29.2s, v5.2s, v13.2s[1] fmla v29.2s, v5.2s, v13.s[1]
fmla v20.2s, v4.2s, v12.2s[1] fmla v20.2s, v4.2s, v12.s[1]
fmla v25.2s, v5.2s, v13.2s[0] fmla v25.2s, v5.2s, v13.s[0]
fmla v24.2s, v4.2s, v13.2s[0] fmla v24.2s, v4.2s, v13.s[0]
fmla v21.2s, v5.2s, v12.2s[1] fmla v21.2s, v5.2s, v12.s[1]
fmla v28.2s, v4.2s, v13.2s[1] fmla v28.2s, v4.2s, v13.s[1]
fmla v17.2s, v5.2s, v12.2s[0] fmla v17.2s, v5.2s, v12.s[0]
.endm .endm
.macro KERNEL4x4_SUB .macro KERNEL4x4_SUB
@ -583,17 +583,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v29.2s, v1.2s, v9.2s[1] fmla v29.2s, v1.2s, v9.s[1]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v25.2s, v1.2s, v9.2s[0] fmla v25.2s, v1.2s, v9.s[0]
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
@ -638,10 +638,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA] ld1 {v0.2s}, [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
@ -729,15 +729,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v3.4s}, [pA] ld1 {v3.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v18.4s, v2.4s, v8.2s[0] fmla v18.4s, v2.4s, v8.s[0]
fmla v19.4s, v3.4s, v8.2s[0] fmla v19.4s, v3.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
fmla v22.4s, v2.4s, v8.2s[1] fmla v22.4s, v2.4s, v8.s[1]
fmla v23.4s, v3.4s, v8.2s[1] fmla v23.4s, v3.4s, v8.s[1]
.endm .endm
.macro SAVE16x2 .macro SAVE16x2
@ -777,11 +777,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
.endm .endm
.macro SAVE8x2 .macro SAVE8x2
@ -817,10 +817,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
@ -852,8 +852,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA] ld1 {v0.2s}, [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
@ -882,7 +882,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr s0 , [pA] ldr s0 , [pA]
add pA, pA, #4 add pA, pA, #4
fmla v16.2s, v8.2s, v0.2s[0] fmla v16.2s, v8.2s, v0.s[0]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2
@ -918,10 +918,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v3.4s}, [pA] ld1 {v3.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v18.4s, v2.4s, v8.2s[0] fmla v18.4s, v2.4s, v8.s[0]
fmla v19.4s, v3.4s, v8.2s[0] fmla v19.4s, v3.4s, v8.s[0]
.endm .endm
.macro SAVE16x1 .macro SAVE16x1
@ -951,8 +951,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
.endm .endm
.macro SAVE8x1 .macro SAVE8x1
@ -978,8 +978,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA , pA, #16 add pA , pA, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
.endm .endm
.macro SAVE4x1 .macro SAVE4x1
@ -1004,7 +1004,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA] ld1 {v0.2s}, [pA]
add pA , pA, #8 add pA , pA, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
.endm .endm
.macro SAVE2x1 .macro SAVE2x1

View File

@ -192,164 +192,164 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.4s}, [pA_0] ld1 {v0.4s}, [pA_0]
add pA_0, pA_0, #16 add pA_0, pA_0, #16
fmul v16.4s, v0.4s, v8.4s[0] fmul v16.4s, v0.4s, v8.s[0]
fmul v20.4s, v0.4s, v8.4s[1] fmul v20.4s, v0.4s, v8.s[1]
ld1 {v2.4s}, [pA_1] ld1 {v2.4s}, [pA_1]
add pA_1, pA_1, #16 add pA_1, pA_1, #16
fmul v24.4s, v0.4s, v8.4s[2] fmul v24.4s, v0.4s, v8.s[2]
fmul v28.4s, v0.4s, v8.4s[3] fmul v28.4s, v0.4s, v8.s[3]
ld1 {v4.4s}, [pA_2] ld1 {v4.4s}, [pA_2]
add pA_2, pA_2, #16 add pA_2, pA_2, #16
fmul v17.4s, v2.4s, v8.4s[0] fmul v17.4s, v2.4s, v8.s[0]
fmul v21.4s, v2.4s, v8.4s[1] fmul v21.4s, v2.4s, v8.s[1]
ld1 {v6.4s}, [pA_3] ld1 {v6.4s}, [pA_3]
add pA_3, pA_3, #16 add pA_3, pA_3, #16
fmul v25.4s, v2.4s, v8.4s[2] fmul v25.4s, v2.4s, v8.s[2]
fmul v29.4s, v2.4s, v8.4s[3] fmul v29.4s, v2.4s, v8.s[3]
ld1 {v12.4s}, [pB] // for next round ld1 {v12.4s}, [pB] // for next round
add pB, pB, #16 add pB, pB, #16
fmul v18.4s, v4.4s, v8.4s[0] fmul v18.4s, v4.4s, v8.s[0]
fmul v19.4s, v6.4s, v8.4s[0] fmul v19.4s, v6.4s, v8.s[0]
ld1 {v1.4s}, [pA_0] // for next round ld1 {v1.4s}, [pA_0] // for next round
add pA_0, pA_0, #16 add pA_0, pA_0, #16
fmul v22.4s, v4.4s, v8.4s[1] fmul v22.4s, v4.4s, v8.s[1]
fmul v23.4s, v6.4s, v8.4s[1] fmul v23.4s, v6.4s, v8.s[1]
ld1 {v3.4s}, [pA_1] // for next round ld1 {v3.4s}, [pA_1] // for next round
add pA_1, pA_1, #16 add pA_1, pA_1, #16
fmul v26.4s, v4.4s, v8.4s[2] fmul v26.4s, v4.4s, v8.s[2]
fmul v27.4s, v6.4s, v8.4s[2] fmul v27.4s, v6.4s, v8.s[2]
ld1 {v5.4s}, [pA_2] // for next round ld1 {v5.4s}, [pA_2] // for next round
add pA_2, pA_2, #16 add pA_2, pA_2, #16
fmul v30.4s, v4.4s, v8.4s[3] fmul v30.4s, v4.4s, v8.s[3]
fmul v31.4s, v6.4s, v8.4s[3] fmul v31.4s, v6.4s, v8.s[3]
ld1 {v7.4s}, [pA_3] // for next round ld1 {v7.4s}, [pA_3] // for next round
add pA_3, pA_3, #16 add pA_3, pA_3, #16
.endm .endm
.macro KERNEL16x4_M2 .macro KERNEL16x4_M2
fmla v16.4s, v1.4s, v12.4s[0] fmla v16.4s, v1.4s, v12.s[0]
fmla v17.4s, v3.4s, v12.4s[0] fmla v17.4s, v3.4s, v12.s[0]
ld1 {v8.4s}, [pB] // for next round ld1 {v8.4s}, [pB] // for next round
add pB, pB, #16 add pB, pB, #16
fmla v18.4s, v5.4s, v12.4s[0] fmla v18.4s, v5.4s, v12.s[0]
fmla v19.4s, v7.4s, v12.4s[0] fmla v19.4s, v7.4s, v12.s[0]
ld1 {v0.4s}, [pA_0] // for next round ld1 {v0.4s}, [pA_0] // for next round
add pA_0, pA_0, #16 add pA_0, pA_0, #16
fmla v20.4s, v1.4s, v12.4s[1] fmla v20.4s, v1.4s, v12.s[1]
fmla v21.4s, v3.4s, v12.4s[1] fmla v21.4s, v3.4s, v12.s[1]
ld1 {v2.4s}, [pA_1] // for next round ld1 {v2.4s}, [pA_1] // for next round
add pA_1, pA_1, #16 add pA_1, pA_1, #16
fmla v22.4s, v5.4s, v12.4s[1] fmla v22.4s, v5.4s, v12.s[1]
fmla v23.4s, v7.4s, v12.4s[1] fmla v23.4s, v7.4s, v12.s[1]
ld1 {v4.4s}, [pA_2] // for next round ld1 {v4.4s}, [pA_2] // for next round
add pA_2, pA_2, #16 add pA_2, pA_2, #16
fmla v24.4s, v1.4s, v12.4s[2] fmla v24.4s, v1.4s, v12.s[2]
fmla v25.4s, v3.4s, v12.4s[2] fmla v25.4s, v3.4s, v12.s[2]
ld1 {v6.4s}, [pA_3] // for next round ld1 {v6.4s}, [pA_3] // for next round
add pA_3, pA_3, #16 add pA_3, pA_3, #16
fmla v26.4s, v5.4s, v12.4s[2] fmla v26.4s, v5.4s, v12.s[2]
fmla v27.4s, v7.4s, v12.4s[2] fmla v27.4s, v7.4s, v12.s[2]
prfm PLDL1KEEP, [pA_2, #512] prfm PLDL1KEEP, [pA_2, #512]
fmla v28.4s, v1.4s, v12.4s[3] fmla v28.4s, v1.4s, v12.s[3]
fmla v29.4s, v3.4s, v12.4s[3] fmla v29.4s, v3.4s, v12.s[3]
prfm PLDL1KEEP, [pA_3, #512] prfm PLDL1KEEP, [pA_3, #512]
fmla v30.4s, v5.4s, v12.4s[3] fmla v30.4s, v5.4s, v12.s[3]
fmla v31.4s, v7.4s, v12.4s[3] fmla v31.4s, v7.4s, v12.s[3]
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
.endm .endm
.macro KERNEL16x4_M1 .macro KERNEL16x4_M1
fmla v16.4s, v0.4s, v8.4s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v2.4s, v8.4s[0] fmla v17.4s, v2.4s, v8.s[0]
ld1 {v12.4s}, [pB] // for next round ld1 {v12.4s}, [pB] // for next round
add pB, pB, #16 add pB, pB, #16
fmla v18.4s, v4.4s, v8.4s[0] fmla v18.4s, v4.4s, v8.s[0]
fmla v19.4s, v6.4s, v8.4s[0] fmla v19.4s, v6.4s, v8.s[0]
ld1 {v1.4s}, [pA_0] // for next round ld1 {v1.4s}, [pA_0] // for next round
add pA_0, pA_0, #16 add pA_0, pA_0, #16
fmla v20.4s, v0.4s, v8.4s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v2.4s, v8.4s[1] fmla v21.4s, v2.4s, v8.s[1]
ld1 {v3.4s}, [pA_1] // for next round ld1 {v3.4s}, [pA_1] // for next round
add pA_1, pA_1, #16 add pA_1, pA_1, #16
fmla v22.4s, v4.4s, v8.4s[1] fmla v22.4s, v4.4s, v8.s[1]
fmla v23.4s, v6.4s, v8.4s[1] fmla v23.4s, v6.4s, v8.s[1]
ld1 {v5.4s}, [pA_2] // for next round ld1 {v5.4s}, [pA_2] // for next round
add pA_2, pA_2, #16 add pA_2, pA_2, #16
fmla v24.4s, v0.4s, v8.4s[2] fmla v24.4s, v0.4s, v8.s[2]
fmla v25.4s, v2.4s, v8.4s[2] fmla v25.4s, v2.4s, v8.s[2]
ld1 {v7.4s}, [pA_3] // for next round ld1 {v7.4s}, [pA_3] // for next round
add pA_3, pA_3, #16 add pA_3, pA_3, #16
fmla v26.4s, v4.4s, v8.4s[2] fmla v26.4s, v4.4s, v8.s[2]
fmla v27.4s, v6.4s, v8.4s[2] fmla v27.4s, v6.4s, v8.s[2]
prfm PLDL1KEEP, [pA_0, #512] prfm PLDL1KEEP, [pA_0, #512]
fmla v28.4s, v0.4s, v8.4s[3] fmla v28.4s, v0.4s, v8.s[3]
fmla v29.4s, v2.4s, v8.4s[3] fmla v29.4s, v2.4s, v8.s[3]
prfm PLDL1KEEP, [pA_1, #512] prfm PLDL1KEEP, [pA_1, #512]
fmla v30.4s, v4.4s, v8.4s[3] fmla v30.4s, v4.4s, v8.s[3]
fmla v31.4s, v6.4s, v8.4s[3] fmla v31.4s, v6.4s, v8.s[3]
.endm .endm
.macro KERNEL16x4_E .macro KERNEL16x4_E
fmla v16.4s, v1.4s, v12.4s[0] fmla v16.4s, v1.4s, v12.s[0]
fmla v17.4s, v3.4s, v12.4s[0] fmla v17.4s, v3.4s, v12.s[0]
fmla v18.4s, v5.4s, v12.4s[0] fmla v18.4s, v5.4s, v12.s[0]
fmla v19.4s, v7.4s, v12.4s[0] fmla v19.4s, v7.4s, v12.s[0]
fmla v20.4s, v1.4s, v12.4s[1] fmla v20.4s, v1.4s, v12.s[1]
fmla v21.4s, v3.4s, v12.4s[1] fmla v21.4s, v3.4s, v12.s[1]
fmla v22.4s, v5.4s, v12.4s[1] fmla v22.4s, v5.4s, v12.s[1]
fmla v23.4s, v7.4s, v12.4s[1] fmla v23.4s, v7.4s, v12.s[1]
fmla v24.4s, v1.4s, v12.4s[2] fmla v24.4s, v1.4s, v12.s[2]
fmla v25.4s, v3.4s, v12.4s[2] fmla v25.4s, v3.4s, v12.s[2]
fmla v26.4s, v5.4s, v12.4s[2] fmla v26.4s, v5.4s, v12.s[2]
fmla v27.4s, v7.4s, v12.4s[2] fmla v27.4s, v7.4s, v12.s[2]
fmla v28.4s, v1.4s, v12.4s[3] fmla v28.4s, v1.4s, v12.s[3]
fmla v29.4s, v3.4s, v12.4s[3] fmla v29.4s, v3.4s, v12.s[3]
fmla v30.4s, v5.4s, v12.4s[3] fmla v30.4s, v5.4s, v12.s[3]
fmla v31.4s, v7.4s, v12.4s[3] fmla v31.4s, v7.4s, v12.s[3]
.endm .endm
.macro KERNEL16x4_SUB .macro KERNEL16x4_SUB
@ -359,34 +359,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.4s}, [pA_0] ld1 {v0.4s}, [pA_0]
add pA_0, pA_0, #16 add pA_0, pA_0, #16
fmla v16.4s, v0.4s, v8.4s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.4s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v24.4s, v0.4s, v8.4s[2] fmla v24.4s, v0.4s, v8.s[2]
fmla v28.4s, v0.4s, v8.4s[3] fmla v28.4s, v0.4s, v8.s[3]
ld1 {v2.4s}, [pA_1] ld1 {v2.4s}, [pA_1]
add pA_1, pA_1, #16 add pA_1, pA_1, #16
fmla v17.4s, v2.4s, v8.4s[0] fmla v17.4s, v2.4s, v8.s[0]
fmla v21.4s, v2.4s, v8.4s[1] fmla v21.4s, v2.4s, v8.s[1]
fmla v25.4s, v2.4s, v8.4s[2] fmla v25.4s, v2.4s, v8.s[2]
fmla v29.4s, v2.4s, v8.4s[3] fmla v29.4s, v2.4s, v8.s[3]
ld1 {v4.4s}, [pA_2] ld1 {v4.4s}, [pA_2]
add pA_2, pA_2, #16 add pA_2, pA_2, #16
fmla v18.4s, v4.4s, v8.4s[0] fmla v18.4s, v4.4s, v8.s[0]
fmla v22.4s, v4.4s, v8.4s[1] fmla v22.4s, v4.4s, v8.s[1]
fmla v26.4s, v4.4s, v8.4s[2] fmla v26.4s, v4.4s, v8.s[2]
fmla v30.4s, v4.4s, v8.4s[3] fmla v30.4s, v4.4s, v8.s[3]
ld1 {v6.4s}, [pA_3] ld1 {v6.4s}, [pA_3]
add pA_3, pA_3, #16 add pA_3, pA_3, #16
fmla v19.4s, v6.4s, v8.4s[0] fmla v19.4s, v6.4s, v8.s[0]
fmla v23.4s, v6.4s, v8.4s[1] fmla v23.4s, v6.4s, v8.s[1]
fmla v27.4s, v6.4s, v8.4s[2] fmla v27.4s, v6.4s, v8.s[2]
fmla v31.4s, v6.4s, v8.4s[3] fmla v31.4s, v6.4s, v8.s[3]
.endm .endm
.macro SAVE16x4 .macro SAVE16x4
@ -456,28 +456,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA_0] ld1 {v0.2s, v1.2s}, [pA_0]
add pA_0, pA_0, #16 add pA_0, pA_0, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v29.2s, v1.2s, v9.2s[1] fmla v29.2s, v1.2s, v9.s[1]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v25.2s, v1.2s, v9.2s[0] fmla v25.2s, v1.2s, v9.s[0]
ld1 {v2.2s, v3.2s}, [pA_1] ld1 {v2.2s, v3.2s}, [pA_1]
add pA_1, pA_1, #16 add pA_1, pA_1, #16
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
fmla v18.2s, v2.2s, v8.2s[0] fmla v18.2s, v2.2s, v8.s[0]
fmla v31.2s, v3.2s, v9.2s[1] fmla v31.2s, v3.2s, v9.s[1]
fmla v22.2s, v2.2s, v8.2s[1] fmla v22.2s, v2.2s, v8.s[1]
fmla v27.2s, v3.2s, v9.2s[0] fmla v27.2s, v3.2s, v9.s[0]
fmla v26.2s, v2.2s, v9.2s[0] fmla v26.2s, v2.2s, v9.s[0]
fmla v23.2s, v3.2s, v8.2s[1] fmla v23.2s, v3.2s, v8.s[1]
fmla v30.2s, v2.2s, v9.2s[1] fmla v30.2s, v2.2s, v9.s[1]
fmla v19.2s, v3.2s, v8.2s[0] fmla v19.2s, v3.2s, v8.s[0]
.endm .endm
.macro SAVE8x4 .macro SAVE8x4
@ -556,17 +556,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA_0] ld1 {v0.2s, v1.2s}, [pA_0]
add pA_0, pA_0, #16 add pA_0, pA_0, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v29.2s, v1.2s, v9.2s[1] fmla v29.2s, v1.2s, v9.s[1]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v25.2s, v1.2s, v9.2s[0] fmla v25.2s, v1.2s, v9.s[0]
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
@ -614,10 +614,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA_0] ld1 {v0.2s}, [pA_0]
add pA_0, pA_0, #8 add pA_0, pA_0, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
@ -700,10 +700,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA_0] ld1 {v0.2s, v1.2s}, [pA_0]
add pA_0, pA_0, #16 add pA_0, pA_0, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
@ -736,8 +736,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA_0] ld1 {v0.2s}, [pA_0]
add pA_0, pA_0, #8 add pA_0, pA_0, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
@ -767,7 +767,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr s0 , [pA_0] ldr s0 , [pA_0]
add pA_0, pA_0, #4 add pA_0, pA_0, #4
fmla v16.2s, v8.2s, v0.2s[0] fmla v16.2s, v8.2s, v0.s[0]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2
@ -796,8 +796,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA_0] ld1 {v0.2s, v1.2s}, [pA_0]
add pA_0 , pA_0, #16 add pA_0 , pA_0, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
.endm .endm
.macro SAVE4x1 .macro SAVE4x1
@ -825,7 +825,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA_0] ld1 {v0.2s}, [pA_0]
add pA_0 , pA_0, #8 add pA_0 , pA_0, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
.endm .endm
.macro SAVE2x1 .macro SAVE2x1

View File

@ -157,22 +157,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmul v16.4s, v0.4s, v4.4s[0] fmul v16.4s, v0.4s, v4.s[0]
fmul v17.4s, v1.4s, v4.4s[0] fmul v17.4s, v1.4s, v4.s[0]
fmul v18.4s, v0.4s, v4.4s[1] fmul v18.4s, v0.4s, v4.s[1]
fmul v19.4s, v1.4s, v4.4s[1] fmul v19.4s, v1.4s, v4.s[1]
fmul v20.4s, v0.4s, v4.4s[2] fmul v20.4s, v0.4s, v4.s[2]
fmul v21.4s, v1.4s, v4.4s[2] fmul v21.4s, v1.4s, v4.s[2]
fmul v22.4s, v0.4s, v4.4s[3] fmul v22.4s, v0.4s, v4.s[3]
fmul v23.4s, v1.4s, v4.4s[3] fmul v23.4s, v1.4s, v4.s[3]
fmul v24.4s, v0.4s, v5.4s[0] fmul v24.4s, v0.4s, v5.s[0]
fmul v25.4s, v1.4s, v5.4s[0] fmul v25.4s, v1.4s, v5.s[0]
fmul v26.4s, v0.4s, v5.4s[1] fmul v26.4s, v0.4s, v5.s[1]
fmul v27.4s, v1.4s, v5.4s[1] fmul v27.4s, v1.4s, v5.s[1]
fmul v28.4s, v0.4s, v5.4s[2] fmul v28.4s, v0.4s, v5.s[2]
fmul v29.4s, v1.4s, v5.4s[2] fmul v29.4s, v1.4s, v5.s[2]
fmul v30.4s, v0.4s, v5.4s[3] fmul v30.4s, v0.4s, v5.s[3]
fmul v31.4s, v1.4s, v5.4s[3] fmul v31.4s, v1.4s, v5.s[3]
ld1 {v6.4s}, [pB] ld1 {v6.4s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -185,22 +185,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x8_M1 .macro KERNEL8x8_M1
fmla v16.4s, v0.4s, v4.4s[0] fmla v16.4s, v0.4s, v4.s[0]
fmla v17.4s, v1.4s, v4.4s[0] fmla v17.4s, v1.4s, v4.s[0]
fmla v18.4s, v0.4s, v4.4s[1] fmla v18.4s, v0.4s, v4.s[1]
fmla v19.4s, v1.4s, v4.4s[1] fmla v19.4s, v1.4s, v4.s[1]
fmla v20.4s, v0.4s, v4.4s[2] fmla v20.4s, v0.4s, v4.s[2]
fmla v21.4s, v1.4s, v4.4s[2] fmla v21.4s, v1.4s, v4.s[2]
fmla v22.4s, v0.4s, v4.4s[3] fmla v22.4s, v0.4s, v4.s[3]
fmla v23.4s, v1.4s, v4.4s[3] fmla v23.4s, v1.4s, v4.s[3]
fmla v24.4s, v0.4s, v5.4s[0] fmla v24.4s, v0.4s, v5.s[0]
fmla v25.4s, v1.4s, v5.4s[0] fmla v25.4s, v1.4s, v5.s[0]
fmla v26.4s, v0.4s, v5.4s[1] fmla v26.4s, v0.4s, v5.s[1]
fmla v27.4s, v1.4s, v5.4s[1] fmla v27.4s, v1.4s, v5.s[1]
fmla v28.4s, v0.4s, v5.4s[2] fmla v28.4s, v0.4s, v5.s[2]
fmla v29.4s, v1.4s, v5.4s[2] fmla v29.4s, v1.4s, v5.s[2]
fmla v30.4s, v0.4s, v5.4s[3] fmla v30.4s, v0.4s, v5.s[3]
fmla v31.4s, v1.4s, v5.4s[3] fmla v31.4s, v1.4s, v5.s[3]
ld1 {v6.4s}, [pB] ld1 {v6.4s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -213,22 +213,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x8_M2 .macro KERNEL8x8_M2
fmla v16.4s, v2.4s, v6.4s[0] fmla v16.4s, v2.4s, v6.s[0]
fmla v17.4s, v3.4s, v6.4s[0] fmla v17.4s, v3.4s, v6.s[0]
fmla v18.4s, v2.4s, v6.4s[1] fmla v18.4s, v2.4s, v6.s[1]
fmla v19.4s, v3.4s, v6.4s[1] fmla v19.4s, v3.4s, v6.s[1]
fmla v20.4s, v2.4s, v6.4s[2] fmla v20.4s, v2.4s, v6.s[2]
fmla v21.4s, v3.4s, v6.4s[2] fmla v21.4s, v3.4s, v6.s[2]
fmla v22.4s, v2.4s, v6.4s[3] fmla v22.4s, v2.4s, v6.s[3]
fmla v23.4s, v3.4s, v6.4s[3] fmla v23.4s, v3.4s, v6.s[3]
fmla v24.4s, v2.4s, v7.4s[0] fmla v24.4s, v2.4s, v7.s[0]
fmla v25.4s, v3.4s, v7.4s[0] fmla v25.4s, v3.4s, v7.s[0]
fmla v26.4s, v2.4s, v7.4s[1] fmla v26.4s, v2.4s, v7.s[1]
fmla v27.4s, v3.4s, v7.4s[1] fmla v27.4s, v3.4s, v7.s[1]
fmla v28.4s, v2.4s, v7.4s[2] fmla v28.4s, v2.4s, v7.s[2]
fmla v29.4s, v3.4s, v7.4s[2] fmla v29.4s, v3.4s, v7.s[2]
fmla v30.4s, v2.4s, v7.4s[3] fmla v30.4s, v2.4s, v7.s[3]
fmla v31.4s, v3.4s, v7.4s[3] fmla v31.4s, v3.4s, v7.s[3]
ld1 {v4.4s}, [pB] ld1 {v4.4s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -241,22 +241,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x8_E .macro KERNEL8x8_E
fmla v16.4s, v2.4s, v6.4s[0] fmla v16.4s, v2.4s, v6.s[0]
fmla v17.4s, v3.4s, v6.4s[0] fmla v17.4s, v3.4s, v6.s[0]
fmla v18.4s, v2.4s, v6.4s[1] fmla v18.4s, v2.4s, v6.s[1]
fmla v19.4s, v3.4s, v6.4s[1] fmla v19.4s, v3.4s, v6.s[1]
fmla v20.4s, v2.4s, v6.4s[2] fmla v20.4s, v2.4s, v6.s[2]
fmla v21.4s, v3.4s, v6.4s[2] fmla v21.4s, v3.4s, v6.s[2]
fmla v22.4s, v2.4s, v6.4s[3] fmla v22.4s, v2.4s, v6.s[3]
fmla v23.4s, v3.4s, v6.4s[3] fmla v23.4s, v3.4s, v6.s[3]
fmla v24.4s, v2.4s, v7.4s[0] fmla v24.4s, v2.4s, v7.s[0]
fmla v25.4s, v3.4s, v7.4s[0] fmla v25.4s, v3.4s, v7.s[0]
fmla v26.4s, v2.4s, v7.4s[1] fmla v26.4s, v2.4s, v7.s[1]
fmla v27.4s, v3.4s, v7.4s[1] fmla v27.4s, v3.4s, v7.s[1]
fmla v28.4s, v2.4s, v7.4s[2] fmla v28.4s, v2.4s, v7.s[2]
fmla v29.4s, v3.4s, v7.4s[2] fmla v29.4s, v3.4s, v7.s[2]
fmla v30.4s, v2.4s, v7.4s[3] fmla v30.4s, v2.4s, v7.s[3]
fmla v31.4s, v3.4s, v7.4s[3] fmla v31.4s, v3.4s, v7.s[3]
.endm .endm
.macro KERNEL8x8_SUB .macro KERNEL8x8_SUB
@ -269,22 +269,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v4.4s[0] fmla v16.4s, v0.4s, v4.s[0]
fmla v17.4s, v1.4s, v4.4s[0] fmla v17.4s, v1.4s, v4.s[0]
fmla v18.4s, v0.4s, v4.4s[1] fmla v18.4s, v0.4s, v4.s[1]
fmla v19.4s, v1.4s, v4.4s[1] fmla v19.4s, v1.4s, v4.s[1]
fmla v20.4s, v0.4s, v4.4s[2] fmla v20.4s, v0.4s, v4.s[2]
fmla v21.4s, v1.4s, v4.4s[2] fmla v21.4s, v1.4s, v4.s[2]
fmla v22.4s, v0.4s, v4.4s[3] fmla v22.4s, v0.4s, v4.s[3]
fmla v23.4s, v1.4s, v4.4s[3] fmla v23.4s, v1.4s, v4.s[3]
fmla v24.4s, v0.4s, v5.4s[0] fmla v24.4s, v0.4s, v5.s[0]
fmla v25.4s, v1.4s, v5.4s[0] fmla v25.4s, v1.4s, v5.s[0]
fmla v26.4s, v0.4s, v5.4s[1] fmla v26.4s, v0.4s, v5.s[1]
fmla v27.4s, v1.4s, v5.4s[1] fmla v27.4s, v1.4s, v5.s[1]
fmla v28.4s, v0.4s, v5.4s[2] fmla v28.4s, v0.4s, v5.s[2]
fmla v29.4s, v1.4s, v5.4s[2] fmla v29.4s, v1.4s, v5.s[2]
fmla v30.4s, v0.4s, v5.4s[3] fmla v30.4s, v0.4s, v5.s[3]
fmla v31.4s, v1.4s, v5.4s[3] fmla v31.4s, v1.4s, v5.s[3]
.endm .endm
.macro SAVE8x8 .macro SAVE8x8
@ -367,14 +367,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.4s}, [pA] ld1 {v0.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmul v16.4s, v0.4s, v4.4s[0] fmul v16.4s, v0.4s, v4.s[0]
fmul v18.4s, v0.4s, v4.4s[1] fmul v18.4s, v0.4s, v4.s[1]
fmul v20.4s, v0.4s, v4.4s[2] fmul v20.4s, v0.4s, v4.s[2]
fmul v22.4s, v0.4s, v4.4s[3] fmul v22.4s, v0.4s, v4.s[3]
fmul v24.4s, v0.4s, v5.4s[0] fmul v24.4s, v0.4s, v5.s[0]
fmul v26.4s, v0.4s, v5.4s[1] fmul v26.4s, v0.4s, v5.s[1]
fmul v28.4s, v0.4s, v5.4s[2] fmul v28.4s, v0.4s, v5.s[2]
fmul v30.4s, v0.4s, v5.4s[3] fmul v30.4s, v0.4s, v5.s[3]
ld1 {v6.4s}, [pB] ld1 {v6.4s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -385,14 +385,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x8_M1 .macro KERNEL4x8_M1
fmla v16.4s, v0.4s, v4.4s[0] fmla v16.4s, v0.4s, v4.s[0]
fmla v18.4s, v0.4s, v4.4s[1] fmla v18.4s, v0.4s, v4.s[1]
fmla v20.4s, v0.4s, v4.4s[2] fmla v20.4s, v0.4s, v4.s[2]
fmla v22.4s, v0.4s, v4.4s[3] fmla v22.4s, v0.4s, v4.s[3]
fmla v24.4s, v0.4s, v5.4s[0] fmla v24.4s, v0.4s, v5.s[0]
fmla v26.4s, v0.4s, v5.4s[1] fmla v26.4s, v0.4s, v5.s[1]
fmla v28.4s, v0.4s, v5.4s[2] fmla v28.4s, v0.4s, v5.s[2]
fmla v30.4s, v0.4s, v5.4s[3] fmla v30.4s, v0.4s, v5.s[3]
ld1 {v6.4s}, [pB] ld1 {v6.4s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -403,14 +403,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x8_M2 .macro KERNEL4x8_M2
fmla v16.4s, v2.4s, v6.4s[0] fmla v16.4s, v2.4s, v6.s[0]
fmla v18.4s, v2.4s, v6.4s[1] fmla v18.4s, v2.4s, v6.s[1]
fmla v20.4s, v2.4s, v6.4s[2] fmla v20.4s, v2.4s, v6.s[2]
fmla v22.4s, v2.4s, v6.4s[3] fmla v22.4s, v2.4s, v6.s[3]
fmla v24.4s, v2.4s, v7.4s[0] fmla v24.4s, v2.4s, v7.s[0]
fmla v26.4s, v2.4s, v7.4s[1] fmla v26.4s, v2.4s, v7.s[1]
fmla v28.4s, v2.4s, v7.4s[2] fmla v28.4s, v2.4s, v7.s[2]
fmla v30.4s, v2.4s, v7.4s[3] fmla v30.4s, v2.4s, v7.s[3]
ld1 {v4.4s}, [pB] ld1 {v4.4s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -421,14 +421,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x8_E .macro KERNEL4x8_E
fmla v16.4s, v2.4s, v6.4s[0] fmla v16.4s, v2.4s, v6.s[0]
fmla v18.4s, v2.4s, v6.4s[1] fmla v18.4s, v2.4s, v6.s[1]
fmla v20.4s, v2.4s, v6.4s[2] fmla v20.4s, v2.4s, v6.s[2]
fmla v22.4s, v2.4s, v6.4s[3] fmla v22.4s, v2.4s, v6.s[3]
fmla v24.4s, v2.4s, v7.4s[0] fmla v24.4s, v2.4s, v7.s[0]
fmla v26.4s, v2.4s, v7.4s[1] fmla v26.4s, v2.4s, v7.s[1]
fmla v28.4s, v2.4s, v7.4s[2] fmla v28.4s, v2.4s, v7.s[2]
fmla v30.4s, v2.4s, v7.4s[3] fmla v30.4s, v2.4s, v7.s[3]
.endm .endm
.macro KERNEL4x8_SUB .macro KERNEL4x8_SUB
@ -439,14 +439,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.4s}, [pA] ld1 {v0.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v4.4s[0] fmla v16.4s, v0.4s, v4.s[0]
fmla v18.4s, v0.4s, v4.4s[1] fmla v18.4s, v0.4s, v4.s[1]
fmla v20.4s, v0.4s, v4.4s[2] fmla v20.4s, v0.4s, v4.s[2]
fmla v22.4s, v0.4s, v4.4s[3] fmla v22.4s, v0.4s, v4.s[3]
fmla v24.4s, v0.4s, v5.4s[0] fmla v24.4s, v0.4s, v5.s[0]
fmla v26.4s, v0.4s, v5.4s[1] fmla v26.4s, v0.4s, v5.s[1]
fmla v28.4s, v0.4s, v5.4s[2] fmla v28.4s, v0.4s, v5.s[2]
fmla v30.4s, v0.4s, v5.4s[3] fmla v30.4s, v0.4s, v5.s[3]
.endm .endm
.macro SAVE4x8 .macro SAVE4x8
@ -520,14 +520,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA] ld1 {v0.2s}, [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2s, v0.2s, v4.4s[0] fmla v16.2s, v0.2s, v4.s[0]
fmla v18.2s, v0.2s, v4.4s[1] fmla v18.2s, v0.2s, v4.s[1]
fmla v20.2s, v0.2s, v4.4s[2] fmla v20.2s, v0.2s, v4.s[2]
fmla v22.2s, v0.2s, v4.4s[3] fmla v22.2s, v0.2s, v4.s[3]
fmla v24.2s, v0.2s, v5.4s[0] fmla v24.2s, v0.2s, v5.s[0]
fmla v26.2s, v0.2s, v5.4s[1] fmla v26.2s, v0.2s, v5.s[1]
fmla v28.2s, v0.2s, v5.4s[2] fmla v28.2s, v0.2s, v5.s[2]
fmla v30.2s, v0.2s, v5.4s[3] fmla v30.2s, v0.2s, v5.s[3]
.endm .endm
.macro SAVE2x8 .macro SAVE2x8
@ -601,14 +601,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr s0, [pA] ldr s0, [pA]
add pA, pA, #4 add pA, pA, #4
fmla s16, s0, v4.4s[0] fmla s16, s0, v4.s[0]
fmla s18, s0, v4.4s[1] fmla s18, s0, v4.s[1]
fmla s20, s0, v4.4s[2] fmla s20, s0, v4.s[2]
fmla s22, s0, v4.4s[3] fmla s22, s0, v4.s[3]
fmla s24, s0, v5.4s[0] fmla s24, s0, v5.s[0]
fmla s26, s0, v5.4s[1] fmla s26, s0, v5.s[1]
fmla s28, s0, v5.4s[2] fmla s28, s0, v5.s[2]
fmla s30, s0, v5.4s[3] fmla s30, s0, v5.s[3]
.endm .endm
.macro SAVE1x8 .macro SAVE1x8
@ -682,14 +682,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmul v16.4s, v0.4s, v8.2s[0] fmul v16.4s, v0.4s, v8.s[0]
fmul v17.4s, v1.4s, v8.2s[0] fmul v17.4s, v1.4s, v8.s[0]
fmul v20.4s, v0.4s, v8.2s[1] fmul v20.4s, v0.4s, v8.s[1]
fmul v21.4s, v1.4s, v8.2s[1] fmul v21.4s, v1.4s, v8.s[1]
fmul v24.4s, v0.4s, v9.2s[0] fmul v24.4s, v0.4s, v9.s[0]
fmul v25.4s, v1.4s, v9.2s[0] fmul v25.4s, v1.4s, v9.s[0]
fmul v28.4s, v0.4s, v9.2s[1] fmul v28.4s, v0.4s, v9.s[1]
fmul v29.4s, v1.4s, v9.2s[1] fmul v29.4s, v1.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -700,14 +700,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_M1 .macro KERNEL8x4_M1
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
fmla v24.4s, v0.4s, v9.2s[0] fmla v24.4s, v0.4s, v9.s[0]
fmla v25.4s, v1.4s, v9.2s[0] fmla v25.4s, v1.4s, v9.s[0]
fmla v28.4s, v0.4s, v9.2s[1] fmla v28.4s, v0.4s, v9.s[1]
fmla v29.4s, v1.4s, v9.2s[1] fmla v29.4s, v1.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -718,14 +718,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_M2 .macro KERNEL8x4_M2
fmla v16.4s, v4.4s, v12.2s[0] fmla v16.4s, v4.4s, v12.s[0]
fmla v17.4s, v5.4s, v12.2s[0] fmla v17.4s, v5.4s, v12.s[0]
fmla v20.4s, v4.4s, v12.2s[1] fmla v20.4s, v4.4s, v12.s[1]
fmla v21.4s, v5.4s, v12.2s[1] fmla v21.4s, v5.4s, v12.s[1]
fmla v24.4s, v4.4s, v13.2s[0] fmla v24.4s, v4.4s, v13.s[0]
fmla v25.4s, v5.4s, v13.2s[0] fmla v25.4s, v5.4s, v13.s[0]
fmla v28.4s, v4.4s, v13.2s[1] fmla v28.4s, v4.4s, v13.s[1]
fmla v29.4s, v5.4s, v13.2s[1] fmla v29.4s, v5.4s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB] ld1 {v8.2s, v9.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -736,14 +736,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_E .macro KERNEL8x4_E
fmla v16.4s, v4.4s, v12.2s[0] fmla v16.4s, v4.4s, v12.s[0]
fmla v17.4s, v5.4s, v12.2s[0] fmla v17.4s, v5.4s, v12.s[0]
fmla v20.4s, v4.4s, v12.2s[1] fmla v20.4s, v4.4s, v12.s[1]
fmla v21.4s, v5.4s, v12.2s[1] fmla v21.4s, v5.4s, v12.s[1]
fmla v24.4s, v4.4s, v13.2s[0] fmla v24.4s, v4.4s, v13.s[0]
fmla v25.4s, v5.4s, v13.2s[0] fmla v25.4s, v5.4s, v13.s[0]
fmla v28.4s, v4.4s, v13.2s[1] fmla v28.4s, v4.4s, v13.s[1]
fmla v29.4s, v5.4s, v13.2s[1] fmla v29.4s, v5.4s, v13.s[1]
.endm .endm
.macro KERNEL8x4_SUB .macro KERNEL8x4_SUB
@ -754,14 +754,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
fmla v24.4s, v0.4s, v9.2s[0] fmla v24.4s, v0.4s, v9.s[0]
fmla v25.4s, v1.4s, v9.2s[0] fmla v25.4s, v1.4s, v9.s[0]
fmla v28.4s, v0.4s, v9.2s[1] fmla v28.4s, v0.4s, v9.s[1]
fmla v29.4s, v1.4s, v9.2s[1] fmla v29.4s, v1.4s, v9.s[1]
.endm .endm
.macro SAVE8x4 .macro SAVE8x4
@ -814,17 +814,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmul v16.2s, v0.2s, v8.2s[0] fmul v16.2s, v0.2s, v8.s[0]
fmul v29.2s, v1.2s, v9.2s[1] fmul v29.2s, v1.2s, v9.s[1]
fmul v20.2s, v0.2s, v8.2s[1] fmul v20.2s, v0.2s, v8.s[1]
fmul v25.2s, v1.2s, v9.2s[0] fmul v25.2s, v1.2s, v9.s[0]
fmul v24.2s, v0.2s, v9.2s[0] fmul v24.2s, v0.2s, v9.s[0]
fmul v21.2s, v1.2s, v8.2s[1] fmul v21.2s, v1.2s, v8.s[1]
fmul v28.2s, v0.2s, v9.2s[1] fmul v28.2s, v0.2s, v9.s[1]
fmul v17.2s, v1.2s, v8.2s[0] fmul v17.2s, v1.2s, v8.s[0]
ld1 {v12.2s, v13.2s}, [pB] ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -833,61 +833,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x4_M1 .macro KERNEL4x4_M1
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v29.2s, v1.2s, v9.2s[1] fmla v29.2s, v1.2s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] // For next round ld1 {v12.2s, v13.2s}, [pB] // For next round
add pB, pB, #16 add pB, pB, #16
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v25.2s, v1.2s, v9.2s[0] fmla v25.2s, v1.2s, v9.s[0]
ld1 {v4.2s, v5.2s}, [pA] // For next round ld1 {v4.2s, v5.2s}, [pA] // For next round
add pA, pA, #16 add pA, pA, #16
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
.endm .endm
.macro KERNEL4x4_M2 .macro KERNEL4x4_M2
fmla v16.2s, v4.2s, v12.2s[0] fmla v16.2s, v4.2s, v12.s[0]
fmla v29.2s, v5.2s, v13.2s[1] fmla v29.2s, v5.2s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB] // For next round ld1 {v8.2s, v9.2s}, [pB] // For next round
add pB, pB, #16 add pB, pB, #16
fmla v20.2s, v4.2s, v12.2s[1] fmla v20.2s, v4.2s, v12.s[1]
fmla v25.2s, v5.2s, v13.2s[0] fmla v25.2s, v5.2s, v13.s[0]
ld1 {v0.2s, v1.2s}, [pA] // For next round ld1 {v0.2s, v1.2s}, [pA] // For next round
add pA, pA, #16 add pA, pA, #16
fmla v24.2s, v4.2s, v13.2s[0] fmla v24.2s, v4.2s, v13.s[0]
fmla v21.2s, v5.2s, v12.2s[1] fmla v21.2s, v5.2s, v12.s[1]
prfm PLDL1KEEP, [pA, #512] prfm PLDL1KEEP, [pA, #512]
fmla v28.2s, v4.2s, v13.2s[1] fmla v28.2s, v4.2s, v13.s[1]
fmla v17.2s, v5.2s, v12.2s[0] fmla v17.2s, v5.2s, v12.s[0]
.endm .endm
.macro KERNEL4x4_E .macro KERNEL4x4_E
fmla v16.2s, v4.2s, v12.2s[0] fmla v16.2s, v4.2s, v12.s[0]
fmla v29.2s, v5.2s, v13.2s[1] fmla v29.2s, v5.2s, v13.s[1]
fmla v20.2s, v4.2s, v12.2s[1] fmla v20.2s, v4.2s, v12.s[1]
fmla v25.2s, v5.2s, v13.2s[0] fmla v25.2s, v5.2s, v13.s[0]
fmla v24.2s, v4.2s, v13.2s[0] fmla v24.2s, v4.2s, v13.s[0]
fmla v21.2s, v5.2s, v12.2s[1] fmla v21.2s, v5.2s, v12.s[1]
fmla v28.2s, v4.2s, v13.2s[1] fmla v28.2s, v4.2s, v13.s[1]
fmla v17.2s, v5.2s, v12.2s[0] fmla v17.2s, v5.2s, v12.s[0]
.endm .endm
.macro KERNEL4x4_SUB .macro KERNEL4x4_SUB
@ -896,17 +896,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v29.2s, v1.2s, v9.2s[1] fmla v29.2s, v1.2s, v9.s[1]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v25.2s, v1.2s, v9.2s[0] fmla v25.2s, v1.2s, v9.s[0]
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
@ -951,10 +951,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA] ld1 {v0.2s}, [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
@ -1034,11 +1034,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
.endm .endm
.macro SAVE8x2 .macro SAVE8x2
@ -1074,10 +1074,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
@ -1109,8 +1109,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA] ld1 {v0.2s}, [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
@ -1139,7 +1139,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr s0 , [pA] ldr s0 , [pA]
add pA, pA, #4 add pA, pA, #4
fmla v16.2s, v8.2s, v0.2s[0] fmla v16.2s, v8.2s, v0.s[0]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2
@ -1169,8 +1169,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
.endm .endm
.macro SAVE8x1 .macro SAVE8x1
@ -1196,8 +1196,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA , pA, #16 add pA , pA, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
.endm .endm
.macro SAVE4x1 .macro SAVE4x1
@ -1222,7 +1222,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA] ld1 {v0.2s}, [pA]
add pA , pA, #8 add pA , pA, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
.endm .endm
.macro SAVE2x1 .macro SAVE2x1

384
kernel/arm64/strmm_kernel_16x4.S Executable file → Normal file
View File

@ -161,25 +161,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v3.4s}, [pA] ld1 {v3.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmul v16.4s, v0.4s, v8.2s[0] fmul v16.4s, v0.4s, v8.s[0]
fmul v17.4s, v1.4s, v8.2s[0] fmul v17.4s, v1.4s, v8.s[0]
fmul v18.4s, v2.4s, v8.2s[0] fmul v18.4s, v2.4s, v8.s[0]
fmul v19.4s, v3.4s, v8.2s[0] fmul v19.4s, v3.4s, v8.s[0]
fmul v20.4s, v0.4s, v8.2s[1] fmul v20.4s, v0.4s, v8.s[1]
fmul v21.4s, v1.4s, v8.2s[1] fmul v21.4s, v1.4s, v8.s[1]
fmul v22.4s, v2.4s, v8.2s[1] fmul v22.4s, v2.4s, v8.s[1]
fmul v23.4s, v3.4s, v8.2s[1] fmul v23.4s, v3.4s, v8.s[1]
fmul v24.4s, v0.4s, v9.2s[0] fmul v24.4s, v0.4s, v9.s[0]
fmul v25.4s, v1.4s, v9.2s[0] fmul v25.4s, v1.4s, v9.s[0]
fmul v26.4s, v2.4s, v9.2s[0] fmul v26.4s, v2.4s, v9.s[0]
fmul v27.4s, v3.4s, v9.2s[0] fmul v27.4s, v3.4s, v9.s[0]
fmul v28.4s, v0.4s, v9.2s[1] fmul v28.4s, v0.4s, v9.s[1]
fmul v29.4s, v1.4s, v9.2s[1] fmul v29.4s, v1.4s, v9.s[1]
fmul v30.4s, v2.4s, v9.2s[1] fmul v30.4s, v2.4s, v9.s[1]
fmul v31.4s, v3.4s, v9.2s[1] fmul v31.4s, v3.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -194,25 +194,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL16x4_M1 .macro KERNEL16x4_M1
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v18.4s, v2.4s, v8.2s[0] fmla v18.4s, v2.4s, v8.s[0]
fmla v19.4s, v3.4s, v8.2s[0] fmla v19.4s, v3.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
fmla v22.4s, v2.4s, v8.2s[1] fmla v22.4s, v2.4s, v8.s[1]
fmla v23.4s, v3.4s, v8.2s[1] fmla v23.4s, v3.4s, v8.s[1]
fmla v24.4s, v0.4s, v9.2s[0] fmla v24.4s, v0.4s, v9.s[0]
fmla v25.4s, v1.4s, v9.2s[0] fmla v25.4s, v1.4s, v9.s[0]
fmla v26.4s, v2.4s, v9.2s[0] fmla v26.4s, v2.4s, v9.s[0]
fmla v27.4s, v3.4s, v9.2s[0] fmla v27.4s, v3.4s, v9.s[0]
fmla v28.4s, v0.4s, v9.2s[1] fmla v28.4s, v0.4s, v9.s[1]
fmla v29.4s, v1.4s, v9.2s[1] fmla v29.4s, v1.4s, v9.s[1]
fmla v30.4s, v2.4s, v9.2s[1] fmla v30.4s, v2.4s, v9.s[1]
fmla v31.4s, v3.4s, v9.2s[1] fmla v31.4s, v3.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -227,25 +227,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL16x4_M2 .macro KERNEL16x4_M2
fmla v16.4s, v4.4s, v12.2s[0] fmla v16.4s, v4.4s, v12.s[0]
fmla v17.4s, v5.4s, v12.2s[0] fmla v17.4s, v5.4s, v12.s[0]
fmla v18.4s, v6.4s, v12.2s[0] fmla v18.4s, v6.4s, v12.s[0]
fmla v19.4s, v7.4s, v12.2s[0] fmla v19.4s, v7.4s, v12.s[0]
fmla v20.4s, v4.4s, v12.2s[1] fmla v20.4s, v4.4s, v12.s[1]
fmla v21.4s, v5.4s, v12.2s[1] fmla v21.4s, v5.4s, v12.s[1]
fmla v22.4s, v6.4s, v12.2s[1] fmla v22.4s, v6.4s, v12.s[1]
fmla v23.4s, v7.4s, v12.2s[1] fmla v23.4s, v7.4s, v12.s[1]
fmla v24.4s, v4.4s, v13.2s[0] fmla v24.4s, v4.4s, v13.s[0]
fmla v25.4s, v5.4s, v13.2s[0] fmla v25.4s, v5.4s, v13.s[0]
fmla v26.4s, v6.4s, v13.2s[0] fmla v26.4s, v6.4s, v13.s[0]
fmla v27.4s, v7.4s, v13.2s[0] fmla v27.4s, v7.4s, v13.s[0]
fmla v28.4s, v4.4s, v13.2s[1] fmla v28.4s, v4.4s, v13.s[1]
fmla v29.4s, v5.4s, v13.2s[1] fmla v29.4s, v5.4s, v13.s[1]
fmla v30.4s, v6.4s, v13.2s[1] fmla v30.4s, v6.4s, v13.s[1]
fmla v31.4s, v7.4s, v13.2s[1] fmla v31.4s, v7.4s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB] ld1 {v8.2s, v9.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -260,25 +260,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL16x4_E .macro KERNEL16x4_E
fmla v16.4s, v4.4s, v12.2s[0] fmla v16.4s, v4.4s, v12.s[0]
fmla v17.4s, v5.4s, v12.2s[0] fmla v17.4s, v5.4s, v12.s[0]
fmla v18.4s, v6.4s, v12.2s[0] fmla v18.4s, v6.4s, v12.s[0]
fmla v19.4s, v7.4s, v12.2s[0] fmla v19.4s, v7.4s, v12.s[0]
fmla v20.4s, v4.4s, v12.2s[1] fmla v20.4s, v4.4s, v12.s[1]
fmla v21.4s, v5.4s, v12.2s[1] fmla v21.4s, v5.4s, v12.s[1]
fmla v22.4s, v6.4s, v12.2s[1] fmla v22.4s, v6.4s, v12.s[1]
fmla v23.4s, v7.4s, v12.2s[1] fmla v23.4s, v7.4s, v12.s[1]
fmla v24.4s, v4.4s, v13.2s[0] fmla v24.4s, v4.4s, v13.s[0]
fmla v25.4s, v5.4s, v13.2s[0] fmla v25.4s, v5.4s, v13.s[0]
fmla v26.4s, v6.4s, v13.2s[0] fmla v26.4s, v6.4s, v13.s[0]
fmla v27.4s, v7.4s, v13.2s[0] fmla v27.4s, v7.4s, v13.s[0]
fmla v28.4s, v4.4s, v13.2s[1] fmla v28.4s, v4.4s, v13.s[1]
fmla v29.4s, v5.4s, v13.2s[1] fmla v29.4s, v5.4s, v13.s[1]
fmla v30.4s, v6.4s, v13.2s[1] fmla v30.4s, v6.4s, v13.s[1]
fmla v31.4s, v7.4s, v13.2s[1] fmla v31.4s, v7.4s, v13.s[1]
.endm .endm
.macro KERNEL16x4_SUB .macro KERNEL16x4_SUB
@ -293,25 +293,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v3.4s}, [pA] ld1 {v3.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v18.4s, v2.4s, v8.2s[0] fmla v18.4s, v2.4s, v8.s[0]
fmla v19.4s, v3.4s, v8.2s[0] fmla v19.4s, v3.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
fmla v22.4s, v2.4s, v8.2s[1] fmla v22.4s, v2.4s, v8.s[1]
fmla v23.4s, v3.4s, v8.2s[1] fmla v23.4s, v3.4s, v8.s[1]
fmla v24.4s, v0.4s, v9.2s[0] fmla v24.4s, v0.4s, v9.s[0]
fmla v25.4s, v1.4s, v9.2s[0] fmla v25.4s, v1.4s, v9.s[0]
fmla v26.4s, v2.4s, v9.2s[0] fmla v26.4s, v2.4s, v9.s[0]
fmla v27.4s, v3.4s, v9.2s[0] fmla v27.4s, v3.4s, v9.s[0]
fmla v28.4s, v0.4s, v9.2s[1] fmla v28.4s, v0.4s, v9.s[1]
fmla v29.4s, v1.4s, v9.2s[1] fmla v29.4s, v1.4s, v9.s[1]
fmla v30.4s, v2.4s, v9.2s[1] fmla v30.4s, v2.4s, v9.s[1]
fmla v31.4s, v3.4s, v9.2s[1] fmla v31.4s, v3.4s, v9.s[1]
.endm .endm
.macro SAVE16x4 .macro SAVE16x4
@ -369,14 +369,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmul v16.4s, v0.4s, v8.2s[0] fmul v16.4s, v0.4s, v8.s[0]
fmul v17.4s, v1.4s, v8.2s[0] fmul v17.4s, v1.4s, v8.s[0]
fmul v20.4s, v0.4s, v8.2s[1] fmul v20.4s, v0.4s, v8.s[1]
fmul v21.4s, v1.4s, v8.2s[1] fmul v21.4s, v1.4s, v8.s[1]
fmul v24.4s, v0.4s, v9.2s[0] fmul v24.4s, v0.4s, v9.s[0]
fmul v25.4s, v1.4s, v9.2s[0] fmul v25.4s, v1.4s, v9.s[0]
fmul v28.4s, v0.4s, v9.2s[1] fmul v28.4s, v0.4s, v9.s[1]
fmul v29.4s, v1.4s, v9.2s[1] fmul v29.4s, v1.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -387,14 +387,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_M1 .macro KERNEL8x4_M1
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
fmla v24.4s, v0.4s, v9.2s[0] fmla v24.4s, v0.4s, v9.s[0]
fmla v25.4s, v1.4s, v9.2s[0] fmla v25.4s, v1.4s, v9.s[0]
fmla v28.4s, v0.4s, v9.2s[1] fmla v28.4s, v0.4s, v9.s[1]
fmla v29.4s, v1.4s, v9.2s[1] fmla v29.4s, v1.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -405,14 +405,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_M2 .macro KERNEL8x4_M2
fmla v16.4s, v4.4s, v12.2s[0] fmla v16.4s, v4.4s, v12.s[0]
fmla v17.4s, v5.4s, v12.2s[0] fmla v17.4s, v5.4s, v12.s[0]
fmla v20.4s, v4.4s, v12.2s[1] fmla v20.4s, v4.4s, v12.s[1]
fmla v21.4s, v5.4s, v12.2s[1] fmla v21.4s, v5.4s, v12.s[1]
fmla v24.4s, v4.4s, v13.2s[0] fmla v24.4s, v4.4s, v13.s[0]
fmla v25.4s, v5.4s, v13.2s[0] fmla v25.4s, v5.4s, v13.s[0]
fmla v28.4s, v4.4s, v13.2s[1] fmla v28.4s, v4.4s, v13.s[1]
fmla v29.4s, v5.4s, v13.2s[1] fmla v29.4s, v5.4s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB] ld1 {v8.2s, v9.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -423,14 +423,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_E .macro KERNEL8x4_E
fmla v16.4s, v4.4s, v12.2s[0] fmla v16.4s, v4.4s, v12.s[0]
fmla v17.4s, v5.4s, v12.2s[0] fmla v17.4s, v5.4s, v12.s[0]
fmla v20.4s, v4.4s, v12.2s[1] fmla v20.4s, v4.4s, v12.s[1]
fmla v21.4s, v5.4s, v12.2s[1] fmla v21.4s, v5.4s, v12.s[1]
fmla v24.4s, v4.4s, v13.2s[0] fmla v24.4s, v4.4s, v13.s[0]
fmla v25.4s, v5.4s, v13.2s[0] fmla v25.4s, v5.4s, v13.s[0]
fmla v28.4s, v4.4s, v13.2s[1] fmla v28.4s, v4.4s, v13.s[1]
fmla v29.4s, v5.4s, v13.2s[1] fmla v29.4s, v5.4s, v13.s[1]
.endm .endm
.macro KERNEL8x4_SUB .macro KERNEL8x4_SUB
@ -441,14 +441,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
fmla v24.4s, v0.4s, v9.2s[0] fmla v24.4s, v0.4s, v9.s[0]
fmla v25.4s, v1.4s, v9.2s[0] fmla v25.4s, v1.4s, v9.s[0]
fmla v28.4s, v0.4s, v9.2s[1] fmla v28.4s, v0.4s, v9.s[1]
fmla v29.4s, v1.4s, v9.2s[1] fmla v29.4s, v1.4s, v9.s[1]
.endm .endm
.macro SAVE8x4 .macro SAVE8x4
@ -496,17 +496,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmul v16.2s, v0.2s, v8.2s[0] fmul v16.2s, v0.2s, v8.s[0]
fmul v29.2s, v1.2s, v9.2s[1] fmul v29.2s, v1.2s, v9.s[1]
fmul v20.2s, v0.2s, v8.2s[1] fmul v20.2s, v0.2s, v8.s[1]
fmul v25.2s, v1.2s, v9.2s[0] fmul v25.2s, v1.2s, v9.s[0]
fmul v24.2s, v0.2s, v9.2s[0] fmul v24.2s, v0.2s, v9.s[0]
fmul v21.2s, v1.2s, v8.2s[1] fmul v21.2s, v1.2s, v8.s[1]
fmul v28.2s, v0.2s, v9.2s[1] fmul v28.2s, v0.2s, v9.s[1]
fmul v17.2s, v1.2s, v8.2s[0] fmul v17.2s, v1.2s, v8.s[0]
ld1 {v12.2s, v13.2s}, [pB] ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -515,61 +515,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x4_M1 .macro KERNEL4x4_M1
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v29.2s, v1.2s, v9.2s[1] fmla v29.2s, v1.2s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] // For next round ld1 {v12.2s, v13.2s}, [pB] // For next round
add pB, pB, #16 add pB, pB, #16
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v25.2s, v1.2s, v9.2s[0] fmla v25.2s, v1.2s, v9.s[0]
ld1 {v4.2s, v5.2s}, [pA] // For next round ld1 {v4.2s, v5.2s}, [pA] // For next round
add pA, pA, #16 add pA, pA, #16
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
.endm .endm
.macro KERNEL4x4_M2 .macro KERNEL4x4_M2
fmla v16.2s, v4.2s, v12.2s[0] fmla v16.2s, v4.2s, v12.s[0]
fmla v29.2s, v5.2s, v13.2s[1] fmla v29.2s, v5.2s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB] // For next round ld1 {v8.2s, v9.2s}, [pB] // For next round
add pB, pB, #16 add pB, pB, #16
fmla v20.2s, v4.2s, v12.2s[1] fmla v20.2s, v4.2s, v12.s[1]
fmla v25.2s, v5.2s, v13.2s[0] fmla v25.2s, v5.2s, v13.s[0]
ld1 {v0.2s, v1.2s}, [pA] // For next round ld1 {v0.2s, v1.2s}, [pA] // For next round
add pA, pA, #16 add pA, pA, #16
fmla v24.2s, v4.2s, v13.2s[0] fmla v24.2s, v4.2s, v13.s[0]
fmla v21.2s, v5.2s, v12.2s[1] fmla v21.2s, v5.2s, v12.s[1]
prfm PLDL1KEEP, [pA, #512] prfm PLDL1KEEP, [pA, #512]
fmla v28.2s, v4.2s, v13.2s[1] fmla v28.2s, v4.2s, v13.s[1]
fmla v17.2s, v5.2s, v12.2s[0] fmla v17.2s, v5.2s, v12.s[0]
.endm .endm
.macro KERNEL4x4_E .macro KERNEL4x4_E
fmla v16.2s, v4.2s, v12.2s[0] fmla v16.2s, v4.2s, v12.s[0]
fmla v29.2s, v5.2s, v13.2s[1] fmla v29.2s, v5.2s, v13.s[1]
fmla v20.2s, v4.2s, v12.2s[1] fmla v20.2s, v4.2s, v12.s[1]
fmla v25.2s, v5.2s, v13.2s[0] fmla v25.2s, v5.2s, v13.s[0]
fmla v24.2s, v4.2s, v13.2s[0] fmla v24.2s, v4.2s, v13.s[0]
fmla v21.2s, v5.2s, v12.2s[1] fmla v21.2s, v5.2s, v12.s[1]
fmla v28.2s, v4.2s, v13.2s[1] fmla v28.2s, v4.2s, v13.s[1]
fmla v17.2s, v5.2s, v12.2s[0] fmla v17.2s, v5.2s, v12.s[0]
.endm .endm
.macro KERNEL4x4_SUB .macro KERNEL4x4_SUB
@ -578,17 +578,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v29.2s, v1.2s, v9.2s[1] fmla v29.2s, v1.2s, v9.s[1]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v25.2s, v1.2s, v9.2s[0] fmla v25.2s, v1.2s, v9.s[0]
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
@ -633,10 +633,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA] ld1 {v0.2s}, [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
@ -718,15 +718,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v3.4s}, [pA] ld1 {v3.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v18.4s, v2.4s, v8.2s[0] fmla v18.4s, v2.4s, v8.s[0]
fmla v19.4s, v3.4s, v8.2s[0] fmla v19.4s, v3.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
fmla v22.4s, v2.4s, v8.2s[1] fmla v22.4s, v2.4s, v8.s[1]
fmla v23.4s, v3.4s, v8.2s[1] fmla v23.4s, v3.4s, v8.s[1]
.endm .endm
.macro SAVE16x2 .macro SAVE16x2
@ -764,11 +764,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
.endm .endm
.macro SAVE8x2 .macro SAVE8x2
@ -802,10 +802,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
@ -837,8 +837,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA] ld1 {v0.2s}, [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
@ -866,7 +866,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr s0 , [pA] ldr s0 , [pA]
add pA, pA, #4 add pA, pA, #4
fmla v16.2s, v8.2s, v0.2s[0] fmla v16.2s, v8.2s, v0.s[0]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2
@ -901,10 +901,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v3.4s}, [pA] ld1 {v3.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v18.4s, v2.4s, v8.2s[0] fmla v18.4s, v2.4s, v8.s[0]
fmla v19.4s, v3.4s, v8.2s[0] fmla v19.4s, v3.4s, v8.s[0]
.endm .endm
.macro SAVE16x1 .macro SAVE16x1
@ -934,8 +934,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
.endm .endm
.macro SAVE8x1 .macro SAVE8x1
@ -961,8 +961,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA , pA, #16 add pA , pA, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
.endm .endm
.macro SAVE4x1 .macro SAVE4x1
@ -987,7 +987,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA] ld1 {v0.2s}, [pA]
add pA , pA, #8 add pA , pA, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
.endm .endm
.macro SAVE2x1 .macro SAVE2x1

View File

@ -147,17 +147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmul v16.2s, v0.2s, v8.2s[0] fmul v16.2s, v0.2s, v8.s[0]
fmul v29.2s, v1.2s, v9.2s[1] fmul v29.2s, v1.2s, v9.s[1]
fmul v20.2s, v0.2s, v8.2s[1] fmul v20.2s, v0.2s, v8.s[1]
fmul v25.2s, v1.2s, v9.2s[0] fmul v25.2s, v1.2s, v9.s[0]
fmul v24.2s, v0.2s, v9.2s[0] fmul v24.2s, v0.2s, v9.s[0]
fmul v21.2s, v1.2s, v8.2s[1] fmul v21.2s, v1.2s, v8.s[1]
fmul v28.2s, v0.2s, v9.2s[1] fmul v28.2s, v0.2s, v9.s[1]
fmul v17.2s, v1.2s, v8.2s[0] fmul v17.2s, v1.2s, v8.s[0]
ld1 {v12.2s, v13.2s}, [pB] ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -166,61 +166,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x4_M1 .macro KERNEL4x4_M1
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v29.2s, v1.2s, v9.2s[1] fmla v29.2s, v1.2s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] // For next round ld1 {v12.2s, v13.2s}, [pB] // For next round
add pB, pB, #16 add pB, pB, #16
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v25.2s, v1.2s, v9.2s[0] fmla v25.2s, v1.2s, v9.s[0]
ld1 {v4.2s, v5.2s}, [pA] // For next round ld1 {v4.2s, v5.2s}, [pA] // For next round
add pA, pA, #16 add pA, pA, #16
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
.endm .endm
.macro KERNEL4x4_M2 .macro KERNEL4x4_M2
fmla v16.2s, v4.2s, v12.2s[0] fmla v16.2s, v4.2s, v12.s[0]
fmla v29.2s, v5.2s, v13.2s[1] fmla v29.2s, v5.2s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB] // For next round ld1 {v8.2s, v9.2s}, [pB] // For next round
add pB, pB, #16 add pB, pB, #16
fmla v20.2s, v4.2s, v12.2s[1] fmla v20.2s, v4.2s, v12.s[1]
fmla v25.2s, v5.2s, v13.2s[0] fmla v25.2s, v5.2s, v13.s[0]
ld1 {v0.2s, v1.2s}, [pA] // For next round ld1 {v0.2s, v1.2s}, [pA] // For next round
add pA, pA, #16 add pA, pA, #16
fmla v24.2s, v4.2s, v13.2s[0] fmla v24.2s, v4.2s, v13.s[0]
fmla v21.2s, v5.2s, v12.2s[1] fmla v21.2s, v5.2s, v12.s[1]
prfm PLDL1KEEP, [pA, #512] prfm PLDL1KEEP, [pA, #512]
fmla v28.2s, v4.2s, v13.2s[1] fmla v28.2s, v4.2s, v13.s[1]
fmla v17.2s, v5.2s, v12.2s[0] fmla v17.2s, v5.2s, v12.s[0]
.endm .endm
.macro KERNEL4x4_E .macro KERNEL4x4_E
fmla v16.2s, v4.2s, v12.2s[0] fmla v16.2s, v4.2s, v12.s[0]
fmla v29.2s, v5.2s, v13.2s[1] fmla v29.2s, v5.2s, v13.s[1]
fmla v20.2s, v4.2s, v12.2s[1] fmla v20.2s, v4.2s, v12.s[1]
fmla v25.2s, v5.2s, v13.2s[0] fmla v25.2s, v5.2s, v13.s[0]
fmla v24.2s, v4.2s, v13.2s[0] fmla v24.2s, v4.2s, v13.s[0]
fmla v21.2s, v5.2s, v12.2s[1] fmla v21.2s, v5.2s, v12.s[1]
fmla v28.2s, v4.2s, v13.2s[1] fmla v28.2s, v4.2s, v13.s[1]
fmla v17.2s, v5.2s, v12.2s[0] fmla v17.2s, v5.2s, v12.s[0]
.endm .endm
.macro KERNEL4x4_SUB .macro KERNEL4x4_SUB
@ -229,17 +229,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v29.2s, v1.2s, v9.2s[1] fmla v29.2s, v1.2s, v9.s[1]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v25.2s, v1.2s, v9.2s[0] fmla v25.2s, v1.2s, v9.s[0]
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
@ -280,10 +280,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA] ld1 {v0.2s}, [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
@ -353,10 +353,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
@ -386,8 +386,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA] ld1 {v0.2s}, [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
@ -414,7 +414,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr s0 , [pA] ldr s0 , [pA]
add pA, pA, #4 add pA, pA, #4
fmla v16.2s, v8.2s, v0.2s[0] fmla v16.2s, v8.2s, v0.s[0]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2
@ -440,8 +440,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA , pA, #16 add pA , pA, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
.endm .endm
.macro SAVE4x1 .macro SAVE4x1
@ -468,7 +468,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA] ld1 {v0.2s}, [pA]
add pA , pA, #8 add pA , pA, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
.endm .endm
.macro SAVE2x1 .macro SAVE2x1

472
kernel/arm64/strmm_kernel_8x8.S Executable file → Normal file
View File

@ -159,22 +159,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmul v16.4s, v0.4s, v4.4s[0] fmul v16.4s, v0.4s, v4.s[0]
fmul v17.4s, v1.4s, v4.4s[0] fmul v17.4s, v1.4s, v4.s[0]
fmul v18.4s, v0.4s, v4.4s[1] fmul v18.4s, v0.4s, v4.s[1]
fmul v19.4s, v1.4s, v4.4s[1] fmul v19.4s, v1.4s, v4.s[1]
fmul v20.4s, v0.4s, v4.4s[2] fmul v20.4s, v0.4s, v4.s[2]
fmul v21.4s, v1.4s, v4.4s[2] fmul v21.4s, v1.4s, v4.s[2]
fmul v22.4s, v0.4s, v4.4s[3] fmul v22.4s, v0.4s, v4.s[3]
fmul v23.4s, v1.4s, v4.4s[3] fmul v23.4s, v1.4s, v4.s[3]
fmul v24.4s, v0.4s, v5.4s[0] fmul v24.4s, v0.4s, v5.s[0]
fmul v25.4s, v1.4s, v5.4s[0] fmul v25.4s, v1.4s, v5.s[0]
fmul v26.4s, v0.4s, v5.4s[1] fmul v26.4s, v0.4s, v5.s[1]
fmul v27.4s, v1.4s, v5.4s[1] fmul v27.4s, v1.4s, v5.s[1]
fmul v28.4s, v0.4s, v5.4s[2] fmul v28.4s, v0.4s, v5.s[2]
fmul v29.4s, v1.4s, v5.4s[2] fmul v29.4s, v1.4s, v5.s[2]
fmul v30.4s, v0.4s, v5.4s[3] fmul v30.4s, v0.4s, v5.s[3]
fmul v31.4s, v1.4s, v5.4s[3] fmul v31.4s, v1.4s, v5.s[3]
ld1 {v6.4s}, [pB] ld1 {v6.4s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -187,22 +187,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x8_M1 .macro KERNEL8x8_M1
fmla v16.4s, v0.4s, v4.4s[0] fmla v16.4s, v0.4s, v4.s[0]
fmla v17.4s, v1.4s, v4.4s[0] fmla v17.4s, v1.4s, v4.s[0]
fmla v18.4s, v0.4s, v4.4s[1] fmla v18.4s, v0.4s, v4.s[1]
fmla v19.4s, v1.4s, v4.4s[1] fmla v19.4s, v1.4s, v4.s[1]
fmla v20.4s, v0.4s, v4.4s[2] fmla v20.4s, v0.4s, v4.s[2]
fmla v21.4s, v1.4s, v4.4s[2] fmla v21.4s, v1.4s, v4.s[2]
fmla v22.4s, v0.4s, v4.4s[3] fmla v22.4s, v0.4s, v4.s[3]
fmla v23.4s, v1.4s, v4.4s[3] fmla v23.4s, v1.4s, v4.s[3]
fmla v24.4s, v0.4s, v5.4s[0] fmla v24.4s, v0.4s, v5.s[0]
fmla v25.4s, v1.4s, v5.4s[0] fmla v25.4s, v1.4s, v5.s[0]
fmla v26.4s, v0.4s, v5.4s[1] fmla v26.4s, v0.4s, v5.s[1]
fmla v27.4s, v1.4s, v5.4s[1] fmla v27.4s, v1.4s, v5.s[1]
fmla v28.4s, v0.4s, v5.4s[2] fmla v28.4s, v0.4s, v5.s[2]
fmla v29.4s, v1.4s, v5.4s[2] fmla v29.4s, v1.4s, v5.s[2]
fmla v30.4s, v0.4s, v5.4s[3] fmla v30.4s, v0.4s, v5.s[3]
fmla v31.4s, v1.4s, v5.4s[3] fmla v31.4s, v1.4s, v5.s[3]
ld1 {v6.4s}, [pB] ld1 {v6.4s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -215,22 +215,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x8_M2 .macro KERNEL8x8_M2
fmla v16.4s, v2.4s, v6.4s[0] fmla v16.4s, v2.4s, v6.s[0]
fmla v17.4s, v3.4s, v6.4s[0] fmla v17.4s, v3.4s, v6.s[0]
fmla v18.4s, v2.4s, v6.4s[1] fmla v18.4s, v2.4s, v6.s[1]
fmla v19.4s, v3.4s, v6.4s[1] fmla v19.4s, v3.4s, v6.s[1]
fmla v20.4s, v2.4s, v6.4s[2] fmla v20.4s, v2.4s, v6.s[2]
fmla v21.4s, v3.4s, v6.4s[2] fmla v21.4s, v3.4s, v6.s[2]
fmla v22.4s, v2.4s, v6.4s[3] fmla v22.4s, v2.4s, v6.s[3]
fmla v23.4s, v3.4s, v6.4s[3] fmla v23.4s, v3.4s, v6.s[3]
fmla v24.4s, v2.4s, v7.4s[0] fmla v24.4s, v2.4s, v7.s[0]
fmla v25.4s, v3.4s, v7.4s[0] fmla v25.4s, v3.4s, v7.s[0]
fmla v26.4s, v2.4s, v7.4s[1] fmla v26.4s, v2.4s, v7.s[1]
fmla v27.4s, v3.4s, v7.4s[1] fmla v27.4s, v3.4s, v7.s[1]
fmla v28.4s, v2.4s, v7.4s[2] fmla v28.4s, v2.4s, v7.s[2]
fmla v29.4s, v3.4s, v7.4s[2] fmla v29.4s, v3.4s, v7.s[2]
fmla v30.4s, v2.4s, v7.4s[3] fmla v30.4s, v2.4s, v7.s[3]
fmla v31.4s, v3.4s, v7.4s[3] fmla v31.4s, v3.4s, v7.s[3]
ld1 {v4.4s}, [pB] ld1 {v4.4s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -243,22 +243,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x8_E .macro KERNEL8x8_E
fmla v16.4s, v2.4s, v6.4s[0] fmla v16.4s, v2.4s, v6.s[0]
fmla v17.4s, v3.4s, v6.4s[0] fmla v17.4s, v3.4s, v6.s[0]
fmla v18.4s, v2.4s, v6.4s[1] fmla v18.4s, v2.4s, v6.s[1]
fmla v19.4s, v3.4s, v6.4s[1] fmla v19.4s, v3.4s, v6.s[1]
fmla v20.4s, v2.4s, v6.4s[2] fmla v20.4s, v2.4s, v6.s[2]
fmla v21.4s, v3.4s, v6.4s[2] fmla v21.4s, v3.4s, v6.s[2]
fmla v22.4s, v2.4s, v6.4s[3] fmla v22.4s, v2.4s, v6.s[3]
fmla v23.4s, v3.4s, v6.4s[3] fmla v23.4s, v3.4s, v6.s[3]
fmla v24.4s, v2.4s, v7.4s[0] fmla v24.4s, v2.4s, v7.s[0]
fmla v25.4s, v3.4s, v7.4s[0] fmla v25.4s, v3.4s, v7.s[0]
fmla v26.4s, v2.4s, v7.4s[1] fmla v26.4s, v2.4s, v7.s[1]
fmla v27.4s, v3.4s, v7.4s[1] fmla v27.4s, v3.4s, v7.s[1]
fmla v28.4s, v2.4s, v7.4s[2] fmla v28.4s, v2.4s, v7.s[2]
fmla v29.4s, v3.4s, v7.4s[2] fmla v29.4s, v3.4s, v7.s[2]
fmla v30.4s, v2.4s, v7.4s[3] fmla v30.4s, v2.4s, v7.s[3]
fmla v31.4s, v3.4s, v7.4s[3] fmla v31.4s, v3.4s, v7.s[3]
.endm .endm
.macro KERNEL8x8_SUB .macro KERNEL8x8_SUB
@ -271,22 +271,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v4.4s[0] fmla v16.4s, v0.4s, v4.s[0]
fmla v17.4s, v1.4s, v4.4s[0] fmla v17.4s, v1.4s, v4.s[0]
fmla v18.4s, v0.4s, v4.4s[1] fmla v18.4s, v0.4s, v4.s[1]
fmla v19.4s, v1.4s, v4.4s[1] fmla v19.4s, v1.4s, v4.s[1]
fmla v20.4s, v0.4s, v4.4s[2] fmla v20.4s, v0.4s, v4.s[2]
fmla v21.4s, v1.4s, v4.4s[2] fmla v21.4s, v1.4s, v4.s[2]
fmla v22.4s, v0.4s, v4.4s[3] fmla v22.4s, v0.4s, v4.s[3]
fmla v23.4s, v1.4s, v4.4s[3] fmla v23.4s, v1.4s, v4.s[3]
fmla v24.4s, v0.4s, v5.4s[0] fmla v24.4s, v0.4s, v5.s[0]
fmla v25.4s, v1.4s, v5.4s[0] fmla v25.4s, v1.4s, v5.s[0]
fmla v26.4s, v0.4s, v5.4s[1] fmla v26.4s, v0.4s, v5.s[1]
fmla v27.4s, v1.4s, v5.4s[1] fmla v27.4s, v1.4s, v5.s[1]
fmla v28.4s, v0.4s, v5.4s[2] fmla v28.4s, v0.4s, v5.s[2]
fmla v29.4s, v1.4s, v5.4s[2] fmla v29.4s, v1.4s, v5.s[2]
fmla v30.4s, v0.4s, v5.4s[3] fmla v30.4s, v0.4s, v5.s[3]
fmla v31.4s, v1.4s, v5.4s[3] fmla v31.4s, v1.4s, v5.s[3]
.endm .endm
.macro SAVE8x8 .macro SAVE8x8
@ -361,14 +361,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.4s}, [pA] ld1 {v0.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmul v16.4s, v0.4s, v4.4s[0] fmul v16.4s, v0.4s, v4.s[0]
fmul v18.4s, v0.4s, v4.4s[1] fmul v18.4s, v0.4s, v4.s[1]
fmul v20.4s, v0.4s, v4.4s[2] fmul v20.4s, v0.4s, v4.s[2]
fmul v22.4s, v0.4s, v4.4s[3] fmul v22.4s, v0.4s, v4.s[3]
fmul v24.4s, v0.4s, v5.4s[0] fmul v24.4s, v0.4s, v5.s[0]
fmul v26.4s, v0.4s, v5.4s[1] fmul v26.4s, v0.4s, v5.s[1]
fmul v28.4s, v0.4s, v5.4s[2] fmul v28.4s, v0.4s, v5.s[2]
fmul v30.4s, v0.4s, v5.4s[3] fmul v30.4s, v0.4s, v5.s[3]
ld1 {v6.4s}, [pB] ld1 {v6.4s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -379,14 +379,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x8_M1 .macro KERNEL4x8_M1
fmla v16.4s, v0.4s, v4.4s[0] fmla v16.4s, v0.4s, v4.s[0]
fmla v18.4s, v0.4s, v4.4s[1] fmla v18.4s, v0.4s, v4.s[1]
fmla v20.4s, v0.4s, v4.4s[2] fmla v20.4s, v0.4s, v4.s[2]
fmla v22.4s, v0.4s, v4.4s[3] fmla v22.4s, v0.4s, v4.s[3]
fmla v24.4s, v0.4s, v5.4s[0] fmla v24.4s, v0.4s, v5.s[0]
fmla v26.4s, v0.4s, v5.4s[1] fmla v26.4s, v0.4s, v5.s[1]
fmla v28.4s, v0.4s, v5.4s[2] fmla v28.4s, v0.4s, v5.s[2]
fmla v30.4s, v0.4s, v5.4s[3] fmla v30.4s, v0.4s, v5.s[3]
ld1 {v6.4s}, [pB] ld1 {v6.4s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -397,14 +397,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x8_M2 .macro KERNEL4x8_M2
fmla v16.4s, v2.4s, v6.4s[0] fmla v16.4s, v2.4s, v6.s[0]
fmla v18.4s, v2.4s, v6.4s[1] fmla v18.4s, v2.4s, v6.s[1]
fmla v20.4s, v2.4s, v6.4s[2] fmla v20.4s, v2.4s, v6.s[2]
fmla v22.4s, v2.4s, v6.4s[3] fmla v22.4s, v2.4s, v6.s[3]
fmla v24.4s, v2.4s, v7.4s[0] fmla v24.4s, v2.4s, v7.s[0]
fmla v26.4s, v2.4s, v7.4s[1] fmla v26.4s, v2.4s, v7.s[1]
fmla v28.4s, v2.4s, v7.4s[2] fmla v28.4s, v2.4s, v7.s[2]
fmla v30.4s, v2.4s, v7.4s[3] fmla v30.4s, v2.4s, v7.s[3]
ld1 {v4.4s}, [pB] ld1 {v4.4s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -415,14 +415,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x8_E .macro KERNEL4x8_E
fmla v16.4s, v2.4s, v6.4s[0] fmla v16.4s, v2.4s, v6.s[0]
fmla v18.4s, v2.4s, v6.4s[1] fmla v18.4s, v2.4s, v6.s[1]
fmla v20.4s, v2.4s, v6.4s[2] fmla v20.4s, v2.4s, v6.s[2]
fmla v22.4s, v2.4s, v6.4s[3] fmla v22.4s, v2.4s, v6.s[3]
fmla v24.4s, v2.4s, v7.4s[0] fmla v24.4s, v2.4s, v7.s[0]
fmla v26.4s, v2.4s, v7.4s[1] fmla v26.4s, v2.4s, v7.s[1]
fmla v28.4s, v2.4s, v7.4s[2] fmla v28.4s, v2.4s, v7.s[2]
fmla v30.4s, v2.4s, v7.4s[3] fmla v30.4s, v2.4s, v7.s[3]
.endm .endm
.macro KERNEL4x8_SUB .macro KERNEL4x8_SUB
@ -433,14 +433,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.4s}, [pA] ld1 {v0.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v4.4s[0] fmla v16.4s, v0.4s, v4.s[0]
fmla v18.4s, v0.4s, v4.4s[1] fmla v18.4s, v0.4s, v4.s[1]
fmla v20.4s, v0.4s, v4.4s[2] fmla v20.4s, v0.4s, v4.s[2]
fmla v22.4s, v0.4s, v4.4s[3] fmla v22.4s, v0.4s, v4.s[3]
fmla v24.4s, v0.4s, v5.4s[0] fmla v24.4s, v0.4s, v5.s[0]
fmla v26.4s, v0.4s, v5.4s[1] fmla v26.4s, v0.4s, v5.s[1]
fmla v28.4s, v0.4s, v5.4s[2] fmla v28.4s, v0.4s, v5.s[2]
fmla v30.4s, v0.4s, v5.4s[3] fmla v30.4s, v0.4s, v5.s[3]
.endm .endm
.macro SAVE4x8 .macro SAVE4x8
@ -514,14 +514,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA] ld1 {v0.2s}, [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2s, v0.2s, v4.4s[0] fmla v16.2s, v0.2s, v4.s[0]
fmla v18.2s, v0.2s, v4.4s[1] fmla v18.2s, v0.2s, v4.s[1]
fmla v20.2s, v0.2s, v4.4s[2] fmla v20.2s, v0.2s, v4.s[2]
fmla v22.2s, v0.2s, v4.4s[3] fmla v22.2s, v0.2s, v4.s[3]
fmla v24.2s, v0.2s, v5.4s[0] fmla v24.2s, v0.2s, v5.s[0]
fmla v26.2s, v0.2s, v5.4s[1] fmla v26.2s, v0.2s, v5.s[1]
fmla v28.2s, v0.2s, v5.4s[2] fmla v28.2s, v0.2s, v5.s[2]
fmla v30.2s, v0.2s, v5.4s[3] fmla v30.2s, v0.2s, v5.s[3]
.endm .endm
.macro SAVE2x8 .macro SAVE2x8
@ -595,14 +595,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr s0, [pA] ldr s0, [pA]
add pA, pA, #4 add pA, pA, #4
fmla s16, s0, v4.4s[0] fmla s16, s0, v4.s[0]
fmla s18, s0, v4.4s[1] fmla s18, s0, v4.s[1]
fmla s20, s0, v4.4s[2] fmla s20, s0, v4.s[2]
fmla s22, s0, v4.4s[3] fmla s22, s0, v4.s[3]
fmla s24, s0, v5.4s[0] fmla s24, s0, v5.s[0]
fmla s26, s0, v5.4s[1] fmla s26, s0, v5.s[1]
fmla s28, s0, v5.4s[2] fmla s28, s0, v5.s[2]
fmla s30, s0, v5.4s[3] fmla s30, s0, v5.s[3]
.endm .endm
.macro SAVE1x8 .macro SAVE1x8
@ -676,14 +676,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmul v16.4s, v0.4s, v8.2s[0] fmul v16.4s, v0.4s, v8.s[0]
fmul v17.4s, v1.4s, v8.2s[0] fmul v17.4s, v1.4s, v8.s[0]
fmul v20.4s, v0.4s, v8.2s[1] fmul v20.4s, v0.4s, v8.s[1]
fmul v21.4s, v1.4s, v8.2s[1] fmul v21.4s, v1.4s, v8.s[1]
fmul v24.4s, v0.4s, v9.2s[0] fmul v24.4s, v0.4s, v9.s[0]
fmul v25.4s, v1.4s, v9.2s[0] fmul v25.4s, v1.4s, v9.s[0]
fmul v28.4s, v0.4s, v9.2s[1] fmul v28.4s, v0.4s, v9.s[1]
fmul v29.4s, v1.4s, v9.2s[1] fmul v29.4s, v1.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -694,14 +694,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_M1 .macro KERNEL8x4_M1
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
fmla v24.4s, v0.4s, v9.2s[0] fmla v24.4s, v0.4s, v9.s[0]
fmla v25.4s, v1.4s, v9.2s[0] fmla v25.4s, v1.4s, v9.s[0]
fmla v28.4s, v0.4s, v9.2s[1] fmla v28.4s, v0.4s, v9.s[1]
fmla v29.4s, v1.4s, v9.2s[1] fmla v29.4s, v1.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -712,14 +712,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_M2 .macro KERNEL8x4_M2
fmla v16.4s, v4.4s, v12.2s[0] fmla v16.4s, v4.4s, v12.s[0]
fmla v17.4s, v5.4s, v12.2s[0] fmla v17.4s, v5.4s, v12.s[0]
fmla v20.4s, v4.4s, v12.2s[1] fmla v20.4s, v4.4s, v12.s[1]
fmla v21.4s, v5.4s, v12.2s[1] fmla v21.4s, v5.4s, v12.s[1]
fmla v24.4s, v4.4s, v13.2s[0] fmla v24.4s, v4.4s, v13.s[0]
fmla v25.4s, v5.4s, v13.2s[0] fmla v25.4s, v5.4s, v13.s[0]
fmla v28.4s, v4.4s, v13.2s[1] fmla v28.4s, v4.4s, v13.s[1]
fmla v29.4s, v5.4s, v13.2s[1] fmla v29.4s, v5.4s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB] ld1 {v8.2s, v9.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -730,14 +730,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL8x4_E .macro KERNEL8x4_E
fmla v16.4s, v4.4s, v12.2s[0] fmla v16.4s, v4.4s, v12.s[0]
fmla v17.4s, v5.4s, v12.2s[0] fmla v17.4s, v5.4s, v12.s[0]
fmla v20.4s, v4.4s, v12.2s[1] fmla v20.4s, v4.4s, v12.s[1]
fmla v21.4s, v5.4s, v12.2s[1] fmla v21.4s, v5.4s, v12.s[1]
fmla v24.4s, v4.4s, v13.2s[0] fmla v24.4s, v4.4s, v13.s[0]
fmla v25.4s, v5.4s, v13.2s[0] fmla v25.4s, v5.4s, v13.s[0]
fmla v28.4s, v4.4s, v13.2s[1] fmla v28.4s, v4.4s, v13.s[1]
fmla v29.4s, v5.4s, v13.2s[1] fmla v29.4s, v5.4s, v13.s[1]
.endm .endm
.macro KERNEL8x4_SUB .macro KERNEL8x4_SUB
@ -748,14 +748,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
fmla v24.4s, v0.4s, v9.2s[0] fmla v24.4s, v0.4s, v9.s[0]
fmla v25.4s, v1.4s, v9.2s[0] fmla v25.4s, v1.4s, v9.s[0]
fmla v28.4s, v0.4s, v9.2s[1] fmla v28.4s, v0.4s, v9.s[1]
fmla v29.4s, v1.4s, v9.2s[1] fmla v29.4s, v1.4s, v9.s[1]
.endm .endm
.macro SAVE8x4 .macro SAVE8x4
@ -808,17 +808,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmul v16.2s, v0.2s, v8.2s[0] fmul v16.2s, v0.2s, v8.s[0]
fmul v29.2s, v1.2s, v9.2s[1] fmul v29.2s, v1.2s, v9.s[1]
fmul v20.2s, v0.2s, v8.2s[1] fmul v20.2s, v0.2s, v8.s[1]
fmul v25.2s, v1.2s, v9.2s[0] fmul v25.2s, v1.2s, v9.s[0]
fmul v24.2s, v0.2s, v9.2s[0] fmul v24.2s, v0.2s, v9.s[0]
fmul v21.2s, v1.2s, v8.2s[1] fmul v21.2s, v1.2s, v8.s[1]
fmul v28.2s, v0.2s, v9.2s[1] fmul v28.2s, v0.2s, v9.s[1]
fmul v17.2s, v1.2s, v8.2s[0] fmul v17.2s, v1.2s, v8.s[0]
ld1 {v12.2s, v13.2s}, [pB] ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16 add pB, pB, #16
@ -827,61 +827,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x4_M1 .macro KERNEL4x4_M1
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v29.2s, v1.2s, v9.2s[1] fmla v29.2s, v1.2s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] // For next round ld1 {v12.2s, v13.2s}, [pB] // For next round
add pB, pB, #16 add pB, pB, #16
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v25.2s, v1.2s, v9.2s[0] fmla v25.2s, v1.2s, v9.s[0]
ld1 {v4.2s, v5.2s}, [pA] // For next round ld1 {v4.2s, v5.2s}, [pA] // For next round
add pA, pA, #16 add pA, pA, #16
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
.endm .endm
.macro KERNEL4x4_M2 .macro KERNEL4x4_M2
fmla v16.2s, v4.2s, v12.2s[0] fmla v16.2s, v4.2s, v12.s[0]
fmla v29.2s, v5.2s, v13.2s[1] fmla v29.2s, v5.2s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB] // For next round ld1 {v8.2s, v9.2s}, [pB] // For next round
add pB, pB, #16 add pB, pB, #16
fmla v20.2s, v4.2s, v12.2s[1] fmla v20.2s, v4.2s, v12.s[1]
fmla v25.2s, v5.2s, v13.2s[0] fmla v25.2s, v5.2s, v13.s[0]
ld1 {v0.2s, v1.2s}, [pA] // For next round ld1 {v0.2s, v1.2s}, [pA] // For next round
add pA, pA, #16 add pA, pA, #16
fmla v24.2s, v4.2s, v13.2s[0] fmla v24.2s, v4.2s, v13.s[0]
fmla v21.2s, v5.2s, v12.2s[1] fmla v21.2s, v5.2s, v12.s[1]
prfm PLDL1KEEP, [pA, #512] prfm PLDL1KEEP, [pA, #512]
fmla v28.2s, v4.2s, v13.2s[1] fmla v28.2s, v4.2s, v13.s[1]
fmla v17.2s, v5.2s, v12.2s[0] fmla v17.2s, v5.2s, v12.s[0]
.endm .endm
.macro KERNEL4x4_E .macro KERNEL4x4_E
fmla v16.2s, v4.2s, v12.2s[0] fmla v16.2s, v4.2s, v12.s[0]
fmla v29.2s, v5.2s, v13.2s[1] fmla v29.2s, v5.2s, v13.s[1]
fmla v20.2s, v4.2s, v12.2s[1] fmla v20.2s, v4.2s, v12.s[1]
fmla v25.2s, v5.2s, v13.2s[0] fmla v25.2s, v5.2s, v13.s[0]
fmla v24.2s, v4.2s, v13.2s[0] fmla v24.2s, v4.2s, v13.s[0]
fmla v21.2s, v5.2s, v12.2s[1] fmla v21.2s, v5.2s, v12.s[1]
fmla v28.2s, v4.2s, v13.2s[1] fmla v28.2s, v4.2s, v13.s[1]
fmla v17.2s, v5.2s, v12.2s[0] fmla v17.2s, v5.2s, v12.s[0]
.endm .endm
.macro KERNEL4x4_SUB .macro KERNEL4x4_SUB
@ -890,17 +890,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v29.2s, v1.2s, v9.2s[1] fmla v29.2s, v1.2s, v9.s[1]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v25.2s, v1.2s, v9.2s[0] fmla v25.2s, v1.2s, v9.s[0]
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
@ -945,10 +945,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA] ld1 {v0.2s}, [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v24.2s, v0.2s, v9.2s[0] fmla v24.2s, v0.2s, v9.s[0]
fmla v28.2s, v0.2s, v9.2s[1] fmla v28.2s, v0.2s, v9.s[1]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
@ -1028,11 +1028,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
fmla v20.4s, v0.4s, v8.2s[1] fmla v20.4s, v0.4s, v8.s[1]
fmla v21.4s, v1.4s, v8.2s[1] fmla v21.4s, v1.4s, v8.s[1]
.endm .endm
.macro SAVE8x2 .macro SAVE8x2
@ -1068,10 +1068,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
fmla v21.2s, v1.2s, v8.2s[1] fmla v21.2s, v1.2s, v8.s[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
@ -1103,8 +1103,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA] ld1 {v0.2s}, [pA]
add pA, pA, #8 add pA, pA, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v20.2s, v0.2s, v8.2s[1] fmla v20.2s, v0.2s, v8.s[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
@ -1133,7 +1133,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr s0 , [pA] ldr s0 , [pA]
add pA, pA, #4 add pA, pA, #4
fmla v16.2s, v8.2s, v0.2s[0] fmla v16.2s, v8.2s, v0.s[0]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2
@ -1163,8 +1163,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v1.4s}, [pA] ld1 {v1.4s}, [pA]
add pA, pA, #16 add pA, pA, #16
fmla v16.4s, v0.4s, v8.2s[0] fmla v16.4s, v0.4s, v8.s[0]
fmla v17.4s, v1.4s, v8.2s[0] fmla v17.4s, v1.4s, v8.s[0]
.endm .endm
.macro SAVE8x1 .macro SAVE8x1
@ -1190,8 +1190,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s, v1.2s}, [pA] ld1 {v0.2s, v1.2s}, [pA]
add pA , pA, #16 add pA , pA, #16
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
fmla v17.2s, v1.2s, v8.2s[0] fmla v17.2s, v1.2s, v8.s[0]
.endm .endm
.macro SAVE4x1 .macro SAVE4x1
@ -1216,7 +1216,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v0.2s}, [pA] ld1 {v0.2s}, [pA]
add pA , pA, #8 add pA , pA, #8
fmla v16.2s, v0.2s, v8.2s[0] fmla v16.2s, v0.2s, v8.s[0]
.endm .endm
.macro SAVE2x1 .macro SAVE2x1

View File

@ -182,93 +182,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.2d, v3.2d}, [pA] ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmul v16.2d, v0.2d, v8.2d[0] fmul v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.2d[0] OP_ii v16.2d, v1.2d, v9.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v17.16b, v17.16b, v17.16b eor v17.16b, v17.16b, v17.16b
fmls v17.2d, v0.2d, v9.2d[0] fmls v17.2d, v0.2d, v9.d[0]
#else #else
fmul v17.2d, v0.2d, v9.2d[0] fmul v17.2d, v0.2d, v9.d[0]
#endif #endif
OP_ir v17.2d, v1.2d, v8.2d[0] OP_ir v17.2d, v1.2d, v8.d[0]
fmul v18.2d, v2.2d, v8.2d[0] fmul v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.2d[0] OP_ii v18.2d, v3.2d, v9.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b eor v19.16b, v19.16b, v19.16b
fmls v19.2d, v2.2d, v9.2d[0] fmls v19.2d, v2.2d, v9.d[0]
#else #else
fmul v19.2d, v2.2d, v9.2d[0] fmul v19.2d, v2.2d, v9.d[0]
#endif #endif
OP_ir v19.2d, v3.2d, v8.2d[0] OP_ir v19.2d, v3.2d, v8.d[0]
fmul v20.2d, v0.2d, v8.2d[1] fmul v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.2d[1] OP_ii v20.2d, v1.2d, v9.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v21.16b, v21.16b, v21.16b eor v21.16b, v21.16b, v21.16b
fmls v21.2d, v0.2d, v9.2d[1] fmls v21.2d, v0.2d, v9.d[1]
#else #else
fmul v21.2d, v0.2d, v9.2d[1] fmul v21.2d, v0.2d, v9.d[1]
#endif #endif
OP_ir v21.2d, v1.2d, v8.2d[1] OP_ir v21.2d, v1.2d, v8.d[1]
fmul v22.2d, v2.2d, v8.2d[1] fmul v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.2d[1] OP_ii v22.2d, v3.2d, v9.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v23.16b, v23.16b, v23.16b eor v23.16b, v23.16b, v23.16b
fmls v23.2d, v2.2d, v9.2d[1] fmls v23.2d, v2.2d, v9.d[1]
#else #else
fmul v23.2d, v2.2d, v9.2d[1] fmul v23.2d, v2.2d, v9.d[1]
#endif #endif
OP_ir v23.2d, v3.2d, v8.2d[1] OP_ir v23.2d, v3.2d, v8.d[1]
fmul v24.2d, v0.2d, v10.2d[0] fmul v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.2d[0] OP_ii v24.2d, v1.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v25.16b, v25.16b, v25.16b eor v25.16b, v25.16b, v25.16b
fmls v25.2d, v0.2d, v11.2d[0] fmls v25.2d, v0.2d, v11.d[0]
#else #else
fmul v25.2d, v0.2d, v11.2d[0] fmul v25.2d, v0.2d, v11.d[0]
#endif #endif
OP_ir v25.2d, v1.2d, v10.2d[0] OP_ir v25.2d, v1.2d, v10.d[0]
fmul v26.2d, v2.2d, v10.2d[0] fmul v26.2d, v2.2d, v10.d[0]
OP_ii v26.2d, v3.2d, v11.2d[0] OP_ii v26.2d, v3.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v27.16b, v27.16b, v27.16b eor v27.16b, v27.16b, v27.16b
fmls v27.2d, v2.2d, v11.2d[0] fmls v27.2d, v2.2d, v11.d[0]
#else #else
fmul v27.2d, v2.2d, v11.2d[0] fmul v27.2d, v2.2d, v11.d[0]
#endif #endif
OP_ir v27.2d, v3.2d, v10.2d[0] OP_ir v27.2d, v3.2d, v10.d[0]
fmul v28.2d, v0.2d, v10.2d[1] fmul v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.2d[1] OP_ii v28.2d, v1.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v29.16b, v29.16b, v29.16b eor v29.16b, v29.16b, v29.16b
fmls v29.2d, v0.2d, v11.2d[1] fmls v29.2d, v0.2d, v11.d[1]
#else #else
fmul v29.2d, v0.2d, v11.2d[1] fmul v29.2d, v0.2d, v11.d[1]
#endif #endif
OP_ir v29.2d, v1.2d, v10.2d[1] OP_ir v29.2d, v1.2d, v10.d[1]
fmul v30.2d, v2.2d, v10.2d[1] fmul v30.2d, v2.2d, v10.d[1]
OP_ii v30.2d, v3.2d, v11.2d[1] OP_ii v30.2d, v3.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v31.16b, v31.16b, v31.16b eor v31.16b, v31.16b, v31.16b
fmls v31.2d, v2.2d, v11.2d[1] fmls v31.2d, v2.2d, v11.d[1]
#else #else
fmul v31.2d, v2.2d, v11.2d[1] fmul v31.2d, v2.2d, v11.d[1]
#endif #endif
OP_ir v31.2d, v3.2d, v10.2d[1] OP_ir v31.2d, v3.2d, v10.d[1]
ld2 {v12.2d, v13.2d}, [pB] ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32 add pB, pB, #32
@ -281,161 +281,161 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x4_M1 .macro KERNEL4x4_M1
OP_rr v16.2d, v0.2d, v8.2d[0] OP_rr v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.2d[0] OP_ii v16.2d, v1.2d, v9.d[0]
OP_ri v17.2d, v0.2d, v9.2d[0] OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.2d[0] OP_ir v17.2d, v1.2d, v8.d[0]
ld2 {v12.2d, v13.2d}, [pB] // For next round ld2 {v12.2d, v13.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
OP_rr v18.2d, v2.2d, v8.2d[0] OP_rr v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.2d[0] OP_ii v18.2d, v3.2d, v9.d[0]
OP_ri v19.2d, v2.2d, v9.2d[0] OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.2d[0] OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v14.2d, v15.2d}, [pB] // For next round ld2 {v14.2d, v15.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
OP_rr v20.2d, v0.2d, v8.2d[1] OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.2d[1] OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.2d[1] OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.2d[1] OP_ir v21.2d, v1.2d, v8.d[1]
ld2 {v4.2d, v5.2d} , [pA] // For next round ld2 {v4.2d, v5.2d} , [pA] // For next round
add pA, pA, #32 add pA, pA, #32
OP_rr v22.2d, v2.2d, v8.2d[1] OP_rr v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.2d[1] OP_ii v22.2d, v3.2d, v9.d[1]
OP_ri v23.2d, v2.2d, v9.2d[1] OP_ri v23.2d, v2.2d, v9.d[1]
OP_ir v23.2d, v3.2d, v8.2d[1] OP_ir v23.2d, v3.2d, v8.d[1]
ld2 {v6.2d, v7.2d} , [pA] // For next round ld2 {v6.2d, v7.2d} , [pA] // For next round
add pA, pA, #32 add pA, pA, #32
OP_rr v24.2d, v0.2d, v10.2d[0] OP_rr v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.2d[0] OP_ii v24.2d, v1.2d, v11.d[0]
OP_ri v25.2d, v0.2d, v11.2d[0] OP_ri v25.2d, v0.2d, v11.d[0]
OP_ir v25.2d, v1.2d, v10.2d[0] OP_ir v25.2d, v1.2d, v10.d[0]
prfm PLDL1KEEP, [pA, #512] prfm PLDL1KEEP, [pA, #512]
OP_rr v26.2d, v2.2d, v10.2d[0] OP_rr v26.2d, v2.2d, v10.d[0]
OP_ii v26.2d, v3.2d, v11.2d[0] OP_ii v26.2d, v3.2d, v11.d[0]
OP_ri v27.2d, v2.2d, v11.2d[0] OP_ri v27.2d, v2.2d, v11.d[0]
OP_ir v27.2d, v3.2d, v10.2d[0] OP_ir v27.2d, v3.2d, v10.d[0]
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
OP_rr v28.2d, v0.2d, v10.2d[1] OP_rr v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.2d[1] OP_ii v28.2d, v1.2d, v11.d[1]
OP_ri v29.2d, v0.2d, v11.2d[1] OP_ri v29.2d, v0.2d, v11.d[1]
OP_ir v29.2d, v1.2d, v10.2d[1] OP_ir v29.2d, v1.2d, v10.d[1]
OP_rr v30.2d, v2.2d, v10.2d[1] OP_rr v30.2d, v2.2d, v10.d[1]
OP_ii v30.2d, v3.2d, v11.2d[1] OP_ii v30.2d, v3.2d, v11.d[1]
OP_ri v31.2d, v2.2d, v11.2d[1] OP_ri v31.2d, v2.2d, v11.d[1]
OP_ir v31.2d, v3.2d, v10.2d[1] OP_ir v31.2d, v3.2d, v10.d[1]
.endm .endm
.macro KERNEL4x4_M2 .macro KERNEL4x4_M2
OP_rr v16.2d, v4.2d, v12.2d[0] OP_rr v16.2d, v4.2d, v12.d[0]
OP_ii v16.2d, v5.2d, v13.2d[0] OP_ii v16.2d, v5.2d, v13.d[0]
OP_ri v17.2d, v4.2d, v13.2d[0] OP_ri v17.2d, v4.2d, v13.d[0]
OP_ir v17.2d, v5.2d, v12.2d[0] OP_ir v17.2d, v5.2d, v12.d[0]
ld2 {v8.2d, v9.2d}, [pB] // For next round ld2 {v8.2d, v9.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
OP_rr v18.2d, v6.2d, v12.2d[0] OP_rr v18.2d, v6.2d, v12.d[0]
OP_ii v18.2d, v7.2d, v13.2d[0] OP_ii v18.2d, v7.2d, v13.d[0]
OP_ri v19.2d, v6.2d, v13.2d[0] OP_ri v19.2d, v6.2d, v13.d[0]
OP_ir v19.2d, v7.2d, v12.2d[0] OP_ir v19.2d, v7.2d, v12.d[0]
ld2 {v10.2d, v11.2d}, [pB] // For next round ld2 {v10.2d, v11.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
OP_rr v20.2d, v4.2d, v12.2d[1] OP_rr v20.2d, v4.2d, v12.d[1]
OP_ii v20.2d, v5.2d, v13.2d[1] OP_ii v20.2d, v5.2d, v13.d[1]
OP_ri v21.2d, v4.2d, v13.2d[1] OP_ri v21.2d, v4.2d, v13.d[1]
OP_ir v21.2d, v5.2d, v12.2d[1] OP_ir v21.2d, v5.2d, v12.d[1]
ld2 {v0.2d, v1.2d}, [pA] // For next round ld2 {v0.2d, v1.2d}, [pA] // For next round
add pA, pA, #32 add pA, pA, #32
OP_rr v22.2d, v6.2d, v12.2d[1] OP_rr v22.2d, v6.2d, v12.d[1]
OP_ii v22.2d, v7.2d, v13.2d[1] OP_ii v22.2d, v7.2d, v13.d[1]
OP_ri v23.2d, v6.2d, v13.2d[1] OP_ri v23.2d, v6.2d, v13.d[1]
OP_ir v23.2d, v7.2d, v12.2d[1] OP_ir v23.2d, v7.2d, v12.d[1]
ld2 {v2.2d, v3.2d}, [pA] // For next round ld2 {v2.2d, v3.2d}, [pA] // For next round
add pA, pA, #32 add pA, pA, #32
OP_rr v24.2d, v4.2d, v14.2d[0] OP_rr v24.2d, v4.2d, v14.d[0]
OP_ii v24.2d, v5.2d, v15.2d[0] OP_ii v24.2d, v5.2d, v15.d[0]
OP_ri v25.2d, v4.2d, v15.2d[0] OP_ri v25.2d, v4.2d, v15.d[0]
OP_ir v25.2d, v5.2d, v14.2d[0] OP_ir v25.2d, v5.2d, v14.d[0]
prfm PLDL1KEEP, [pA, #512] prfm PLDL1KEEP, [pA, #512]
OP_rr v26.2d, v6.2d, v14.2d[0] OP_rr v26.2d, v6.2d, v14.d[0]
OP_ii v26.2d, v7.2d, v15.2d[0] OP_ii v26.2d, v7.2d, v15.d[0]
OP_ri v27.2d, v6.2d, v15.2d[0] OP_ri v27.2d, v6.2d, v15.d[0]
OP_ir v27.2d, v7.2d, v14.2d[0] OP_ir v27.2d, v7.2d, v14.d[0]
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
OP_rr v28.2d, v4.2d, v14.2d[1] OP_rr v28.2d, v4.2d, v14.d[1]
OP_ii v28.2d, v5.2d, v15.2d[1] OP_ii v28.2d, v5.2d, v15.d[1]
OP_ri v29.2d, v4.2d, v15.2d[1] OP_ri v29.2d, v4.2d, v15.d[1]
OP_ir v29.2d, v5.2d, v14.2d[1] OP_ir v29.2d, v5.2d, v14.d[1]
OP_rr v30.2d, v6.2d, v14.2d[1] OP_rr v30.2d, v6.2d, v14.d[1]
OP_ii v30.2d, v7.2d, v15.2d[1] OP_ii v30.2d, v7.2d, v15.d[1]
OP_ri v31.2d, v6.2d, v15.2d[1] OP_ri v31.2d, v6.2d, v15.d[1]
OP_ir v31.2d, v7.2d, v14.2d[1] OP_ir v31.2d, v7.2d, v14.d[1]
.endm .endm
.macro KERNEL4x4_E .macro KERNEL4x4_E
OP_rr v16.2d, v4.2d, v12.2d[0] OP_rr v16.2d, v4.2d, v12.d[0]
OP_ii v16.2d, v5.2d, v13.2d[0] OP_ii v16.2d, v5.2d, v13.d[0]
OP_ri v17.2d, v4.2d, v13.2d[0] OP_ri v17.2d, v4.2d, v13.d[0]
OP_ir v17.2d, v5.2d, v12.2d[0] OP_ir v17.2d, v5.2d, v12.d[0]
OP_rr v18.2d, v6.2d, v12.2d[0] OP_rr v18.2d, v6.2d, v12.d[0]
OP_ii v18.2d, v7.2d, v13.2d[0] OP_ii v18.2d, v7.2d, v13.d[0]
OP_ri v19.2d, v6.2d, v13.2d[0] OP_ri v19.2d, v6.2d, v13.d[0]
OP_ir v19.2d, v7.2d, v12.2d[0] OP_ir v19.2d, v7.2d, v12.d[0]
OP_rr v20.2d, v4.2d, v12.2d[1] OP_rr v20.2d, v4.2d, v12.d[1]
OP_ii v20.2d, v5.2d, v13.2d[1] OP_ii v20.2d, v5.2d, v13.d[1]
OP_ri v21.2d, v4.2d, v13.2d[1] OP_ri v21.2d, v4.2d, v13.d[1]
OP_ir v21.2d, v5.2d, v12.2d[1] OP_ir v21.2d, v5.2d, v12.d[1]
OP_rr v22.2d, v6.2d, v12.2d[1] OP_rr v22.2d, v6.2d, v12.d[1]
OP_ii v22.2d, v7.2d, v13.2d[1] OP_ii v22.2d, v7.2d, v13.d[1]
OP_ri v23.2d, v6.2d, v13.2d[1] OP_ri v23.2d, v6.2d, v13.d[1]
OP_ir v23.2d, v7.2d, v12.2d[1] OP_ir v23.2d, v7.2d, v12.d[1]
OP_rr v24.2d, v4.2d, v14.2d[0] OP_rr v24.2d, v4.2d, v14.d[0]
OP_ii v24.2d, v5.2d, v15.2d[0] OP_ii v24.2d, v5.2d, v15.d[0]
OP_ri v25.2d, v4.2d, v15.2d[0] OP_ri v25.2d, v4.2d, v15.d[0]
OP_ir v25.2d, v5.2d, v14.2d[0] OP_ir v25.2d, v5.2d, v14.d[0]
OP_rr v26.2d, v6.2d, v14.2d[0] OP_rr v26.2d, v6.2d, v14.d[0]
OP_ii v26.2d, v7.2d, v15.2d[0] OP_ii v26.2d, v7.2d, v15.d[0]
OP_ri v27.2d, v6.2d, v15.2d[0] OP_ri v27.2d, v6.2d, v15.d[0]
OP_ir v27.2d, v7.2d, v14.2d[0] OP_ir v27.2d, v7.2d, v14.d[0]
OP_rr v28.2d, v4.2d, v14.2d[1] OP_rr v28.2d, v4.2d, v14.d[1]
OP_ii v28.2d, v5.2d, v15.2d[1] OP_ii v28.2d, v5.2d, v15.d[1]
OP_ri v29.2d, v4.2d, v15.2d[1] OP_ri v29.2d, v4.2d, v15.d[1]
OP_ir v29.2d, v5.2d, v14.2d[1] OP_ir v29.2d, v5.2d, v14.d[1]
OP_rr v30.2d, v6.2d, v14.2d[1] OP_rr v30.2d, v6.2d, v14.d[1]
OP_ii v30.2d, v7.2d, v15.2d[1] OP_ii v30.2d, v7.2d, v15.d[1]
OP_ri v31.2d, v6.2d, v15.2d[1] OP_ri v31.2d, v6.2d, v15.d[1]
OP_ir v31.2d, v7.2d, v14.2d[1] OP_ir v31.2d, v7.2d, v14.d[1]
.endm .endm
.macro KERNEL4x4_SUB .macro KERNEL4x4_SUB
@ -448,45 +448,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.2d, v3.2d}, [pA] ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.2d, v0.2d, v8.2d[0] OP_rr v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.2d[0] OP_ii v16.2d, v1.2d, v9.d[0]
OP_ri v17.2d, v0.2d, v9.2d[0] OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.2d[0] OP_ir v17.2d, v1.2d, v8.d[0]
OP_rr v18.2d, v2.2d, v8.2d[0] OP_rr v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.2d[0] OP_ii v18.2d, v3.2d, v9.d[0]
OP_ri v19.2d, v2.2d, v9.2d[0] OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.2d[0] OP_ir v19.2d, v3.2d, v8.d[0]
OP_rr v20.2d, v0.2d, v8.2d[1] OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.2d[1] OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.2d[1] OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.2d[1] OP_ir v21.2d, v1.2d, v8.d[1]
OP_rr v22.2d, v2.2d, v8.2d[1] OP_rr v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.2d[1] OP_ii v22.2d, v3.2d, v9.d[1]
OP_ri v23.2d, v2.2d, v9.2d[1] OP_ri v23.2d, v2.2d, v9.d[1]
OP_ir v23.2d, v3.2d, v8.2d[1] OP_ir v23.2d, v3.2d, v8.d[1]
OP_rr v24.2d, v0.2d, v10.2d[0] OP_rr v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.2d[0] OP_ii v24.2d, v1.2d, v11.d[0]
OP_ri v25.2d, v0.2d, v11.2d[0] OP_ri v25.2d, v0.2d, v11.d[0]
OP_ir v25.2d, v1.2d, v10.2d[0] OP_ir v25.2d, v1.2d, v10.d[0]
OP_rr v26.2d, v2.2d, v10.2d[0] OP_rr v26.2d, v2.2d, v10.d[0]
OP_ii v26.2d, v3.2d, v11.2d[0] OP_ii v26.2d, v3.2d, v11.d[0]
OP_ri v27.2d, v2.2d, v11.2d[0] OP_ri v27.2d, v2.2d, v11.d[0]
OP_ir v27.2d, v3.2d, v10.2d[0] OP_ir v27.2d, v3.2d, v10.d[0]
OP_rr v28.2d, v0.2d, v10.2d[1] OP_rr v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.2d[1] OP_ii v28.2d, v1.2d, v11.d[1]
OP_ri v29.2d, v0.2d, v11.2d[1] OP_ri v29.2d, v0.2d, v11.d[1]
OP_ir v29.2d, v1.2d, v10.2d[1] OP_ir v29.2d, v1.2d, v10.d[1]
OP_rr v30.2d, v2.2d, v10.2d[1] OP_rr v30.2d, v2.2d, v10.d[1]
OP_ii v30.2d, v3.2d, v11.2d[1] OP_ii v30.2d, v3.2d, v11.d[1]
OP_ri v31.2d, v2.2d, v11.2d[1] OP_ri v31.2d, v2.2d, v11.d[1]
OP_ir v31.2d, v3.2d, v10.2d[1] OP_ir v31.2d, v3.2d, v10.d[1]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
@ -582,25 +582,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2d, v1.2d}, [pA] ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.2d, v0.2d, v8.2d[0] OP_rr v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.2d[0] OP_ii v16.2d, v1.2d, v9.d[0]
OP_ri v17.2d, v0.2d, v9.2d[0] OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.2d[0] OP_ir v17.2d, v1.2d, v8.d[0]
OP_rr v20.2d, v0.2d, v8.2d[1] OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.2d[1] OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.2d[1] OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.2d[1] OP_ir v21.2d, v1.2d, v8.d[1]
OP_rr v24.2d, v0.2d, v10.2d[0] OP_rr v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.2d[0] OP_ii v24.2d, v1.2d, v11.d[0]
OP_ri v25.2d, v0.2d, v11.2d[0] OP_ri v25.2d, v0.2d, v11.d[0]
OP_ir v25.2d, v1.2d, v10.2d[0] OP_ir v25.2d, v1.2d, v10.d[0]
OP_rr v28.2d, v0.2d, v10.2d[1] OP_rr v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.2d[1] OP_ii v28.2d, v1.2d, v11.d[1]
OP_ri v29.2d, v0.2d, v11.2d[1] OP_ri v29.2d, v0.2d, v11.d[1]
OP_ir v29.2d, v1.2d, v10.2d[1] OP_ir v29.2d, v1.2d, v10.d[1]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
@ -669,25 +669,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.d, v1.d}[0], [pA] ld2 {v0.d, v1.d}[0], [pA]
add pA, pA, #16 add pA, pA, #16
OP_rr d16, d0, v8.2d[0] OP_rr d16, d0, v8.d[0]
OP_ii d16, d1, v9.2d[0] OP_ii d16, d1, v9.d[0]
OP_ri d17, d0, v9.2d[0] OP_ri d17, d0, v9.d[0]
OP_ir d17, d1, v8.2d[0] OP_ir d17, d1, v8.d[0]
OP_rr d20, d0, v8.2d[1] OP_rr d20, d0, v8.d[1]
OP_ii d20, d1, v9.2d[1] OP_ii d20, d1, v9.d[1]
OP_ri d21, d0, v9.2d[1] OP_ri d21, d0, v9.d[1]
OP_ir d21, d1, v8.2d[1] OP_ir d21, d1, v8.d[1]
OP_rr d24, d0, v10.2d[0] OP_rr d24, d0, v10.d[0]
OP_ii d24, d1, v11.2d[0] OP_ii d24, d1, v11.d[0]
OP_ri d25, d0, v11.2d[0] OP_ri d25, d0, v11.d[0]
OP_ir d25, d1, v10.2d[0] OP_ir d25, d1, v10.d[0]
OP_rr d28, d0, v10.2d[1] OP_rr d28, d0, v10.d[1]
OP_ii d28, d1, v11.2d[1] OP_ii d28, d1, v11.d[1]
OP_ri d29, d0, v11.2d[1] OP_ri d29, d0, v11.d[1]
OP_ir d29, d1, v10.2d[1] OP_ir d29, d1, v10.d[1]
.endm .endm
.macro SAVE1x4 .macro SAVE1x4
@ -756,25 +756,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.2d, v3.2d}, [pA] ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.2d, v0.2d, v8.2d[0] OP_rr v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.2d[0] OP_ii v16.2d, v1.2d, v9.d[0]
OP_ri v17.2d, v0.2d, v9.2d[0] OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.2d[0] OP_ir v17.2d, v1.2d, v8.d[0]
OP_rr v18.2d, v2.2d, v8.2d[0] OP_rr v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.2d[0] OP_ii v18.2d, v3.2d, v9.d[0]
OP_ri v19.2d, v2.2d, v9.2d[0] OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.2d[0] OP_ir v19.2d, v3.2d, v8.d[0]
OP_rr v20.2d, v0.2d, v8.2d[1] OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.2d[1] OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.2d[1] OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.2d[1] OP_ir v21.2d, v1.2d, v8.d[1]
OP_rr v22.2d, v2.2d, v8.2d[1] OP_rr v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.2d[1] OP_ii v22.2d, v3.2d, v9.d[1]
OP_ri v23.2d, v2.2d, v9.2d[1] OP_ri v23.2d, v2.2d, v9.d[1]
OP_ir v23.2d, v3.2d, v8.2d[1] OP_ir v23.2d, v3.2d, v8.d[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
@ -833,15 +833,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2d, v1.2d}, [pA] ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.2d, v0.2d, v8.2d[0] OP_rr v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.2d[0] OP_ii v16.2d, v1.2d, v9.d[0]
OP_ri v17.2d, v0.2d, v9.2d[0] OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.2d[0] OP_ir v17.2d, v1.2d, v8.d[0]
OP_rr v20.2d, v0.2d, v8.2d[1] OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.2d[1] OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.2d[1] OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.2d[1] OP_ir v21.2d, v1.2d, v8.d[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
@ -886,15 +886,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.d, v1.d}[0], [pA] ld2 {v0.d, v1.d}[0], [pA]
add pA, pA, #16 add pA, pA, #16
OP_rr d16, d0, v8.2d[0] OP_rr d16, d0, v8.d[0]
OP_ii d16, d1, v9.2d[0] OP_ii d16, d1, v9.d[0]
OP_ri d17, d0, v9.2d[0] OP_ri d17, d0, v9.d[0]
OP_ir d17, d1, v8.2d[0] OP_ir d17, d1, v8.d[0]
OP_rr d20, d0, v8.2d[1] OP_rr d20, d0, v8.d[1]
OP_ii d20, d1, v9.2d[1] OP_ii d20, d1, v9.d[1]
OP_ri d21, d0, v9.2d[1] OP_ri d21, d0, v9.d[1]
OP_ir d21, d1, v8.2d[1] OP_ir d21, d1, v8.d[1]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2

View File

@ -185,93 +185,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.2d, v3.2d}, [pA] ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
fmul v16.2d, v0.2d, v8.2d[0] fmul v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.2d[0] OP_ii v16.2d, v1.2d, v9.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v17.16b, v17.16b, v17.16b eor v17.16b, v17.16b, v17.16b
fmls v17.2d, v0.2d, v9.2d[0] fmls v17.2d, v0.2d, v9.d[0]
#else #else
fmul v17.2d, v0.2d, v9.2d[0] fmul v17.2d, v0.2d, v9.d[0]
#endif #endif
OP_ir v17.2d, v1.2d, v8.2d[0] OP_ir v17.2d, v1.2d, v8.d[0]
fmul v18.2d, v2.2d, v8.2d[0] fmul v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.2d[0] OP_ii v18.2d, v3.2d, v9.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b eor v19.16b, v19.16b, v19.16b
fmls v19.2d, v2.2d, v9.2d[0] fmls v19.2d, v2.2d, v9.d[0]
#else #else
fmul v19.2d, v2.2d, v9.2d[0] fmul v19.2d, v2.2d, v9.d[0]
#endif #endif
OP_ir v19.2d, v3.2d, v8.2d[0] OP_ir v19.2d, v3.2d, v8.d[0]
fmul v20.2d, v0.2d, v8.2d[1] fmul v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.2d[1] OP_ii v20.2d, v1.2d, v9.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v21.16b, v21.16b, v21.16b eor v21.16b, v21.16b, v21.16b
fmls v21.2d, v0.2d, v9.2d[1] fmls v21.2d, v0.2d, v9.d[1]
#else #else
fmul v21.2d, v0.2d, v9.2d[1] fmul v21.2d, v0.2d, v9.d[1]
#endif #endif
OP_ir v21.2d, v1.2d, v8.2d[1] OP_ir v21.2d, v1.2d, v8.d[1]
fmul v22.2d, v2.2d, v8.2d[1] fmul v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.2d[1] OP_ii v22.2d, v3.2d, v9.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v23.16b, v23.16b, v23.16b eor v23.16b, v23.16b, v23.16b
fmls v23.2d, v2.2d, v9.2d[1] fmls v23.2d, v2.2d, v9.d[1]
#else #else
fmul v23.2d, v2.2d, v9.2d[1] fmul v23.2d, v2.2d, v9.d[1]
#endif #endif
OP_ir v23.2d, v3.2d, v8.2d[1] OP_ir v23.2d, v3.2d, v8.d[1]
fmul v24.2d, v0.2d, v10.2d[0] fmul v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.2d[0] OP_ii v24.2d, v1.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v25.16b, v25.16b, v25.16b eor v25.16b, v25.16b, v25.16b
fmls v25.2d, v0.2d, v11.2d[0] fmls v25.2d, v0.2d, v11.d[0]
#else #else
fmul v25.2d, v0.2d, v11.2d[0] fmul v25.2d, v0.2d, v11.d[0]
#endif #endif
OP_ir v25.2d, v1.2d, v10.2d[0] OP_ir v25.2d, v1.2d, v10.d[0]
fmul v26.2d, v2.2d, v10.2d[0] fmul v26.2d, v2.2d, v10.d[0]
OP_ii v26.2d, v3.2d, v11.2d[0] OP_ii v26.2d, v3.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v27.16b, v27.16b, v27.16b eor v27.16b, v27.16b, v27.16b
fmls v27.2d, v2.2d, v11.2d[0] fmls v27.2d, v2.2d, v11.d[0]
#else #else
fmul v27.2d, v2.2d, v11.2d[0] fmul v27.2d, v2.2d, v11.d[0]
#endif #endif
OP_ir v27.2d, v3.2d, v10.2d[0] OP_ir v27.2d, v3.2d, v10.d[0]
fmul v28.2d, v0.2d, v10.2d[1] fmul v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.2d[1] OP_ii v28.2d, v1.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v29.16b, v29.16b, v29.16b eor v29.16b, v29.16b, v29.16b
fmls v29.2d, v0.2d, v11.2d[1] fmls v29.2d, v0.2d, v11.d[1]
#else #else
fmul v29.2d, v0.2d, v11.2d[1] fmul v29.2d, v0.2d, v11.d[1]
#endif #endif
OP_ir v29.2d, v1.2d, v10.2d[1] OP_ir v29.2d, v1.2d, v10.d[1]
fmul v30.2d, v2.2d, v10.2d[1] fmul v30.2d, v2.2d, v10.d[1]
OP_ii v30.2d, v3.2d, v11.2d[1] OP_ii v30.2d, v3.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC) defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v31.16b, v31.16b, v31.16b eor v31.16b, v31.16b, v31.16b
fmls v31.2d, v2.2d, v11.2d[1] fmls v31.2d, v2.2d, v11.d[1]
#else #else
fmul v31.2d, v2.2d, v11.2d[1] fmul v31.2d, v2.2d, v11.d[1]
#endif #endif
OP_ir v31.2d, v3.2d, v10.2d[1] OP_ir v31.2d, v3.2d, v10.d[1]
ld2 {v12.2d, v13.2d}, [pB] ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32 add pB, pB, #32
@ -284,161 +284,161 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro KERNEL4x4_M1 .macro KERNEL4x4_M1
OP_rr v16.2d, v0.2d, v8.2d[0] OP_rr v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.2d[0] OP_ii v16.2d, v1.2d, v9.d[0]
OP_ri v17.2d, v0.2d, v9.2d[0] OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.2d[0] OP_ir v17.2d, v1.2d, v8.d[0]
ld2 {v12.2d, v13.2d}, [pB] // For next round ld2 {v12.2d, v13.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
OP_rr v18.2d, v2.2d, v8.2d[0] OP_rr v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.2d[0] OP_ii v18.2d, v3.2d, v9.d[0]
OP_ri v19.2d, v2.2d, v9.2d[0] OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.2d[0] OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v14.2d, v15.2d}, [pB] // For next round ld2 {v14.2d, v15.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
OP_rr v20.2d, v0.2d, v8.2d[1] OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.2d[1] OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.2d[1] OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.2d[1] OP_ir v21.2d, v1.2d, v8.d[1]
ld2 {v4.2d, v5.2d} , [pA] // For next round ld2 {v4.2d, v5.2d} , [pA] // For next round
add pA, pA, #32 add pA, pA, #32
OP_rr v22.2d, v2.2d, v8.2d[1] OP_rr v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.2d[1] OP_ii v22.2d, v3.2d, v9.d[1]
OP_ri v23.2d, v2.2d, v9.2d[1] OP_ri v23.2d, v2.2d, v9.d[1]
OP_ir v23.2d, v3.2d, v8.2d[1] OP_ir v23.2d, v3.2d, v8.d[1]
ld2 {v6.2d, v7.2d} , [pA] // For next round ld2 {v6.2d, v7.2d} , [pA] // For next round
add pA, pA, #32 add pA, pA, #32
OP_rr v24.2d, v0.2d, v10.2d[0] OP_rr v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.2d[0] OP_ii v24.2d, v1.2d, v11.d[0]
OP_ri v25.2d, v0.2d, v11.2d[0] OP_ri v25.2d, v0.2d, v11.d[0]
OP_ir v25.2d, v1.2d, v10.2d[0] OP_ir v25.2d, v1.2d, v10.d[0]
prfm PLDL1KEEP, [pA, #512] prfm PLDL1KEEP, [pA, #512]
OP_rr v26.2d, v2.2d, v10.2d[0] OP_rr v26.2d, v2.2d, v10.d[0]
OP_ii v26.2d, v3.2d, v11.2d[0] OP_ii v26.2d, v3.2d, v11.d[0]
OP_ri v27.2d, v2.2d, v11.2d[0] OP_ri v27.2d, v2.2d, v11.d[0]
OP_ir v27.2d, v3.2d, v10.2d[0] OP_ir v27.2d, v3.2d, v10.d[0]
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
OP_rr v28.2d, v0.2d, v10.2d[1] OP_rr v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.2d[1] OP_ii v28.2d, v1.2d, v11.d[1]
OP_ri v29.2d, v0.2d, v11.2d[1] OP_ri v29.2d, v0.2d, v11.d[1]
OP_ir v29.2d, v1.2d, v10.2d[1] OP_ir v29.2d, v1.2d, v10.d[1]
OP_rr v30.2d, v2.2d, v10.2d[1] OP_rr v30.2d, v2.2d, v10.d[1]
OP_ii v30.2d, v3.2d, v11.2d[1] OP_ii v30.2d, v3.2d, v11.d[1]
OP_ri v31.2d, v2.2d, v11.2d[1] OP_ri v31.2d, v2.2d, v11.d[1]
OP_ir v31.2d, v3.2d, v10.2d[1] OP_ir v31.2d, v3.2d, v10.d[1]
.endm .endm
.macro KERNEL4x4_M2 .macro KERNEL4x4_M2
OP_rr v16.2d, v4.2d, v12.2d[0] OP_rr v16.2d, v4.2d, v12.d[0]
OP_ii v16.2d, v5.2d, v13.2d[0] OP_ii v16.2d, v5.2d, v13.d[0]
OP_ri v17.2d, v4.2d, v13.2d[0] OP_ri v17.2d, v4.2d, v13.d[0]
OP_ir v17.2d, v5.2d, v12.2d[0] OP_ir v17.2d, v5.2d, v12.d[0]
ld2 {v8.2d, v9.2d}, [pB] // For next round ld2 {v8.2d, v9.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
OP_rr v18.2d, v6.2d, v12.2d[0] OP_rr v18.2d, v6.2d, v12.d[0]
OP_ii v18.2d, v7.2d, v13.2d[0] OP_ii v18.2d, v7.2d, v13.d[0]
OP_ri v19.2d, v6.2d, v13.2d[0] OP_ri v19.2d, v6.2d, v13.d[0]
OP_ir v19.2d, v7.2d, v12.2d[0] OP_ir v19.2d, v7.2d, v12.d[0]
ld2 {v10.2d, v11.2d}, [pB] // For next round ld2 {v10.2d, v11.2d}, [pB] // For next round
add pB, pB, #32 add pB, pB, #32
OP_rr v20.2d, v4.2d, v12.2d[1] OP_rr v20.2d, v4.2d, v12.d[1]
OP_ii v20.2d, v5.2d, v13.2d[1] OP_ii v20.2d, v5.2d, v13.d[1]
OP_ri v21.2d, v4.2d, v13.2d[1] OP_ri v21.2d, v4.2d, v13.d[1]
OP_ir v21.2d, v5.2d, v12.2d[1] OP_ir v21.2d, v5.2d, v12.d[1]
ld2 {v0.2d, v1.2d}, [pA] // For next round ld2 {v0.2d, v1.2d}, [pA] // For next round
add pA, pA, #32 add pA, pA, #32
OP_rr v22.2d, v6.2d, v12.2d[1] OP_rr v22.2d, v6.2d, v12.d[1]
OP_ii v22.2d, v7.2d, v13.2d[1] OP_ii v22.2d, v7.2d, v13.d[1]
OP_ri v23.2d, v6.2d, v13.2d[1] OP_ri v23.2d, v6.2d, v13.d[1]
OP_ir v23.2d, v7.2d, v12.2d[1] OP_ir v23.2d, v7.2d, v12.d[1]
ld2 {v2.2d, v3.2d}, [pA] // For next round ld2 {v2.2d, v3.2d}, [pA] // For next round
add pA, pA, #32 add pA, pA, #32
OP_rr v24.2d, v4.2d, v14.2d[0] OP_rr v24.2d, v4.2d, v14.d[0]
OP_ii v24.2d, v5.2d, v15.2d[0] OP_ii v24.2d, v5.2d, v15.d[0]
OP_ri v25.2d, v4.2d, v15.2d[0] OP_ri v25.2d, v4.2d, v15.d[0]
OP_ir v25.2d, v5.2d, v14.2d[0] OP_ir v25.2d, v5.2d, v14.d[0]
prfm PLDL1KEEP, [pA, #512] prfm PLDL1KEEP, [pA, #512]
OP_rr v26.2d, v6.2d, v14.2d[0] OP_rr v26.2d, v6.2d, v14.d[0]
OP_ii v26.2d, v7.2d, v15.2d[0] OP_ii v26.2d, v7.2d, v15.d[0]
OP_ri v27.2d, v6.2d, v15.2d[0] OP_ri v27.2d, v6.2d, v15.d[0]
OP_ir v27.2d, v7.2d, v14.2d[0] OP_ir v27.2d, v7.2d, v14.d[0]
prfm PLDL1KEEP, [pB, #512] prfm PLDL1KEEP, [pB, #512]
OP_rr v28.2d, v4.2d, v14.2d[1] OP_rr v28.2d, v4.2d, v14.d[1]
OP_ii v28.2d, v5.2d, v15.2d[1] OP_ii v28.2d, v5.2d, v15.d[1]
OP_ri v29.2d, v4.2d, v15.2d[1] OP_ri v29.2d, v4.2d, v15.d[1]
OP_ir v29.2d, v5.2d, v14.2d[1] OP_ir v29.2d, v5.2d, v14.d[1]
OP_rr v30.2d, v6.2d, v14.2d[1] OP_rr v30.2d, v6.2d, v14.d[1]
OP_ii v30.2d, v7.2d, v15.2d[1] OP_ii v30.2d, v7.2d, v15.d[1]
OP_ri v31.2d, v6.2d, v15.2d[1] OP_ri v31.2d, v6.2d, v15.d[1]
OP_ir v31.2d, v7.2d, v14.2d[1] OP_ir v31.2d, v7.2d, v14.d[1]
.endm .endm
.macro KERNEL4x4_E .macro KERNEL4x4_E
OP_rr v16.2d, v4.2d, v12.2d[0] OP_rr v16.2d, v4.2d, v12.d[0]
OP_ii v16.2d, v5.2d, v13.2d[0] OP_ii v16.2d, v5.2d, v13.d[0]
OP_ri v17.2d, v4.2d, v13.2d[0] OP_ri v17.2d, v4.2d, v13.d[0]
OP_ir v17.2d, v5.2d, v12.2d[0] OP_ir v17.2d, v5.2d, v12.d[0]
OP_rr v18.2d, v6.2d, v12.2d[0] OP_rr v18.2d, v6.2d, v12.d[0]
OP_ii v18.2d, v7.2d, v13.2d[0] OP_ii v18.2d, v7.2d, v13.d[0]
OP_ri v19.2d, v6.2d, v13.2d[0] OP_ri v19.2d, v6.2d, v13.d[0]
OP_ir v19.2d, v7.2d, v12.2d[0] OP_ir v19.2d, v7.2d, v12.d[0]
OP_rr v20.2d, v4.2d, v12.2d[1] OP_rr v20.2d, v4.2d, v12.d[1]
OP_ii v20.2d, v5.2d, v13.2d[1] OP_ii v20.2d, v5.2d, v13.d[1]
OP_ri v21.2d, v4.2d, v13.2d[1] OP_ri v21.2d, v4.2d, v13.d[1]
OP_ir v21.2d, v5.2d, v12.2d[1] OP_ir v21.2d, v5.2d, v12.d[1]
OP_rr v22.2d, v6.2d, v12.2d[1] OP_rr v22.2d, v6.2d, v12.d[1]
OP_ii v22.2d, v7.2d, v13.2d[1] OP_ii v22.2d, v7.2d, v13.d[1]
OP_ri v23.2d, v6.2d, v13.2d[1] OP_ri v23.2d, v6.2d, v13.d[1]
OP_ir v23.2d, v7.2d, v12.2d[1] OP_ir v23.2d, v7.2d, v12.d[1]
OP_rr v24.2d, v4.2d, v14.2d[0] OP_rr v24.2d, v4.2d, v14.d[0]
OP_ii v24.2d, v5.2d, v15.2d[0] OP_ii v24.2d, v5.2d, v15.d[0]
OP_ri v25.2d, v4.2d, v15.2d[0] OP_ri v25.2d, v4.2d, v15.d[0]
OP_ir v25.2d, v5.2d, v14.2d[0] OP_ir v25.2d, v5.2d, v14.d[0]
OP_rr v26.2d, v6.2d, v14.2d[0] OP_rr v26.2d, v6.2d, v14.d[0]
OP_ii v26.2d, v7.2d, v15.2d[0] OP_ii v26.2d, v7.2d, v15.d[0]
OP_ri v27.2d, v6.2d, v15.2d[0] OP_ri v27.2d, v6.2d, v15.d[0]
OP_ir v27.2d, v7.2d, v14.2d[0] OP_ir v27.2d, v7.2d, v14.d[0]
OP_rr v28.2d, v4.2d, v14.2d[1] OP_rr v28.2d, v4.2d, v14.d[1]
OP_ii v28.2d, v5.2d, v15.2d[1] OP_ii v28.2d, v5.2d, v15.d[1]
OP_ri v29.2d, v4.2d, v15.2d[1] OP_ri v29.2d, v4.2d, v15.d[1]
OP_ir v29.2d, v5.2d, v14.2d[1] OP_ir v29.2d, v5.2d, v14.d[1]
OP_rr v30.2d, v6.2d, v14.2d[1] OP_rr v30.2d, v6.2d, v14.d[1]
OP_ii v30.2d, v7.2d, v15.2d[1] OP_ii v30.2d, v7.2d, v15.d[1]
OP_ri v31.2d, v6.2d, v15.2d[1] OP_ri v31.2d, v6.2d, v15.d[1]
OP_ir v31.2d, v7.2d, v14.2d[1] OP_ir v31.2d, v7.2d, v14.d[1]
.endm .endm
.macro KERNEL4x4_SUB .macro KERNEL4x4_SUB
@ -451,45 +451,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.2d, v3.2d}, [pA] ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.2d, v0.2d, v8.2d[0] OP_rr v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.2d[0] OP_ii v16.2d, v1.2d, v9.d[0]
OP_ri v17.2d, v0.2d, v9.2d[0] OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.2d[0] OP_ir v17.2d, v1.2d, v8.d[0]
OP_rr v18.2d, v2.2d, v8.2d[0] OP_rr v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.2d[0] OP_ii v18.2d, v3.2d, v9.d[0]
OP_ri v19.2d, v2.2d, v9.2d[0] OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.2d[0] OP_ir v19.2d, v3.2d, v8.d[0]
OP_rr v20.2d, v0.2d, v8.2d[1] OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.2d[1] OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.2d[1] OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.2d[1] OP_ir v21.2d, v1.2d, v8.d[1]
OP_rr v22.2d, v2.2d, v8.2d[1] OP_rr v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.2d[1] OP_ii v22.2d, v3.2d, v9.d[1]
OP_ri v23.2d, v2.2d, v9.2d[1] OP_ri v23.2d, v2.2d, v9.d[1]
OP_ir v23.2d, v3.2d, v8.2d[1] OP_ir v23.2d, v3.2d, v8.d[1]
OP_rr v24.2d, v0.2d, v10.2d[0] OP_rr v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.2d[0] OP_ii v24.2d, v1.2d, v11.d[0]
OP_ri v25.2d, v0.2d, v11.2d[0] OP_ri v25.2d, v0.2d, v11.d[0]
OP_ir v25.2d, v1.2d, v10.2d[0] OP_ir v25.2d, v1.2d, v10.d[0]
OP_rr v26.2d, v2.2d, v10.2d[0] OP_rr v26.2d, v2.2d, v10.d[0]
OP_ii v26.2d, v3.2d, v11.2d[0] OP_ii v26.2d, v3.2d, v11.d[0]
OP_ri v27.2d, v2.2d, v11.2d[0] OP_ri v27.2d, v2.2d, v11.d[0]
OP_ir v27.2d, v3.2d, v10.2d[0] OP_ir v27.2d, v3.2d, v10.d[0]
OP_rr v28.2d, v0.2d, v10.2d[1] OP_rr v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.2d[1] OP_ii v28.2d, v1.2d, v11.d[1]
OP_ri v29.2d, v0.2d, v11.2d[1] OP_ri v29.2d, v0.2d, v11.d[1]
OP_ir v29.2d, v1.2d, v10.2d[1] OP_ir v29.2d, v1.2d, v10.d[1]
OP_rr v30.2d, v2.2d, v10.2d[1] OP_rr v30.2d, v2.2d, v10.d[1]
OP_ii v30.2d, v3.2d, v11.2d[1] OP_ii v30.2d, v3.2d, v11.d[1]
OP_ri v31.2d, v2.2d, v11.2d[1] OP_ri v31.2d, v2.2d, v11.d[1]
OP_ir v31.2d, v3.2d, v10.2d[1] OP_ir v31.2d, v3.2d, v10.d[1]
.endm .endm
.macro SAVE4x4 .macro SAVE4x4
@ -577,25 +577,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2d, v1.2d}, [pA] ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.2d, v0.2d, v8.2d[0] OP_rr v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.2d[0] OP_ii v16.2d, v1.2d, v9.d[0]
OP_ri v17.2d, v0.2d, v9.2d[0] OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.2d[0] OP_ir v17.2d, v1.2d, v8.d[0]
OP_rr v20.2d, v0.2d, v8.2d[1] OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.2d[1] OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.2d[1] OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.2d[1] OP_ir v21.2d, v1.2d, v8.d[1]
OP_rr v24.2d, v0.2d, v10.2d[0] OP_rr v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.2d[0] OP_ii v24.2d, v1.2d, v11.d[0]
OP_ri v25.2d, v0.2d, v11.2d[0] OP_ri v25.2d, v0.2d, v11.d[0]
OP_ir v25.2d, v1.2d, v10.2d[0] OP_ir v25.2d, v1.2d, v10.d[0]
OP_rr v28.2d, v0.2d, v10.2d[1] OP_rr v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.2d[1] OP_ii v28.2d, v1.2d, v11.d[1]
OP_ri v29.2d, v0.2d, v11.2d[1] OP_ri v29.2d, v0.2d, v11.d[1]
OP_ir v29.2d, v1.2d, v10.2d[1] OP_ir v29.2d, v1.2d, v10.d[1]
.endm .endm
.macro SAVE2x4 .macro SAVE2x4
@ -660,25 +660,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.d, v1.d}[0], [pA] ld2 {v0.d, v1.d}[0], [pA]
add pA, pA, #16 add pA, pA, #16
OP_rr d16, d0, v8.2d[0] OP_rr d16, d0, v8.d[0]
OP_ii d16, d1, v9.2d[0] OP_ii d16, d1, v9.d[0]
OP_ri d17, d0, v9.2d[0] OP_ri d17, d0, v9.d[0]
OP_ir d17, d1, v8.2d[0] OP_ir d17, d1, v8.d[0]
OP_rr d20, d0, v8.2d[1] OP_rr d20, d0, v8.d[1]
OP_ii d20, d1, v9.2d[1] OP_ii d20, d1, v9.d[1]
OP_ri d21, d0, v9.2d[1] OP_ri d21, d0, v9.d[1]
OP_ir d21, d1, v8.2d[1] OP_ir d21, d1, v8.d[1]
OP_rr d24, d0, v10.2d[0] OP_rr d24, d0, v10.d[0]
OP_ii d24, d1, v11.2d[0] OP_ii d24, d1, v11.d[0]
OP_ri d25, d0, v11.2d[0] OP_ri d25, d0, v11.d[0]
OP_ir d25, d1, v10.2d[0] OP_ir d25, d1, v10.d[0]
OP_rr d28, d0, v10.2d[1] OP_rr d28, d0, v10.d[1]
OP_ii d28, d1, v11.2d[1] OP_ii d28, d1, v11.d[1]
OP_ri d29, d0, v11.2d[1] OP_ri d29, d0, v11.d[1]
OP_ir d29, d1, v10.2d[1] OP_ir d29, d1, v10.d[1]
.endm .endm
.macro SAVE1x4 .macro SAVE1x4
@ -743,25 +743,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v2.2d, v3.2d}, [pA] ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.2d, v0.2d, v8.2d[0] OP_rr v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.2d[0] OP_ii v16.2d, v1.2d, v9.d[0]
OP_ri v17.2d, v0.2d, v9.2d[0] OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.2d[0] OP_ir v17.2d, v1.2d, v8.d[0]
OP_rr v18.2d, v2.2d, v8.2d[0] OP_rr v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.2d[0] OP_ii v18.2d, v3.2d, v9.d[0]
OP_ri v19.2d, v2.2d, v9.2d[0] OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.2d[0] OP_ir v19.2d, v3.2d, v8.d[0]
OP_rr v20.2d, v0.2d, v8.2d[1] OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.2d[1] OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.2d[1] OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.2d[1] OP_ir v21.2d, v1.2d, v8.d[1]
OP_rr v22.2d, v2.2d, v8.2d[1] OP_rr v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.2d[1] OP_ii v22.2d, v3.2d, v9.d[1]
OP_ri v23.2d, v2.2d, v9.2d[1] OP_ri v23.2d, v2.2d, v9.d[1]
OP_ir v23.2d, v3.2d, v8.2d[1] OP_ir v23.2d, v3.2d, v8.d[1]
.endm .endm
.macro SAVE4x2 .macro SAVE4x2
@ -816,15 +816,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2d, v1.2d}, [pA] ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32 add pA, pA, #32
OP_rr v16.2d, v0.2d, v8.2d[0] OP_rr v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.2d[0] OP_ii v16.2d, v1.2d, v9.d[0]
OP_ri v17.2d, v0.2d, v9.2d[0] OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.2d[0] OP_ir v17.2d, v1.2d, v8.d[0]
OP_rr v20.2d, v0.2d, v8.2d[1] OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.2d[1] OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.2d[1] OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.2d[1] OP_ir v21.2d, v1.2d, v8.d[1]
.endm .endm
.macro SAVE2x2 .macro SAVE2x2
@ -867,15 +867,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.d, v1.d}[0], [pA] ld2 {v0.d, v1.d}[0], [pA]
add pA, pA, #16 add pA, pA, #16
OP_rr d16, d0, v8.2d[0] OP_rr d16, d0, v8.d[0]
OP_ii d16, d1, v9.2d[0] OP_ii d16, d1, v9.d[0]
OP_ri d17, d0, v9.2d[0] OP_ri d17, d0, v9.d[0]
OP_ir d17, d1, v8.2d[0] OP_ir d17, d1, v8.d[0]
OP_rr d20, d0, v8.2d[1] OP_rr d20, d0, v8.d[1]
OP_ii d20, d1, v9.2d[1] OP_ii d20, d1, v9.d[1]
OP_ri d21, d0, v9.2d[1] OP_ri d21, d0, v9.d[1]
OP_ir d21, d1, v8.2d[1] OP_ir d21, d1, v8.d[1]
.endm .endm
.macro SAVE1x2 .macro SAVE1x2

View File

@ -3,14 +3,18 @@
#CGEMM_BETA = ../generic/zgemm_beta.c #CGEMM_BETA = ../generic/zgemm_beta.c
#ZGEMM_BETA = ../generic/zgemm_beta.c #ZGEMM_BETA = ../generic/zgemm_beta.c
STRMMKERNEL = gemm_kernel_power6.S STRMMKERNEL = strmm_kernel_16x8_power8.S
DTRMMKERNEL = dtrmm_kernel_16x4_power8.S DTRMMKERNEL = dtrmm_kernel_16x4_power8.S
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c CTRMMKERNEL = ctrmm_kernel_8x4_power8.S
ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S
SGEMMKERNEL = gemm_kernel_power6.S SGEMMKERNEL = sgemm_kernel_16x8_power8.S
SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
SGEMMOTCOPY = ../generic/gemm_tcopy_8.c
SGEMMINCOPYOBJ = sgemm_incopy.o
SGEMMITCOPYOBJ = sgemm_itcopy.o
SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o
@ -24,11 +28,15 @@ DGEMMITCOPYOBJ = dgemm_itcopy.o
DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c CGEMMKERNEL = cgemm_kernel_8x4_power8.S
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o
CGEMMINCOPYOBJ = cgemm_incopy.o
CGEMMITCOPYOBJ = cgemm_itcopy.o
ZGEMMKERNEL = zgemm_kernel_8x2_power8.S ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
@ -97,56 +105,56 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
#ISMINKERNEL = ../arm/imin.c #ISMINKERNEL = ../arm/imin.c
#IDMINKERNEL = ../arm/imin.c #IDMINKERNEL = ../arm/imin.c
# #
#SASUMKERNEL = ../arm/asum.c SASUMKERNEL = sasum.c
#DASUMKERNEL = ../arm/asum.c DASUMKERNEL = dasum.c
#CASUMKERNEL = ../arm/zasum.c CASUMKERNEL = casum.c
#ZASUMKERNEL = ../arm/zasum.c ZASUMKERNEL = zasum.c
# #
#SAXPYKERNEL = ../arm/axpy.c #SAXPYKERNEL = ../arm/axpy.c
#DAXPYKERNEL = ../arm/axpy.c DAXPYKERNEL = daxpy.c
#CAXPYKERNEL = ../arm/zaxpy.c #CAXPYKERNEL = ../arm/zaxpy.c
#ZAXPYKERNEL = ../arm/zaxpy.c ZAXPYKERNEL = zaxpy.c
# #
#SCOPYKERNEL = ../arm/copy.c SCOPYKERNEL = scopy.c
#DCOPYKERNEL = ../arm/copy.c DCOPYKERNEL = dcopy.c
#CCOPYKERNEL = ../arm/zcopy.c CCOPYKERNEL = ccopy.c
#ZCOPYKERNEL = ../arm/zcopy.c ZCOPYKERNEL = zcopy.c
# #
#SDOTKERNEL = ../arm/dot.c SDOTKERNEL = sdot.c
#DDOTKERNEL = ../arm/dot.c DDOTKERNEL = ddot.c
#CDOTKERNEL = ../arm/zdot.c #CDOTKERNEL = ../arm/zdot.c
#ZDOTKERNEL = ../arm/zdot.c ZDOTKERNEL = zdot.c
# #
#SNRM2KERNEL = ../arm/nrm2.c #SNRM2KERNEL = ../arm/nrm2.c
#DNRM2KERNEL = ../arm/nrm2.c #DNRM2KERNEL = ../arm/nrm2.c
#CNRM2KERNEL = ../arm/znrm2.c #CNRM2KERNEL = ../arm/znrm2.c
#ZNRM2KERNEL = ../arm/znrm2.c #ZNRM2KERNEL = ../arm/znrm2.c
# #
#SROTKERNEL = ../arm/rot.c SROTKERNEL = srot.c
#DROTKERNEL = ../arm/rot.c DROTKERNEL = drot.c
#CROTKERNEL = ../arm/zrot.c #CROTKERNEL = ../arm/zrot.c
#ZROTKERNEL = ../arm/zrot.c #ZROTKERNEL = ../arm/zrot.c
# #
#SSCALKERNEL = ../arm/scal.c SSCALKERNEL = sscal.c
#DSCALKERNEL = ../arm/scal.c DSCALKERNEL = dscal.c
#CSCALKERNEL = ../arm/zscal.c #CSCALKERNEL = ../arm/zscal.c
#ZSCALKERNEL = ../arm/zscal.c ZSCALKERNEL = zscal.c
# #
#SSWAPKERNEL = ../arm/swap.c SSWAPKERNEL = sswap.c
#DSWAPKERNEL = ../arm/swap.c DSWAPKERNEL = dswap.c
#CSWAPKERNEL = ../arm/zswap.c CSWAPKERNEL = cswap.c
#ZSWAPKERNEL = ../arm/zswap.c ZSWAPKERNEL = zswap.c
# #
#SGEMVNKERNEL = ../arm/gemv_n.c #SGEMVNKERNEL = ../arm/gemv_n.c
#DGEMVNKERNEL = ../arm/gemv_n.c DGEMVNKERNEL = dgemv_n.c
#CGEMVNKERNEL = ../arm/zgemv_n.c #CGEMVNKERNEL = ../arm/zgemv_n.c
#ZGEMVNKERNEL = ../arm/zgemv_n.c #ZGEMVNKERNEL = ../arm/zgemv_n.c
# #
#SGEMVTKERNEL = ../arm/gemv_t.c #SGEMVTKERNEL = ../arm/gemv_t.c
#DGEMVTKERNEL = ../arm/gemv_t.c #DGEMVTKERNEL = ../arm/gemv_t.c
#CGEMVTKERNEL = ../arm/zgemv_t.c #CGEMVTKERNEL = ../arm/zgemv_t.c
#ZGEMVTKERNEL = ../arm/zgemv_t.c #ZGEMVTKERNEL = zgemv_t_4.c
#SSYMV_U_KERNEL = ../generic/symv_k.c #SSYMV_U_KERNEL = ../generic/symv_k.c

151
kernel/power/casum.c Normal file
View File

@ -0,0 +1,151 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/28 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#if defined(POWER8)
#include "casum_microk_power8.c"
#endif
#ifndef HAVE_KERNEL_16
/* Scalar fallback kernel for casum: accumulate |Re| + |Im| over n
 * complex single-precision elements starting at x1.  The caller only
 * passes multiples of 16, so the 4-per-pass stride below always
 * terminates exactly at n.  The grand total is written to svec[0];
 * svec[1..3] are zeroed so the caller can add all four slots
 * unconditionally. */
static void casum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
{
	FLOAT acc0 = 0.0;
	FLOAT acc1 = 0.0;
	FLOAT acc2 = 0.0;
	FLOAT acc3 = 0.0;
	FLOAT *p = x1;
	BLASLONG k;

	/* Each pass consumes 4 complex numbers (8 floats); the counter
	 * advances by 4 to match.  The accumulation order mirrors the
	 * vector kernel: lanes 0..3, then lanes 4..7. */
	for (k = 0; k < n; k += 4)
	{
		acc0 += ABS(p[0]);
		acc1 += ABS(p[1]);
		acc2 += ABS(p[2]);
		acc3 += ABS(p[3]);
		acc0 += ABS(p[4]);
		acc1 += ABS(p[5]);
		acc2 += ABS(p[6]);
		acc3 += ABS(p[7]);
		p += 8;
	}

	svec[0] = acc0 + acc1 + acc2 + acc3;
	svec[1] = 0.0;
	svec[2] = 0.0;
	svec[3] = 0.0;
}
#endif
/* casum: returns sum over n complex elements of |Re x_i| + |Im x_i|,
 * reading x with stride inc_x (measured in complex elements).
 * Returns 0 for n <= 0 or inc_x <= 0. */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	/* 16-byte alignment required by the vector kernel's stxvw4x store. */
	FLOAT svec[4] __attribute__ ((aligned (16)));
	FLOAT sumf = 0.0;
	BLASLONG i = 0;
	BLASLONG ip = 0;

	if (n <= 0 || inc_x <= 0) return (sumf);

	if (inc_x == 1)
	{
		/* Largest multiple of 16 goes through the unrolled kernel. */
		BLASLONG n1 = n & -16;

		if (n1 > 0)
		{
			casum_kernel_16(n1, x, svec);
			sumf = svec[0] + svec[1] + svec[2] + svec[3];
			i  = n1;
			ip = 2 * n1;      /* float index: 2 floats per complex */
		}

		/* Scalar tail: fewer than 16 complex elements remain. */
		for (; i < n; i++)
		{
			sumf += ABS(x[ip]) + ABS(x[ip + 1]);
			ip += 2;
		}
	}
	else
	{
		BLASLONG inc_x2 = 2 * inc_x;

		for (; i < n; i++)
		{
			sumf += ABS(x[ip]) + ABS(x[ip + 1]);
			ip += inc_x2;
		}
	}

	return (sumf);
}

View File

@ -0,0 +1,177 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/28 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#define HAVE_KERNEL_16 1

/* POWER8 VSX kernel: sums the absolute values of 2*n floats (n complex
 * single-precision elements) at x, n a multiple of 16 (caller masks
 * with n & -16).  The four partial sums are reduced into one vector and
 * stored to svec (16 floats' worth of accumulation folded into 4 lanes;
 * the caller adds svec[0..3]).  svec must be 16-byte aligned for
 * stxvw4x.  noinline keeps the asm's register assumptions stable. */
static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline));

static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec)
{
/* Loop counter (complex elements remaining). */
BLASLONG i = n;
/* Byte offsets for the 8 vector loads per 128-byte chunk. */
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x;
/* Prefetch distance in bytes for dcbt. */
BLASLONG pre = 384;

__asm__ __volatile__
(
/* Prefetch ahead, then zero the 8 accumulators vs32-vs39. */
"dcbt %2 , %4 \n\t"

"xxlxor 32,32,32 \n\t"
"xxlxor 33,33,33 \n\t"
"xxlxor 34,34,34 \n\t"
"xxlxor 35,35,35 \n\t"
"xxlxor 36,36,36 \n\t"
"xxlxor 37,37,37 \n\t"
"xxlxor 38,38,38 \n\t"
"xxlxor 39,39,39 \n\t"

/* Preload first 128 bytes (32 floats = 16 complex) into vs40-vs47. */
"lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %5, %2 \n\t"
"lxvw4x 42, %6, %2 \n\t"
"lxvw4x 43, %7, %2 \n\t"
"lxvw4x 44, %8, %2 \n\t"
"lxvw4x 45, %9, %2 \n\t"
"lxvw4x 46, %10, %2 \n\t"
"lxvw4x 47, %11, %2 \n\t"

"addi %2, %2, 128 \n\t"

"addic. %0 , %0 , -16 \n\t"
"ble 2f \n\t"

".align 5 \n\t"
"1: \n\t"

/* Main loop: abs the previous chunk while loading the next one. */
"dcbt %2 , %4 \n\t"

"xvabssp 48, 40 \n\t"
"xvabssp 49, 41 \n\t"
"xvabssp 50, 42 \n\t"
"xvabssp 51, 43 \n\t"

"lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %5, %2 \n\t"

"xvabssp 52, 44 \n\t"
"xvabssp 53, 45 \n\t"

"lxvw4x 42, %6, %2 \n\t"
"lxvw4x 43, %7, %2 \n\t"

"xvabssp 54, 46 \n\t"
"xvabssp 55, 47 \n\t"

"lxvw4x 44, %8, %2 \n\t"
"lxvw4x 45, %9, %2 \n\t"

"xvaddsp 32, 32, 48 \n\t"
"xvaddsp 33, 33, 49 \n\t"

"lxvw4x 46, %10, %2 \n\t"
"lxvw4x 47, %11, %2 \n\t"

"xvaddsp 34, 34, 50 \n\t"
"xvaddsp 35, 35, 51 \n\t"
"addi %2, %2, 128 \n\t"
"xvaddsp 36, 36, 52 \n\t"
"xvaddsp 37, 37, 53 \n\t"
"addic. %0 , %0 , -16 \n\t"
"xvaddsp 38, 38, 54 \n\t"
"xvaddsp 39, 39, 55 \n\t"

"bgt 1b \n\t"

"2: \n\t"

/* Tail: fold the last preloaded chunk into the accumulators. */
"xvabssp 48, 40 \n\t"
"xvabssp 49, 41 \n\t"
"xvabssp 50, 42 \n\t"
"xvabssp 51, 43 \n\t"
"xvabssp 52, 44 \n\t"
"xvabssp 53, 45 \n\t"
"xvabssp 54, 46 \n\t"
"xvabssp 55, 47 \n\t"

"xvaddsp 32, 32, 48 \n\t"
"xvaddsp 33, 33, 49 \n\t"
"xvaddsp 34, 34, 50 \n\t"
"xvaddsp 35, 35, 51 \n\t"
"xvaddsp 36, 36, 52 \n\t"
"xvaddsp 37, 37, 53 \n\t"
"xvaddsp 38, 38, 54 \n\t"
"xvaddsp 39, 39, 55 \n\t"

/* Tree-reduce the 8 accumulators into vs32 and store 4 partial
 * lane sums to svec. */
"xvaddsp 32, 32, 33 \n\t"
"xvaddsp 34, 34, 35 \n\t"
"xvaddsp 36, 36, 37 \n\t"
"xvaddsp 38, 38, 39 \n\t"

"xvaddsp 32, 32, 34 \n\t"
"xvaddsp 36, 36, 38 \n\t"

"xvaddsp 32, 32, 36 \n\t"

"stxvw4x 32, 0, %3 \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
"r" (x1), // 2
"r" (svec), // 3
"r" (pre), // 4
"r" (o16), // 5
"r" (o32), // 6
"r" (o48), // 7
"r" (o64), // 8
"r" (o80), // 9
"r" (o96), // 10
/* NOTE(review): %0 and %2 are declared as inputs but are modified by
 * addic./addi above; they are listed in the clobbers, which old GCC
 * tolerated — modern GCC would want "+r" outputs.  Left as-is. */
"r" (o112) // 11
: "cr0", "%0", "%2", "memory"
);

}

140
kernel/power/ccopy.c Normal file
View File

@ -0,0 +1,140 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#include "common.h"
#if defined(POWER8)
#include "ccopy_microk_power8.c"
#endif
#ifndef HAVE_KERNEL_32
/* Scalar fallback kernel for ccopy: copy n complex single-precision
 * elements (2*n floats) from x to y.  The caller only passes multiples
 * of 32, so the 4-per-pass stride below terminates exactly at n.
 * Loads the whole 8-float group before storing, like the original. */
static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
	FLOAT t0, t1, t2, t3, t4, t5, t6, t7;
	FLOAT *src = x;
	FLOAT *dst = y;
	BLASLONG k;

	for (k = 0; k < n; k += 4)
	{
		/* 4 complex elements (8 floats) per pass. */
		t0 = src[0];
		t1 = src[1];
		t2 = src[2];
		t3 = src[3];
		t4 = src[4];
		t5 = src[5];
		t6 = src[6];
		t7 = src[7];

		dst[0] = t0;
		dst[1] = t1;
		dst[2] = t2;
		dst[3] = t3;
		dst[4] = t4;
		dst[5] = t5;
		dst[6] = t6;
		dst[7] = t7;

		src += 8;
		dst += 8;
	}

	return;
}
#endif
/* ccopy: y := x for n complex single-precision elements, with strides
 * inc_x / inc_y measured in complex elements (so the float index moves
 * by 2*inc).  Does nothing for n <= 0.  Always returns 0.
 *
 * Fix: the unit-stride scalar tail read the real part as x[iy] instead
 * of x[ix].  It happened to be harmless because ix == iy throughout
 * that branch, but it was a latent bug (any future change to how ix/iy
 * advance would silently corrupt the copy) and was inconsistent with
 * the imaginary part and with the strided branch below. */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	BLASLONG i = 0;
	BLASLONG ix = 0, iy = 0;

	if (n <= 0) return (0);

	if ((inc_x == 1) && (inc_y == 1))
	{
		/* Bulk-copy the largest multiple of 32 via the vector kernel. */
		BLASLONG n1 = n & -32;

		if (n1 > 0)
		{
			ccopy_kernel_32(n1, x, y);
			i  = n1;
			ix = n1 * 2;
			iy = n1 * 2;
		}

		/* Scalar tail: fewer than 32 complex elements remain. */
		while (i < n)
		{
			y[iy]   = x[ix];	/* was x[iy]; see note above */
			y[iy+1] = x[ix+1];
			ix += 2;
			iy += 2;
			i++;
		}
	}
	else
	{
		BLASLONG inc_x2 = 2 * inc_x;
		BLASLONG inc_y2 = 2 * inc_y;

		while (i < n)
		{
			y[iy]   = x[ix];
			y[iy+1] = x[ix+1];
			ix += inc_x2;
			iy += inc_y2;
			i++;
		}
	}

	return (0);
}

View File

@ -0,0 +1,174 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#define HAVE_KERNEL_32 1

/* POWER8 VSX kernel: copy 2*n floats (n complex single-precision
 * elements) from x to y, n a multiple of 32 (caller masks with
 * n & -32).  Works in two 128-byte half-chunks (vs40-47 and vs50-57)
 * so loads for the next iteration overlap stores of the current one.
 * noinline keeps the asm's register assumptions stable. */
static void ccopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));

static void ccopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
{
/* Loop counter (complex elements remaining). */
BLASLONG i = n;
/* Byte offsets for the 8 vector loads/stores per 128-byte chunk. */
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x;
FLOAT *y1=y;
/* pre/alpha are passed as operands %4/%3 but unused by this asm body
 * (kept to match the sibling kernels' operand layout). */
BLASLONG pre = 384;
BLASLONG alpha=0;

__asm__ __volatile__
(
/* Preload 256 bytes (32 complex elements) from x. */
"lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %5, %2 \n\t"
"lxvw4x 42, %6, %2 \n\t"
"lxvw4x 43, %7, %2 \n\t"
"lxvw4x 44, %8, %2 \n\t"
"lxvw4x 45, %9, %2 \n\t"
"lxvw4x 46, %10, %2 \n\t"
"lxvw4x 47, %11, %2 \n\t"

"addi %2, %2, 128 \n\t"

"lxvw4x 50, 0, %2 \n\t"
"lxvw4x 51, %5, %2 \n\t"
"lxvw4x 52, %6, %2 \n\t"
"lxvw4x 53, %7, %2 \n\t"
"lxvw4x 54, %8, %2 \n\t"
"lxvw4x 55, %9, %2 \n\t"
"lxvw4x 56, %10, %2 \n\t"
"lxvw4x 57, %11, %2 \n\t"

"addi %2, %2, 128 \n\t"

"addic. %0 , %0 , -32 \n\t"
"ble 2f \n\t"

".align 5 \n\t"
"1: \n\t"

/* Main loop: store the buffered chunk to y while loading the next
 * chunk from x, interleaved store/load pairs. */
"stxvw4x 40, 0, %1 \n\t"
"stxvw4x 41, %5, %1 \n\t"
"lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %5, %2 \n\t"
"stxvw4x 42, %6, %1 \n\t"
"stxvw4x 43, %7, %1 \n\t"
"lxvw4x 42, %6, %2 \n\t"
"lxvw4x 43, %7, %2 \n\t"
"stxvw4x 44, %8, %1 \n\t"
"stxvw4x 45, %9, %1 \n\t"
"lxvw4x 44, %8, %2 \n\t"
"lxvw4x 45, %9, %2 \n\t"
"stxvw4x 46, %10, %1 \n\t"
"stxvw4x 47, %11, %1 \n\t"
"lxvw4x 46, %10, %2 \n\t"
"lxvw4x 47, %11, %2 \n\t"

"addi %1, %1, 128 \n\t"
"addi %2, %2, 128 \n\t"

"stxvw4x 50, 0, %1 \n\t"
"stxvw4x 51, %5, %1 \n\t"
"lxvw4x 50, 0, %2 \n\t"
"lxvw4x 51, %5, %2 \n\t"
"stxvw4x 52, %6, %1 \n\t"
"stxvw4x 53, %7, %1 \n\t"
"lxvw4x 52, %6, %2 \n\t"
"lxvw4x 53, %7, %2 \n\t"
"stxvw4x 54, %8, %1 \n\t"
"stxvw4x 55, %9, %1 \n\t"
"lxvw4x 54, %8, %2 \n\t"
"lxvw4x 55, %9, %2 \n\t"
"stxvw4x 56, %10, %1 \n\t"
"stxvw4x 57, %11, %1 \n\t"
"lxvw4x 56, %10, %2 \n\t"
"lxvw4x 57, %11, %2 \n\t"

"addi %1, %1, 128 \n\t"
"addi %2, %2, 128 \n\t"

"addic. %0 , %0 , -32 \n\t"
"bgt 1b \n\t"

"2: \n\t"

/* Tail: flush the final buffered 256 bytes to y. */
"stxvw4x 40, 0, %1 \n\t"
"stxvw4x 41, %5, %1 \n\t"
"stxvw4x 42, %6, %1 \n\t"
"stxvw4x 43, %7, %1 \n\t"
"stxvw4x 44, %8, %1 \n\t"
"stxvw4x 45, %9, %1 \n\t"
"stxvw4x 46, %10, %1 \n\t"
"stxvw4x 47, %11, %1 \n\t"

"addi %1, %1, 128 \n\t"

"stxvw4x 50, 0, %1 \n\t"
"stxvw4x 51, %5, %1 \n\t"
"stxvw4x 52, %6, %1 \n\t"
"stxvw4x 53, %7, %1 \n\t"
"stxvw4x 54, %8, %1 \n\t"
"stxvw4x 55, %9, %1 \n\t"
"stxvw4x 56, %10, %1 \n\t"
"stxvw4x 57, %11, %1 \n\t"

:
:
"r" (i), // 0
"r" (y1), // 1
"r" (x1), // 2
"r" (alpha), // 3
"r" (pre), // 4
"r" (o16), // 5
"r" (o32), // 6
"r" (o48), // 7
"r" (o64), // 8
"r" (o80), // 9
"r" (o96), // 10
/* NOTE(review): %0/%1/%2 are inputs modified in the asm body and are
 * listed as clobbers; modern GCC would want "+r" outputs.  Left as-is. */
"r" (o112) // 11
: "cr0", "%0", "%2" , "%1", "memory"
);

}

View File

@ -0,0 +1,407 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/04 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"
/* Pointer-sized load: word on 32-bit, doubleword on 64-bit. */
#ifndef __64BIT__
#define LOAD lwz
#else
#define LOAD ld
#endif

/* Stack frame layout.  One STACKSIZE unit; the prologue drops the
 * stack by 4*STACKSIZE to leave room for the aligned BBUFFER. */
#ifdef __64BIT__
#define STACKSIZE 32000
#define ALPHA_R_SP 296(SP)
#define ALPHA_I_SP 304(SP)
#define FZERO 312(SP)
#else
#define STACKSIZE 256
#define ALPHA_R_SP 224(SP)
#define ALPHA_I_SP 232(SP)
#define FZERO 240(SP)
#endif

/* GEMM dimension arguments (fixed by all ABIs). */
#define M r3
#define N r4
#define K r5

/* Remaining argument registers differ per ABI/word size; extras
 * (LDC, OFFSET) arrive on the stack and are loaded via FRAMESLOT. */
#ifdef linux
#ifndef __64BIT__
#define A r6
#define B r7
#define C r8
#define LDC r9
#define OFFSET r10
#else
#define A r8
#define B r9
#define C r10
#define LDC r6
#define OFFSET r7
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define A r10
#define B r6
#define C r7
#define LDC r8
#define OFFSET r9
#else
#define A r8
#define B r9
#define C r10
#define LDC r6
#define OFFSET r7
#endif
#endif

#define o0 0

/* alpha broadcast: _dr/_di hold the scalar real/imag parts,
 * _sr/_si the splatted vectors built in the prologue. */
#define alpha_dr vs28
#define alpha_di vs29
#define alpha_sr vs30
#define alpha_si vs31

/* Non-volatile GPRs used by the macro/logic includes: loop counters,
 * matrix walk pointers, byte offsets and scratch. */
#define FRAMEPOINTER r12

#define BBUFFER r14

#define L r15
#define o12 r16
#define o4 r17
#define T2 r19
#define BBO r20
#define o8 r21
#define I r22
#define J r23
#define AO r24
#define BO r25
#define CO r26
#define o16 r27
#define o32 r28
#define o48 r29

#define PRE r30
#define T1 r31
#ifndef NEEDPARAM

PROLOGUE
PROFCODE

/* Save SP, then open 4*STACKSIZE of frame (kernel scratch + BBUFFER). */
mr FRAMEPOINTER, SP
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
li r0, 0

/* Save non-volatile FPRs f14-f31. */
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)

stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)

stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)

/* Save non-volatile GPRs r14-r31 (word or doubleword sized). */
#ifdef __64BIT__
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
#else
stw r31, 144(SP)
stw r30, 148(SP)
stw r29, 152(SP)
stw r28, 156(SP)
stw r27, 160(SP)
stw r26, 164(SP)
stw r25, 168(SP)
stw r24, 172(SP)
stw r23, 176(SP)
stw r22, 180(SP)
stw r21, 184(SP)
stw r20, 188(SP)
stw r19, 192(SP)
stw r18, 196(SP)
stw r17, 200(SP)
stw r16, 204(SP)
stw r15, 208(SP)
stw r14, 212(SP)
#endif

/* Spill scalar alpha (f1 = real, f2 = imag) to the frame. */
stfs f1, ALPHA_R_SP
stfs f2, ALPHA_I_SP
// stw r0, FZERO

/* Fetch stack-passed arguments (LDC, and B/C on 32-bit AIX DOUBLE)
 * from the caller's frame via FRAMEPOINTER. */
#ifdef linux
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#else
#ifdef DOUBLE
lwz B, FRAMESLOT(0) + 0(FRAMEPOINTER)
lwz C, FRAMESLOT(1) + 0(FRAMEPOINTER)
lwz LDC, FRAMESLOT(2) + 0(FRAMEPOINTER)
#else
lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif
#endif
#endif

/* TRMM variant: OFFSET is a further stack argument; KK starts at
 * -OFFSET for the right-side case. */
#ifdef TRMMKERNEL
#if defined(linux) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#endif

#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#else
#ifdef DOUBLE
lwz OFFSET, FRAMESLOT(3) + 0(FRAMEPOINTER)
#else
lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#endif
#endif
#endif
#if defined(TRMMKERNEL) && !defined(LEFT)
neg KK, OFFSET
#endif
#endif

#include "cgemm_macros_8x4_power8.S"

/* Trivial sizes: nothing to do. */
cmpwi cr0, M, 0
ble L999_H1
cmpwi cr0, N, 0
ble L999_H1
cmpwi cr0, K, 0
ble L999_H1

/* LDC in bytes; set up the small byte-offset constants used by the
 * load/store macros, and the prefetch distance. */
slwi LDC, LDC, ZBASE_SHIFT
li PRE, 384
li o4 , 4
li o8 , 8
li o12 , 12
li o16 , 16
li o32 , 32
li o48 , 48

/* BBUFFER = 4096-byte-aligned scratch inside the frame. */
addi BBUFFER, SP, 512+4096
li T1, -4096
and BBUFFER, BBUFFER, T1

/* Reload alpha from the frame into alpha_dr/alpha_di, then build the
 * splatted vectors alpha_sr = {ar,0,0,ar-lane pattern} via stores to
 * a zeroed 16-byte slot and a vector reload. */
#ifdef __64BIT__
addi T1 , SP, 296
#else
addi T1 , SP, 224
#endif

stxsspx vs1, 0, T1
lxsspx alpha_dr, 0, T1
stxsspx vs2, o8 , T1
lxsspx alpha_di, o8, T1

addi T1, SP, 360
li T2, 0

stw T2, 0(T1)
stw T2, 4(T1)
stw T2, 8(T1)
stxsspx alpha_dr, o12, T1
lxvw4x alpha_sr, o0 , T1

addi T1, T1, 16

stw T2, 0(T1)
stw T2, 4(T1)
stw T2, 8(T1)
stxsspx alpha_di, o12, T1
lxvw4x alpha_si, o0 , T1

.align 5

#include "cgemm_logic_8x4_power8.S"

/* Common exit: return 0, restore all saved FPRs/GPRs, pop the frame. */
L999:
addi r3, 0, 0

lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)

lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)

lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)

lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)

lfd f30, 128(SP)
lfd f31, 136(SP)

#ifdef __64BIT__
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
#else
lwz r31, 144(SP)
lwz r30, 148(SP)
lwz r29, 152(SP)
lwz r28, 156(SP)
lwz r27, 160(SP)
lwz r26, 164(SP)
lwz r25, 168(SP)
lwz r24, 172(SP)
lwz r23, 176(SP)
lwz r22, 180(SP)
lwz r21, 184(SP)
lwz r20, 188(SP)
lwz r19, 192(SP)
lwz r18, 196(SP)
lwz r17, 200(SP)
lwz r16, 204(SP)
lwz r15, 208(SP)
lwz r14, 212(SP)
#endif

addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
blr

EPILOGUE
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

175
kernel/power/cswap.c Normal file
View File

@ -0,0 +1,175 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/27 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#include "common.h"
#if defined(POWER8)
#include "cswap_microk_power8.c"
#endif
#ifndef HAVE_KERNEL_32
/* Scalar fallback kernel for cswap: exchange n complex single-precision
 * elements (2*n floats) between x and y.  The caller only passes
 * multiples of 32, so the 4-per-pass stride below terminates exactly
 * at n.  Both 8-float groups are snapshotted before either array is
 * written, like the original. */
static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
	FLOAT a0, a1, a2, a3, a4, a5, a6, a7;   /* snapshot of x */
	FLOAT b0, b1, b2, b3, b4, b5, b6, b7;   /* snapshot of y */
	FLOAT *px = x;
	FLOAT *py = y;
	BLASLONG done;

	for (done = 0; done < n; done += 4)
	{
		a0 = px[0]; a1 = px[1]; a2 = px[2]; a3 = px[3];
		a4 = px[4]; a5 = px[5]; a6 = px[6]; a7 = px[7];

		b0 = py[0]; b1 = py[1]; b2 = py[2]; b3 = py[3];
		b4 = py[4]; b5 = py[5]; b6 = py[6]; b7 = py[7];

		py[0] = a0; py[1] = a1; py[2] = a2; py[3] = a3;
		py[4] = a4; py[5] = a5; py[6] = a6; py[7] = a7;

		px[0] = b0; px[1] = b1; px[2] = b2; px[3] = b3;
		px[4] = b4; px[5] = b5; px[6] = b6; px[7] = b7;

		px += 8;
		py += 8;
	}

	return;
}
#endif
/* cswap: exchange n complex single-precision elements between x and y,
 * with strides inc_x / inc_y in complex elements.  The dummy arguments
 * exist only to match the generic kernel signature.  Does nothing for
 * n <= 0; always returns 0. */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG i = 0;
	BLASLONG ix = 0, iy = 0;
	FLOAT re, im;

	if (n <= 0) return (0);

	if ((inc_x == 1) && (inc_y == 1))
	{
		/* Swap the largest multiple of 32 via the vector kernel. */
		BLASLONG n1 = n & -32;

		if (n1 > 0)
		{
			cswap_kernel_32(n1, x, y);
			i  = n1;
			ix = 2 * n1;
			iy = 2 * n1;
		}

		/* Scalar tail: fewer than 32 complex elements remain. */
		for (; i < n; i++)
		{
			re = x[ix];
			im = x[ix + 1];
			x[ix]     = y[iy];
			x[ix + 1] = y[iy + 1];
			y[iy]     = re;
			y[iy + 1] = im;
			ix += 2;
			iy += 2;
		}
	}
	else
	{
		BLASLONG inc_x2 = 2 * inc_x;
		BLASLONG inc_y2 = 2 * inc_y;

		for (; i < n; i++)
		{
			re = x[ix];
			im = x[ix + 1];
			x[ix]     = y[iy];
			x[ix + 1] = y[iy + 1];
			y[iy]     = re;
			y[iy + 1] = im;
			ix += inc_x2;
			iy += inc_y2;
		}
	}

	return (0);
}

View File

@ -0,0 +1,180 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/27 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#define HAVE_KERNEL_32 1

/* POWER8 VSX kernel: exchange 2*n floats (n complex single-precision
 * elements) between x and y, n a multiple of 32 (caller masks with
 * n & -32).  Per iteration: load 256 bytes of x into vs32-47 and 256
 * bytes of y into vs48-63, then store x's data to y and y's data to x.
 * x2/y2 are x+1/y+1; the asm first rewinds them by 4 bytes so the
 * stores through %3/%4 land back on the original base addresses.
 * noinline keeps the asm's register assumptions stable. */
static void cswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));

static void cswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
{
/* Loop counter (complex elements remaining). */
BLASLONG i = n;
/* Byte offsets for the 8 vector loads/stores per 128-byte chunk. */
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x;
FLOAT *y1=y;
FLOAT *x2=x+1;
FLOAT *y2=y+1;
/* pre/alpha are unused by this asm body (kept for operand-layout
 * symmetry with the sibling kernels). */
BLASLONG pre = 384;
BLASLONG alpha=0;

__asm__ __volatile__
(
/* Rewind the +1-element store pointers to the array bases. */
"addi %3, %3, -4 \n\t"
"addi %4, %4, -4 \n\t"

".align 5 \n\t"
"1: \n\t"

/* Load 256 bytes from x into vs32-47. */
"lxvw4x 32, 0, %2 \n\t"
"lxvw4x 33, %5, %2 \n\t"
"lxvw4x 34, %6, %2 \n\t"
"lxvw4x 35, %7, %2 \n\t"
"lxvw4x 36, %8, %2 \n\t"
"lxvw4x 37, %9, %2 \n\t"
"lxvw4x 38, %10, %2 \n\t"
"lxvw4x 39, %11, %2 \n\t"

"addi %2, %2, 128 \n\t"

"lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %5, %2 \n\t"
"lxvw4x 42, %6, %2 \n\t"
"lxvw4x 43, %7, %2 \n\t"
"lxvw4x 44, %8, %2 \n\t"
"lxvw4x 45, %9, %2 \n\t"
"lxvw4x 46, %10, %2 \n\t"
"lxvw4x 47, %11, %2 \n\t"

"addi %2, %2, 128 \n\t"

/* Load 256 bytes from y into vs48-63. */
"lxvw4x 48, 0, %1 \n\t"
"lxvw4x 49, %5, %1 \n\t"
"lxvw4x 50, %6, %1 \n\t"
"lxvw4x 51, %7, %1 \n\t"
"lxvw4x 52, %8, %1 \n\t"
"lxvw4x 53, %9, %1 \n\t"
"lxvw4x 54, %10, %1 \n\t"
"lxvw4x 55, %11, %1 \n\t"

"addi %1, %1, 128 \n\t"

"lxvw4x 56, 0, %1 \n\t"
"lxvw4x 57, %5, %1 \n\t"
"lxvw4x 58, %6, %1 \n\t"
"lxvw4x 59, %7, %1 \n\t"
"lxvw4x 60, %8, %1 \n\t"
"lxvw4x 61, %9, %1 \n\t"
"lxvw4x 62, %10, %1 \n\t"
"lxvw4x 63, %11, %1 \n\t"

"addi %1, %1, 128 \n\t"

/* Store x's data (vs32-47) to y via %3. */
"stxvw4x 32, 0, %3 \n\t"
"stxvw4x 33, %5, %3 \n\t"
"stxvw4x 34, %6, %3 \n\t"
"stxvw4x 35, %7, %3 \n\t"
"stxvw4x 36, %8, %3 \n\t"
"stxvw4x 37, %9, %3 \n\t"
"stxvw4x 38, %10, %3 \n\t"
"stxvw4x 39, %11, %3 \n\t"

"addi %3, %3, 128 \n\t"

"stxvw4x 40, 0, %3 \n\t"
"stxvw4x 41, %5, %3 \n\t"
"stxvw4x 42, %6, %3 \n\t"
"stxvw4x 43, %7, %3 \n\t"
"stxvw4x 44, %8, %3 \n\t"
"stxvw4x 45, %9, %3 \n\t"
"stxvw4x 46, %10, %3 \n\t"
"stxvw4x 47, %11, %3 \n\t"

"addi %3, %3, 128 \n\t"

/* Store y's data (vs48-63) to x via %4. */
"stxvw4x 48, 0, %4 \n\t"
"stxvw4x 49, %5, %4 \n\t"
"stxvw4x 50, %6, %4 \n\t"
"stxvw4x 51, %7, %4 \n\t"
"stxvw4x 52, %8, %4 \n\t"
"stxvw4x 53, %9, %4 \n\t"
"stxvw4x 54, %10, %4 \n\t"
"stxvw4x 55, %11, %4 \n\t"

"addi %4, %4, 128 \n\t"

"stxvw4x 56, 0, %4 \n\t"
"stxvw4x 57, %5, %4 \n\t"
"stxvw4x 58, %6, %4 \n\t"
"stxvw4x 59, %7, %4 \n\t"
"stxvw4x 60, %8, %4 \n\t"
"stxvw4x 61, %9, %4 \n\t"
"stxvw4x 62, %10, %4 \n\t"
"stxvw4x 63, %11, %4 \n\t"

"addi %4, %4, 128 \n\t"

"addic. %0 , %0 , -32 \n\t"
"bgt 1b \n\t"

"2: \n\t"

:
:
"r" (i), // 0
"r" (y1), // 1
"r" (x1), // 2
"r" (y2), // 3
"r" (x2), // 4
"r" (o16), // 5
"r" (o32), // 6
"r" (o48), // 7
"r" (o64), // 8
"r" (o80), // 9
"r" (o96), // 10
/* NOTE(review): %0-%4 are inputs modified in the asm body and listed
 * as clobbers; modern GCC would want "+r" outputs.  Left as-is. */
"r" (o112) // 11
: "cr0", "%0", "%2" , "%1", "%3", "%4", "memory"
);

}

View File

@ -0,0 +1,399 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/04 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"
/* LOAD: integer load of natural pointer width (32-bit lwz vs 64-bit ld). */
#ifndef __64BIT__
#define LOAD lwz
#else
#define LOAD ld
#endif
/* Stack frame layout: room for f14-f31, the callee-saved GPRs and the
   alpha spill slots used below. ALPHA_R_SP/ALPHA_I_SP are the offsets
   where the incoming scalar alpha (f1/f2) is spilled to memory. */
#ifdef __64BIT__
#define STACKSIZE 400
#define ALPHA_R_SP 304(SP)
#define ALPHA_I_SP 312(SP)
#else
#define STACKSIZE 256
#define ALPHA_R_SP 224(SP)
#define ALPHA_I_SP 232(SP)
#define FZERO 240(SP)
#endif
/* Problem dimensions arrive in r3-r5 on every ABI. */
#define M r3
#define N r4
#define K r5
/* Remaining argument registers differ per ABI; LDC/OFFSET may instead be
   loaded from the caller's frame further down (FRAMESLOT loads). */
#ifdef linux
#ifndef __64BIT__
#define A r6
#define B r7
#define C r8
#define LDC r9
#define OFFSET r10
#else
#define A r8
#define B r9
#define C r10
#define LDC r6
#define OFFSET r7
#endif
#endif
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define A r10
#define B r6
#define C r7
#define LDC r8
#define OFFSET r9
#else
#define A r8
#define B r9
#define C r10
#define LDC r6
#define OFFSET r7
#endif
#endif
/* Symbolic names for the scratch registers used by the included
   macro/logic files. alpha_dr/di hold alpha as scalars, alpha_sr/si as
   splatted vectors built at lines below. */
#define o0 0
#define alpha_dr vs28
#define alpha_di vs29
#define alpha_sr vs30
#define alpha_si vs31
#define o12 r12
#define KKK r13
#define K1 r14
#define L r15
#define o16 r16
#define NOTUSED r17
#define T2 r19
#define KK r20
#define o8 r21
#define I r22
#define J r23
#define AO r24
#define BO r25
#define CO r26
#define o4 r27
#define o32 r28
#define o48 r29
#define PRE r30
#define T1 r31
#ifndef NEEDPARAM
PROLOGUE
PROFCODE
/* --- prologue: allocate the frame and save all callee-saved state --- */
addi SP, SP, -STACKSIZE
li r0, 0
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
#ifdef __64BIT__
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
std r13, 288(SP)
std r12, 296(SP)
#else
stw r31, 144(SP)
stw r30, 148(SP)
stw r29, 152(SP)
stw r28, 156(SP)
stw r27, 160(SP)
stw r26, 164(SP)
stw r25, 168(SP)
stw r24, 172(SP)
stw r23, 176(SP)
stw r22, 180(SP)
stw r21, 184(SP)
stw r20, 188(SP)
stw r19, 192(SP)
stw r18, 196(SP)
stw r17, 200(SP)
stw r16, 204(SP)
stw r15, 208(SP)
stw r14, 212(SP)
stw r13, 216(SP)
#endif
/* Spill the single-precision alpha (real in f1, imaginary in f2) so it
   can be reloaded into VSX registers below. */
stfs f1, ALPHA_R_SP
stfs f2, ALPHA_I_SP
// stw r0, FZERO
/* Arguments that did not fit in registers are fetched from the caller's
   frame; which slot holds what depends on ABI and word size. */
#ifdef linux
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
#endif
#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#else
#ifdef DOUBLE
lwz B, FRAMESLOT(0) + STACKSIZE(SP)
lwz C, FRAMESLOT(1) + STACKSIZE(SP)
lwz LDC, FRAMESLOT(2) + STACKSIZE(SP)
#else
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
#endif
#endif
#ifdef TRMMKERNEL
#if defined(linux) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#else
#ifdef DOUBLE
lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP)
#else
lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
#endif
#endif
/* For the right-side TRMM variant KK starts at -OFFSET. */
#if defined(TRMMKERNEL) && !defined(LEFT)
neg KK, OFFSET
#endif
#endif
#include "ctrmm_macros_8x4_power8.S"
/* Trivial sizes: nothing to do (L999_H1 is defined in the included
   logic file). */
cmpwi cr0, M, 0
ble L999_H1
cmpwi cr0, N, 0
ble L999_H1
cmpwi cr0, K, 0
ble L999_H1
/* Convert LDC from elements to bytes (ZBASE_SHIFT: complex element
   size) and preload the small byte-offset constants used as index
   registers by the macro file. PRE is the prefetch distance. */
slwi LDC, LDC, ZBASE_SHIFT
li PRE, 384
li o4 , 4
li o8 , 8
li o12 , 12
li o16 , 16
li o32 , 32
li o48 , 48
/* Reload alpha from its spill slots ... */
#ifdef __64BIT__
addi T1, SP, 304
#else
addi T1, SP, 224
#endif
lxsspx alpha_dr, 0, T1
lxsspx alpha_di, o8, T1
/* ... then build vector copies: write {0,0,0,alpha} to scratch stack
   memory and load it back as a 4 x float vector (alpha_sr / alpha_si).
   NOTE(review): scratch at SP+360 appears to be inside the frame only
   for the 64-bit STACKSIZE=400 layout - confirm for 32-bit builds. */
addi T1, SP, 360
li T2, 0
stw T2, 0(T1)
stw T2, 4(T1)
stw T2, 8(T1)
stxsspx alpha_dr, o12, T1
lxvw4x alpha_sr, o0 , T1
addi T1, T1, 16
stw T2, 0(T1)
stw T2, 4(T1)
stw T2, 8(T1)
stxsspx alpha_di, o12, T1
lxvw4x alpha_si, o0 , T1
.align 5
/* The actual 8x4 blocked TRMM loop nest lives in the included file. */
#include "ctrmm_logic_8x4_power8.S"
/* --- epilogue: return 0 in r3 and restore all saved state --- */
L999:
addi r3, 0, 0
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
#ifdef __64BIT__
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
ld r13, 288(SP)
ld r12, 296(SP)
#else
lwz r31, 144(SP)
lwz r30, 148(SP)
lwz r29, 152(SP)
lwz r28, 156(SP)
lwz r27, 160(SP)
lwz r26, 164(SP)
lwz r25, 168(SP)
lwz r24, 172(SP)
lwz r23, 176(SP)
lwz r22, 180(SP)
lwz r21, 184(SP)
lwz r20, 188(SP)
lwz r19, 192(SP)
lwz r18, 196(SP)
lwz r17, 200(SP)
lwz r16, 204(SP)
lwz r15, 208(SP)
lwz r14, 212(SP)
lwz r13, 216(SP)
#endif
addi SP, SP, STACKSIZE
blr
EPILOGUE
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

144
kernel/power/dasum.c Normal file
View File

@ -0,0 +1,144 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/28 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#if defined(POWER8)
#include "dasum_microk_power8.c"
#endif
#ifndef HAVE_KERNEL_16
static void dasum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
{
	/* Portable fallback for the absolute-sum micro kernel.  Processes the
	 * first n elements of x1 (the caller passes n as a multiple of 16) and
	 * returns the result split across svec[0]/svec[1], mirroring the
	 * two-slot output of the vector kernel; this fallback keeps svec[1]
	 * at zero.  Four independent accumulators break the dependency chain.
	 */
	FLOAT acc0 = 0.0;
	FLOAT acc1 = 0.0;
	FLOAT acc2 = 0.0;
	FLOAT acc3 = 0.0;
	FLOAT *p = x1;
	BLASLONG k;

	for (k = 0; k < n; k += 8)
	{
		/* Same per-accumulator addition order as before, so the
		 * floating-point result is bit-identical. */
		acc0 += ABS(p[0]);
		acc1 += ABS(p[1]);
		acc2 += ABS(p[2]);
		acc3 += ABS(p[3]);
		acc0 += ABS(p[4]);
		acc1 += ABS(p[5]);
		acc2 += ABS(p[6]);
		acc3 += ABS(p[7]);
		p += 8;
	}

	svec[0] = acc0 + acc1 + acc2 + acc3;
	svec[1] = 0.0;
}
#endif
/* dasum/sasum driver: returns sum(|x[i]|) over n elements with stride
 * inc_x.  Returns 0 for n <= 0 or inc_x <= 0 (BLAS convention).
 * Fix: removed the stray empty statement (double semicolon) after the
 * svec declaration.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	BLASLONG i = 0;
	FLOAT sumf = 0.0;
	/* Two-slot result buffer for the micro kernel; 16-byte alignment is
	 * required by the vector store in the POWER8 kernel. */
	FLOAT svec[2] __attribute__ ((aligned (16)));
	BLASLONG n1;

	if (n <= 0 || inc_x <= 0) return(sumf);

	if ( inc_x == 1 )
	{
		/* Bulk of the work in 16-element chunks via the (possibly
		 * vectorized) kernel, scalar loop for the remainder. */
		n1 = n & -16;
		if ( n1 > 0 )
		{
			dasum_kernel_16(n1, x, svec);
			sumf = svec[0] + svec[1];
			i = n1;
		}

		while (i < n)
		{
			sumf += ABS(x[i]);
			i++;
		}
	}
	else
	{
		/* Strided case: walk indices 0, inc_x, 2*inc_x, ... by turning
		 * n into the (exclusive) index bound n*inc_x. */
		n *= inc_x;
		while (i < n)
		{
			sumf += ABS(x[i]);
			i += inc_x;
		}
	}
	return(sumf);
}

View File

@ -0,0 +1,177 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/28 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#define HAVE_KERNEL_16 1
/*
 * POWER8 VSX micro kernel for dasum: accumulates sum(|x[i]|) over n
 * elements (n must be a positive multiple of 16) and stores two partial
 * sums to svec[0..1]; the caller adds them.  noinline keeps the asm's
 * register assumptions away from the caller's optimizer.
 */
static void dasum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline));
static void dasum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec)
{
/* Loop counter and the byte offsets used as index registers (%5..%11). */
BLASLONG i = n;
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x;
BLASLONG pre = 384;  /* prefetch distance in bytes */
__asm__ __volatile__
(
/* Clear the eight accumulators vs32-vs39 and preload the first
   16 doubles into vs40-vs47 (software pipelining). */
"dcbt %2 , %4 \n\t"
"xxlxor 32,32,32 \n\t"
"xxlxor 33,33,33 \n\t"
"xxlxor 34,34,34 \n\t"
"xxlxor 35,35,35 \n\t"
"xxlxor 36,36,36 \n\t"
"xxlxor 37,37,37 \n\t"
"xxlxor 38,38,38 \n\t"
"xxlxor 39,39,39 \n\t"
"lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %5, %2 \n\t"
"lxvd2x 42, %6, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t"
"lxvd2x 44, %8, %2 \n\t"
"lxvd2x 45, %9, %2 \n\t"
"lxvd2x 46, %10, %2 \n\t"
"lxvd2x 47, %11, %2 \n\t"
"addi %2, %2, 128 \n\t"
"addic. %0 , %0 , -16 \n\t"
"ble 2f \n\t"
".align 5 \n\t"
/* Main loop: |.| and accumulate the previous 16 values while the
   next 16 are being loaded. */
"1: \n\t"
"dcbt %2 , %4 \n\t"
"xvabsdp 48, 40 \n\t"
"xvabsdp 49, 41 \n\t"
"xvabsdp 50, 42 \n\t"
"xvabsdp 51, 43 \n\t"
"lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %5, %2 \n\t"
"xvabsdp 52, 44 \n\t"
"xvabsdp 53, 45 \n\t"
"lxvd2x 42, %6, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t"
"xvabsdp 54, 46 \n\t"
"xvabsdp 55, 47 \n\t"
"lxvd2x 44, %8, %2 \n\t"
"lxvd2x 45, %9, %2 \n\t"
"xvadddp 32, 32, 48 \n\t"
"xvadddp 33, 33, 49 \n\t"
"lxvd2x 46, %10, %2 \n\t"
"lxvd2x 47, %11, %2 \n\t"
"xvadddp 34, 34, 50 \n\t"
"xvadddp 35, 35, 51 \n\t"
"addi %2, %2, 128 \n\t"
"xvadddp 36, 36, 52 \n\t"
"xvadddp 37, 37, 53 \n\t"
"addic. %0 , %0 , -16 \n\t"
"xvadddp 38, 38, 54 \n\t"
"xvadddp 39, 39, 55 \n\t"
"bgt 1b \n\t"
/* Drain the last in-flight 16 values, then reduce the eight
   accumulators down to vs32 and store its two doubles to svec. */
"2: \n\t"
"xvabsdp 48, 40 \n\t"
"xvabsdp 49, 41 \n\t"
"xvabsdp 50, 42 \n\t"
"xvabsdp 51, 43 \n\t"
"xvabsdp 52, 44 \n\t"
"xvabsdp 53, 45 \n\t"
"xvabsdp 54, 46 \n\t"
"xvabsdp 55, 47 \n\t"
"xvadddp 32, 32, 48 \n\t"
"xvadddp 33, 33, 49 \n\t"
"xvadddp 34, 34, 50 \n\t"
"xvadddp 35, 35, 51 \n\t"
"xvadddp 36, 36, 52 \n\t"
"xvadddp 37, 37, 53 \n\t"
"xvadddp 38, 38, 54 \n\t"
"xvadddp 39, 39, 55 \n\t"
"xvadddp 32, 32, 33 \n\t"
"xvadddp 34, 34, 35 \n\t"
"xvadddp 36, 36, 37 \n\t"
"xvadddp 38, 38, 39 \n\t"
"xvadddp 32, 32, 34 \n\t"
"xvadddp 36, 36, 38 \n\t"
"xvadddp 32, 32, 36 \n\t"
"stxvd2x 32, 0, %3 \n\t"
:
:
"r" (i), // 0
"r" (n), // 1  (unused in the template)
"r" (x1), // 2
"r" (svec), // 3
"r" (pre), // 4
"r" (o16), // 5
"r" (o32), // 6
"r" (o48), // 7
"r" (o64), // 8
"r" (o80), // 9
"r" (o96), // 10
"r" (o112) // 11
/* NOTE(review): the VSX registers written above are not listed as
   clobbers - relies on the surrounding code not holding values in
   vs32-vs55; confirm against the ABI / later upstream fixes. */
: "cr0", "%0", "%2", "memory"
);
}

136
kernel/power/daxpy.c Normal file
View File

@ -0,0 +1,136 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/22 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#include "common.h"
#if defined(POWER8)
#include "daxpy_microk_power8.c"
#endif
#ifndef HAVE_KERNEL_8
static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
	/* Portable fallback for the axpy micro kernel: y[k] += a * x[k] for
	 * k in [0, n), unrolled by eight.  The caller guarantees n is a
	 * multiple of the unroll factor.  Only alpha[0] is read here; the
	 * vector kernel consumes the replicated slots. */
	const FLOAT a = *alpha;
	BLASLONG k;

	for (k = 0; k < n; k += 8)
	{
		y[k + 0] += a * x[k + 0];
		y[k + 1] += a * x[k + 1];
		y[k + 2] += a * x[k + 2];
		y[k + 3] += a * x[k + 3];
		y[k + 4] += a * x[k + 4];
		y[k + 5] += a * x[k + 5];
		y[k + 6] += a * x[k + 6];
		y[k + 7] += a * x[k + 7];
	}
}
#endif
/* daxpy driver: y := da*x + y over n elements with strides inc_x/inc_y.
 * The dummy* parameters exist only to satisfy the common kernel
 * signature.  Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	/* The micro kernel wants the scalar replicated four times. */
	FLOAT alpha_arr[4] = { da, da, da, da };
	BLASLONG idx;

	if (n <= 0) return(0);

	if ((inc_x == 1) && (inc_y == 1))
	{
		/* Unit strides: hand 16-element multiples to the kernel and
		 * finish the tail with scalar updates. */
		BLASLONG bulk = n & -16;

		if (bulk)
			daxpy_kernel_8(bulk, x, y, alpha_arr);

		for (idx = bulk; idx < n; idx++)
			y[idx] += da * x[idx];

		return(0);
	}

	/* General strides: four updates per iteration, then the remainder. */
	BLASLONG unrolled = n & -4;
	BLASLONG xi = 0;
	BLASLONG yi = 0;

	for (idx = 0; idx < unrolled; idx += 4)
	{
		FLOAT p0 = da * x[xi];
		FLOAT p1 = da * x[xi + inc_x];
		FLOAT p2 = da * x[xi + 2 * inc_x];
		FLOAT p3 = da * x[xi + 3 * inc_x];

		y[yi]             += p0;
		y[yi + inc_y]     += p1;
		y[yi + 2 * inc_y] += p2;
		y[yi + 3 * inc_y] += p3;

		xi += 4 * inc_x;
		yi += 4 * inc_y;
	}

	for (; idx < n; idx++)
	{
		y[yi] += da * x[xi];
		xi += inc_x;
		yi += inc_y;
	}

	return(0);
}

View File

@ -0,0 +1,201 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/22 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#define HAVE_KERNEL_8 1
/*
 * POWER8 VSX micro kernel for daxpy: y += alpha * x over n elements
 * (caller passes n as a multiple of 16).  alpha points at a 4-double
 * replicated array; the load below reads alpha[2] (offset 16) and
 * splats it across a vector register.
 */
static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG i = n;
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
FLOAT *x1=x;
FLOAT *y1=y;
/* y2 starts one element (8 bytes) past y and is pulled back by 8 bytes
   in the asm - a second register aliasing y, used for the stores so
   loads (via %3) and stores (via %8) advance independently. */
FLOAT *y2=y+1;
BLASLONG pre = 384;  /* prefetch distance in bytes */
__asm__ __volatile__
(
/* Load alpha[2] and splat it into both doubles of vs32. */
"lxsdx 33, %5, %4 \n\t"
"xxspltd 32, 33, 0 \n\t"
"addi %8, %8, -8 \n\t"
/* Preload the first 16 x values (vs40-47) and 16 y values (vs48-55). */
"dcbt %2, %9 \n\t"
"dcbt %3, %9 \n\t"
"lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %5, %2 \n\t"
"lxvd2x 42, %6, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t"
"lxvd2x 48, 0, %3 \n\t"
"lxvd2x 49, %5, %3 \n\t"
"lxvd2x 50, %6, %3 \n\t"
"lxvd2x 51, %7, %3 \n\t"
"addi %2, %2, 64 \n\t"
"addi %3, %3, 64 \n\t"
"lxvd2x 44, 0, %2 \n\t"
"lxvd2x 45, %5, %2 \n\t"
"lxvd2x 46, %6, %2 \n\t"
"lxvd2x 47, %7, %2 \n\t"
"lxvd2x 52, 0, %3 \n\t"
"lxvd2x 53, %5, %3 \n\t"
"lxvd2x 54, %6, %3 \n\t"
"lxvd2x 55, %7, %3 \n\t"
"addi %2, %2, 64 \n\t"
"addi %3, %3, 64 \n\t"
"addic. %0 , %0 , -16 \n\t"
"ble 2f \n\t"
".align 5 \n\t"
/* Main loop: fused multiply-add y += a*x on the 16 in-flight values,
   overlapped with loads of the next 16 and stores of the results. */
"1: \n\t"
"dcbt %2, %9 \n\t"
"dcbt %3, %9 \n\t"
"xvmaddadp 48, 40, 32 \n\t"
"xvmaddadp 49, 41, 32 \n\t"
"lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %5, %2 \n\t"
"stxvd2x 48, 0, %8 \n\t"
"stxvd2x 49, %5, %8 \n\t"
"xvmaddadp 50, 42, 32 \n\t"
"xvmaddadp 51, 43, 32 \n\t"
"lxvd2x 42, %6, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t"
"stxvd2x 50, %6, %8 \n\t"
"stxvd2x 51, %7, %8 \n\t"
"lxvd2x 48, 0, %3 \n\t"
"lxvd2x 49, %5, %3 \n\t"
"lxvd2x 50, %6, %3 \n\t"
"lxvd2x 51, %7, %3 \n\t"
"addi %2, %2, 64 \n\t"
"addi %8, %8, 64 \n\t"
"xvmaddadp 52, 44, 32 \n\t"
"addi %3, %3, 64 \n\t"
"xvmaddadp 53, 45, 32 \n\t"
"lxvd2x 44, 0, %2 \n\t"
"lxvd2x 45, %5, %2 \n\t"
"stxvd2x 52, 0, %8 \n\t"
"stxvd2x 53, %5, %8 \n\t"
"xvmaddadp 54, 46, 32 \n\t"
"xvmaddadp 55, 47, 32 \n\t"
"lxvd2x 46, %6, %2 \n\t"
"lxvd2x 47, %7, %2 \n\t"
"stxvd2x 54, %6, %8 \n\t"
"stxvd2x 55, %7, %8 \n\t"
"addi %2, %2, 64 \n\t"
"addi %8, %8, 64 \n\t"
"lxvd2x 52, 0, %3 \n\t"
"lxvd2x 53, %5, %3 \n\t"
"lxvd2x 54, %6, %3 \n\t"
"lxvd2x 55, %7, %3 \n\t"
"addi %3, %3, 64 \n\t"
"addic. %0 , %0 , -16 \n\t"
"bgt 1b \n\t"
/* Drain: finish and store the final 16 in-flight elements. */
"2: \n\t"
"xvmaddadp 48, 40, 32 \n\t"
"xvmaddadp 49, 41, 32 \n\t"
"xvmaddadp 50, 42, 32 \n\t"
"xvmaddadp 51, 43, 32 \n\t"
"xvmaddadp 52, 44, 32 \n\t"
"xvmaddadp 53, 45, 32 \n\t"
"xvmaddadp 54, 46, 32 \n\t"
"xvmaddadp 55, 47, 32 \n\t"
"stxvd2x 48, 0, %8 \n\t"
"stxvd2x 49, %5, %8 \n\t"
"stxvd2x 50, %6, %8 \n\t"
"stxvd2x 51, %7, %8 \n\t"
"addi %8, %8, 64 \n\t"
"stxvd2x 52, 0, %8 \n\t"
"stxvd2x 53, %5, %8 \n\t"
"stxvd2x 54, %6, %8 \n\t"
"stxvd2x 55, %7, %8 \n\t"
"addi %8, %8, 64 \n\t"
:
:
"r" (i), // 0
"r" (n), // 1  (unused in the template)
"r" (x1), // 2
"r" (y1), // 3
"r" (alpha), // 4
"r" (o16), // 5
"r" (o32), // 6
"r" (o48), // 7
"r" (y2), // 8
"r" (pre) // 9
/* NOTE(review): input operands %0/%2/%3/%8 are modified by the asm and
   the VSX registers are not declared as clobbers; this pattern was
   tightened in later upstream revisions - verify before reuse. */
: "cr0", "%0", "%2" , "%3", "%8", "memory"
);
}

131
kernel/power/dcopy.c Normal file
View File

@ -0,0 +1,131 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#include "common.h"
#if defined(POWER8)
#include "dcopy_microk_power8.c"
#endif
#ifndef HAVE_KERNEL_32
static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
	/* Portable fallback for the copy micro kernel: y[0..n) = x[0..n),
	 * eight elements per iteration (the caller passes n as a multiple
	 * of 32).  Values are staged in locals before being written, the
	 * same read-all/write-all order as the original. */
	FLOAT *src = x;
	FLOAT *dst = y;
	BLASLONG k;

	for (k = 0; k < n; k += 8)
	{
		FLOAT v0 = src[0];
		FLOAT v1 = src[1];
		FLOAT v2 = src[2];
		FLOAT v3 = src[3];
		FLOAT v4 = src[4];
		FLOAT v5 = src[5];
		FLOAT v6 = src[6];
		FLOAT v7 = src[7];

		dst[0] = v0;
		dst[1] = v1;
		dst[2] = v2;
		dst[3] = v3;
		dst[4] = v4;
		dst[5] = v5;
		dst[6] = v6;
		dst[7] = v7;

		src += 8;
		dst += 8;
	}
}
#endif
/* dcopy driver: y := x over n elements with strides inc_x/inc_y.
 * Always returns 0; n <= 0 is a no-op.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	BLASLONG idx = 0;

	if (n <= 0) return(0);

	if ((inc_x == 1) && (inc_y == 1))
	{
		/* Unit strides: bulk copy in 32-element multiples via the
		 * kernel, scalar loop for the tail. */
		BLASLONG bulk = n & -32;

		if (bulk > 0)
		{
			dcopy_kernel_32(bulk, x, y);
			idx = bulk;
		}

		for (; idx < n; idx++)
			y[idx] = x[idx];
	}
	else
	{
		/* General strides: one element at a time. */
		BLASLONG xi = 0;
		BLASLONG yi = 0;

		for (; idx < n; idx++)
		{
			y[yi] = x[xi];
			xi += inc_x;
			yi += inc_y;
		}
	}

	return(0);
}

View File

@ -0,0 +1,174 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#define HAVE_KERNEL_32 1
/*
 * POWER8 VSX micro kernel for dcopy: copies n doubles from x to y
 * (caller passes n as a multiple of 32), 16 vector registers in flight
 * so loads of the next chunk overlap stores of the previous one.
 */
static void dcopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
static void dcopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
{
BLASLONG i = n;
/* Byte offsets used as index registers (%5..%11). */
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x;
FLOAT *y1=y;
/* pre (%4) and alpha (%3) are passed but never referenced by the
   template; kept to preserve operand numbering. */
BLASLONG pre = 384;
BLASLONG alpha=0;
__asm__ __volatile__
(
/* Preload 32 doubles into vs40-47 / vs50-57. */
"lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %5, %2 \n\t"
"lxvd2x 42, %6, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t"
"lxvd2x 44, %8, %2 \n\t"
"lxvd2x 45, %9, %2 \n\t"
"lxvd2x 46, %10, %2 \n\t"
"lxvd2x 47, %11, %2 \n\t"
"addi %2, %2, 128 \n\t"
"lxvd2x 50, 0, %2 \n\t"
"lxvd2x 51, %5, %2 \n\t"
"lxvd2x 52, %6, %2 \n\t"
"lxvd2x 53, %7, %2 \n\t"
"lxvd2x 54, %8, %2 \n\t"
"lxvd2x 55, %9, %2 \n\t"
"lxvd2x 56, %10, %2 \n\t"
"lxvd2x 57, %11, %2 \n\t"
"addi %2, %2, 128 \n\t"
"addic. %0 , %0 , -32 \n\t"
"ble 2f \n\t"
".align 5 \n\t"
/* Main loop: store the 32 in-flight values while loading the next 32. */
"1: \n\t"
"stxvd2x 40, 0, %1 \n\t"
"stxvd2x 41, %5, %1 \n\t"
"lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %5, %2 \n\t"
"stxvd2x 42, %6, %1 \n\t"
"stxvd2x 43, %7, %1 \n\t"
"lxvd2x 42, %6, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t"
"stxvd2x 44, %8, %1 \n\t"
"stxvd2x 45, %9, %1 \n\t"
"lxvd2x 44, %8, %2 \n\t"
"lxvd2x 45, %9, %2 \n\t"
"stxvd2x 46, %10, %1 \n\t"
"stxvd2x 47, %11, %1 \n\t"
"lxvd2x 46, %10, %2 \n\t"
"lxvd2x 47, %11, %2 \n\t"
"addi %1, %1, 128 \n\t"
"addi %2, %2, 128 \n\t"
"stxvd2x 50, 0, %1 \n\t"
"stxvd2x 51, %5, %1 \n\t"
"lxvd2x 50, 0, %2 \n\t"
"lxvd2x 51, %5, %2 \n\t"
"stxvd2x 52, %6, %1 \n\t"
"stxvd2x 53, %7, %1 \n\t"
"lxvd2x 52, %6, %2 \n\t"
"lxvd2x 53, %7, %2 \n\t"
"stxvd2x 54, %8, %1 \n\t"
"stxvd2x 55, %9, %1 \n\t"
"lxvd2x 54, %8, %2 \n\t"
"lxvd2x 55, %9, %2 \n\t"
"stxvd2x 56, %10, %1 \n\t"
"stxvd2x 57, %11, %1 \n\t"
"lxvd2x 56, %10, %2 \n\t"
"lxvd2x 57, %11, %2 \n\t"
"addi %1, %1, 128 \n\t"
"addi %2, %2, 128 \n\t"
"addic. %0 , %0 , -32 \n\t"
"bgt 1b \n\t"
/* Drain: flush the last 32 in-flight values. */
"2: \n\t"
"stxvd2x 40, 0, %1 \n\t"
"stxvd2x 41, %5, %1 \n\t"
"stxvd2x 42, %6, %1 \n\t"
"stxvd2x 43, %7, %1 \n\t"
"stxvd2x 44, %8, %1 \n\t"
"stxvd2x 45, %9, %1 \n\t"
"stxvd2x 46, %10, %1 \n\t"
"stxvd2x 47, %11, %1 \n\t"
"addi %1, %1, 128 \n\t"
"stxvd2x 50, 0, %1 \n\t"
"stxvd2x 51, %5, %1 \n\t"
"stxvd2x 52, %6, %1 \n\t"
"stxvd2x 53, %7, %1 \n\t"
"stxvd2x 54, %8, %1 \n\t"
"stxvd2x 55, %9, %1 \n\t"
"stxvd2x 56, %10, %1 \n\t"
"stxvd2x 57, %11, %1 \n\t"
:
:
"r" (i), // 0
"r" (y1), // 1
"r" (x1), // 2
"r" (alpha), // 3  (unused)
"r" (pre), // 4  (unused)
"r" (o16), // 5
"r" (o32), // 6
"r" (o48), // 7
"r" (o64), // 8
"r" (o80), // 9
"r" (o96), // 10
"r" (o112) // 11
/* NOTE(review): vs40-57 are written but not listed as clobbers;
   confirm against the surrounding code's register usage. */
: "cr0", "%0", "%2" , "%1", "memory"
);
}

139
kernel/power/ddot.c Normal file
View File

@ -0,0 +1,139 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/20 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#include "common.h"
#if defined(POWER8)
#include "ddot_microk_power8.c"
#endif
#ifndef HAVE_KERNEL_8
/* Portable fallback dot-product kernel: folds sum(x[k]*y[k]) over the
 * first n elements into *d.  The caller only passes n that is a
 * multiple of 16, so the 8-wide unrolled step never overruns. */
static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
{
	BLASLONG k;
	FLOAT acc = 0.0;

	/* Eight products per trip, summed as one expression (same
	 * association as the vector kernel's scalar reference). */
	for (k = 0; k < n; k += 8)
	{
		acc += x[k    ] * y[k    ]
		     + x[k + 1] * y[k + 1]
		     + x[k + 2] * y[k + 2]
		     + x[k + 3] * y[k + 3]
		     + x[k + 4] * y[k + 4]
		     + x[k + 5] * y[k + 5]
		     + x[k + 6] * y[k + 6]
		     + x[k + 7] * y[k + 7];
	}

	*d += acc;
}
#endif
/* ddot driver: returns sum(x[i*inc_x] * y[i*inc_y]) for i in [0, n).
 * Contiguous inputs are handed to the unrolled kernel; strided inputs
 * use a 4-way unrolled scalar loop with two accumulators. */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	FLOAT result = 0.0;

	if ( n <= 0 ) return(result);

	if ( (inc_x == 1) && (inc_y == 1) )
	{
		/* kernel consumes the largest multiple-of-16 prefix */
		BLASLONG head = n & -16;
		BLASLONG k;

		if ( head )
			ddot_kernel_8(head, x, y, &result);

		/* scalar cleanup for the remaining n % 16 elements */
		for ( k = head; k < n; k++ )
			result += y[k] * x[k];

		return(result);
	}

	{
		/* strided path: two running sums hide the add latency */
		FLOAT sum_a = 0.0;
		FLOAT sum_b = 0.0;
		BLASLONG k = 0;
		BLASLONG px = 0;
		BLASLONG py = 0;
		BLASLONG head = n & -4;

		while ( k < head )
		{
			FLOAT p0 = y[py]           * x[px];
			FLOAT p1 = y[py +   inc_y] * x[px +   inc_x];
			FLOAT p2 = y[py + 2*inc_y] * x[px + 2*inc_x];
			FLOAT p3 = y[py + 3*inc_y] * x[px + 3*inc_x];
			px += inc_x * 4;
			py += inc_y * 4;
			sum_a += p0 + p2;
			sum_b += p1 + p3;
			k += 4;
		}

		while ( k < n )
		{
			sum_a += y[py] * x[px];
			px += inc_x;
			py += inc_y;
			k++;
		}

		result = sum_a + sum_b;
	}

	return(result);
}

View File

@ -0,0 +1,178 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/20 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#define HAVE_KERNEL_8 1
/* POWER8 VSX micro-kernel for ddot.
 * Computes the dot product of x[0..n) and y[0..n) and stores the scalar
 * result to *dot (previous contents of *dot are overwritten; the caller
 * passes a zero-initialized accumulator).
 * n must be a positive multiple of 16: each loop trip consumes 16
 * doubles from each stream via eight 128-bit lxvd2x loads feeding eight
 * independent fused multiply-add accumulators (vs32..vs39), reduced at
 * the end.  noinline keeps the asm's fixed operand layout out of the
 * caller's register allocation. */
static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));

static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
{
BLASLONG i = n;      // element countdown (asm operand %0), -16 per trip
// byte offsets 16..112 held in GPRs for the indexed vector loads
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x;
FLOAT *y1=y;
BLASLONG pre = 384;  // dcbt prefetch distance in bytes

__asm__ __volatile__
(
// clear the eight partial-sum accumulators vs32..vs39
"xxlxor 32,32,32 \n\t"
"xxlxor 33,33,33 \n\t"
"xxlxor 34,34,34 \n\t"
"xxlxor 35,35,35 \n\t"
"xxlxor 36,36,36 \n\t"
"xxlxor 37,37,37 \n\t"
"xxlxor 38,38,38 \n\t"
"xxlxor 39,39,39 \n\t"

// touch ahead on both streams, then preload the first 16 doubles of each
"dcbt %2, %12 \n\t"
"dcbt %3, %12 \n\t"

"lxvd2x 40, 0, %2 \n\t"
"lxvd2x 48, 0, %3 \n\t"
"lxvd2x 41, %5, %2 \n\t"
"lxvd2x 49, %5, %3 \n\t"
"lxvd2x 42, %6, %2 \n\t"
"lxvd2x 50, %6, %3 \n\t"
"lxvd2x 43, %7, %2 \n\t"
"lxvd2x 51, %7, %3 \n\t"
"lxvd2x 44, %8, %2 \n\t"
"lxvd2x 52, %8, %3 \n\t"
"lxvd2x 45, %9, %2 \n\t"
"lxvd2x 53, %9, %3 \n\t"
"lxvd2x 46, %10, %2 \n\t"
"lxvd2x 54, %10, %3 \n\t"
"lxvd2x 47, %11, %2 \n\t"
"lxvd2x 55, %11, %3 \n\t"

"addi %2, %2, 128 \n\t"
"addi %3, %3, 128 \n\t"

"addic. %0 , %0 , -16 \n\t"
"ble 2f \n\t"                 // only one 16-element batch: go straight to drain

".align 5 \n\t"
"1: \n\t"
// pipelined loop: accumulate the previously loaded 16 elements while
// loading the next 16 from each stream
"dcbt %2, %12 \n\t"
"dcbt %3, %12 \n\t"

"xvmaddadp 32, 40, 48 \n\t"
"lxvd2x 40, 0, %2 \n\t"
"lxvd2x 48, 0, %3 \n\t"
"xvmaddadp 33, 41, 49 \n\t"
"lxvd2x 41, %5, %2 \n\t"
"lxvd2x 49, %5, %3 \n\t"
"xvmaddadp 34, 42, 50 \n\t"
"lxvd2x 42, %6, %2 \n\t"
"lxvd2x 50, %6, %3 \n\t"
"xvmaddadp 35, 43, 51 \n\t"
"lxvd2x 43, %7, %2 \n\t"
"lxvd2x 51, %7, %3 \n\t"
"xvmaddadp 36, 44, 52 \n\t"
"lxvd2x 44, %8, %2 \n\t"
"lxvd2x 52, %8, %3 \n\t"
"xvmaddadp 37, 45, 53 \n\t"
"lxvd2x 45, %9, %2 \n\t"
"lxvd2x 53, %9, %3 \n\t"
"xvmaddadp 38, 46, 54 \n\t"
"lxvd2x 46, %10, %2 \n\t"
"lxvd2x 54, %10, %3 \n\t"
"xvmaddadp 39, 47, 55 \n\t"
"lxvd2x 47, %11, %2 \n\t"
"lxvd2x 55, %11, %3 \n\t"

"addi %2, %2, 128 \n\t"
"addi %3, %3, 128 \n\t"

"addic. %0 , %0 , -16 \n\t"
"bgt 1b \n\t"

"2: \n\t"
// drain: accumulate the final preloaded batch
"xvmaddadp 32, 40, 48 \n\t"
"xvmaddadp 33, 41, 49 \n\t"
"xvmaddadp 34, 42, 50 \n\t"
"xvmaddadp 35, 43, 51 \n\t"
"xvmaddadp 36, 44, 52 \n\t"
"xvmaddadp 37, 45, 53 \n\t"
"xvmaddadp 38, 46, 54 \n\t"
"xvmaddadp 39, 47, 55 \n\t"

// tree-reduce the eight vector accumulators into vs32
"xvadddp 32, 32, 33 \n\t"
"xvadddp 34, 34, 35 \n\t"
"xvadddp 36, 36, 37 \n\t"
"xvadddp 38, 38, 39 \n\t"

"xvadddp 32, 32, 34 \n\t"
"xvadddp 36, 36, 38 \n\t"

"xvadddp 32, 32, 36 \n\t"

// fold the two 64-bit lanes and store the scalar to *dot
"xxswapd 33, 32 \n\t"
"xsadddp 32, 32, 33 \n\t"
"stxsdx 32, 0, %4 \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
"r" (x1), // 2
"r" (y1), // 3
"r" (dot), // 4
"r" (o16), // 5
"r" (o32), // 6
"r" (o48), // 7
"r" (o64), // 8
"r" (o80), // 9
"r" (o96), // 10
"r" (o112), // 11
"r" (pre) // 12
: "cr0", "%0", "%2" , "%3", "memory"
);

}

426
kernel/power/dgemv_n.c Normal file
View File

@ -0,0 +1,426 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/30 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#include "common.h"
#if defined(POWER8)
#include "dgemv_n_microk_power8.c"
#endif
#define NBMAX 4096
#ifndef HAVE_KERNEL_4x4
/* Generic fallback: y[0..n) += alpha * (A0*x0 + A1*x1 + A2*x2 + A3*x3)
 * for the four column pointers in ap[0..3]; n is a multiple of 4. */
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
	BLASLONG r;
	BLASLONG c;
	FLOAT *col0 = ap[0];
	FLOAT *col1 = ap[1];
	FLOAT *col2 = ap[2];
	FLOAT *col3 = ap[3];
	FLOAT xs[4] __attribute__ ((aligned (16)));

	/* pre-scale the four x entries by alpha once */
	for ( c = 0; c < 4; c++ )
		xs[c] = xo[c] * *alpha;

	/* accumulate four rows of y per trip */
	for ( r = 0; r < n; r += 4 )
	{
		y[r]     += col0[r]     * xs[0] + col1[r]     * xs[1] + col2[r]     * xs[2] + col3[r]     * xs[3];
		y[r + 1] += col0[r + 1] * xs[0] + col1[r + 1] * xs[1] + col2[r + 1] * xs[2] + col3[r + 1] * xs[3];
		y[r + 2] += col0[r + 2] * xs[0] + col1[r + 2] * xs[1] + col2[r + 2] * xs[2] + col3[r + 2] * xs[3];
		y[r + 3] += col0[r + 3] * xs[0] + col1[r + 3] * xs[1] + col2[r + 3] * xs[2] + col3[r + 3] * xs[3];
	}
}
#endif
#ifndef HAVE_KERNEL_4x2
/* Generic fallback: y[0..n) += alpha * (A0*x0 + A1*x1) for the two
 * column pointers in ap[0..1]; n is a multiple of 4. */
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
	BLASLONG r;
	BLASLONG c;
	FLOAT *col0 = ap[0];
	FLOAT *col1 = ap[1];
	FLOAT xs[4] __attribute__ ((aligned (16)));

	/* pre-scale the two x entries by alpha */
	for ( c = 0; c < 2; c++ )
		xs[c] = xo[c] * *alpha;

	for ( r = 0; r < n; r += 4 )
	{
		y[r]     += col0[r]     * xs[0] + col1[r]     * xs[1];
		y[r + 1] += col0[r + 1] * xs[0] + col1[r + 1] * xs[1];
		y[r + 2] += col0[r + 2] * xs[0] + col1[r + 2] * xs[1];
		y[r + 3] += col0[r + 3] * xs[0] + col1[r + 3] * xs[1];
	}
}
#endif
#ifndef HAVE_KERNEL_4x1
/* Generic fallback: y[0..n) += alpha * x0 * A0 for a single column
 * pointer ap; n is a multiple of 4. */
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
	BLASLONG r;
	FLOAT *col = ap;
	FLOAT xs[4] __attribute__ ((aligned (16)));

	/* single scaled multiplier */
	xs[0] = xo[0] * *alpha;

	for ( r = 0; r < n; r += 4 )
	{
		y[r]     += col[r]     * xs[0];
		y[r + 1] += col[r + 1] * xs[0];
		y[r + 2] += col[r + 2] * xs[0];
		y[r + 3] += col[r + 3] * xs[0];
	}
}
#endif
/* Scatter-add the contiguous buffer src[0..n) into dest with stride
 * inc_dest.  The contiguous case (inc_dest == 1) is intentionally a
 * no-op: the callers accumulate directly into y then and never invoke
 * this routine for it. */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
	BLASLONG k;

	if ( inc_dest == 1 )
		return;

	for ( k = 0; k < n; k++ )
	{
		dest[0] += src[k];
		dest += inc_dest;
	}
}
/* GEMV driver for TRANS='N':  y := y + alpha * A * x, with A stored
 * column-major (m rows, n columns, leading dimension lda).
 * dummy1 and buffer come from the common GEMV driver interface; buffer
 * is workspace used to build a contiguous image of y when inc_y != 1.
 * Strategy: walk the m direction in chunks of up to NBMAX rows so the y
 * working set stays cache-resident, and within a chunk consume four
 * columns at a time via dgemv_kernel_4x4 (with 4x2/4x1 column tails).
 * The final m % 4 rows are finished by dedicated scalar tail code. */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
BLASLONG j;    /* NOTE(review): unused below */
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
BLASLONG n1;   /* number of full 4-column groups */
BLASLONG m1;   /* rows covered by the blocked loop (m rounded down to 4) */
BLASLONG m2;   /* size of the last (partial) row block, still a multiple of 4 */
BLASLONG m3;   /* trailing m % 4 rows for the scalar tails */
BLASLONG n2;   /* trailing n % 4 columns */
BLASLONG lda4 =  lda << 2;   /* byte-free element stride of four columns */
FLOAT *ap[4] __attribute__ ((aligned (16)));;
FLOAT xbuffer[8] __attribute__ ((aligned (16)));;
FLOAT alpha_r[4] __attribute__ ((aligned (16)));;
FLOAT *ybuffer;

/* kernels dereference only alpha_r[0] */
alpha_r[0] = alpha;

if ( m < 1 ) return(0);
if ( n < 1 ) return(0);

ybuffer = buffer;

n1 = n >> 2 ;
n2 = n & 3  ;

m3 = m & 3  ;
m1 = m & -4 ;
m2 = (m & (NBMAX-1)) - m3 ;

y_ptr = y;

BLASLONG NB = NBMAX;

/* Loop over row blocks; NB shrinks to m2 on the final pass, which
 * also terminates the loop. */
while ( NB == NBMAX )
{

m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;   /* no partial block remains */
NB = m2;                /* last, smaller block */
}

a_ptr = a;
x_ptr = x;

ap[0] = a_ptr;
ap[1] = a_ptr + lda;
ap[2] = ap[1] + lda;
ap[3] = ap[2] + lda;

if ( inc_y != 1 )
memset(ybuffer,0,NB*8);   /* NOTE(review): assumes sizeof(FLOAT) == 8 */
else
ybuffer = y_ptr;          /* contiguous y: accumulate in place */

if ( inc_x == 1 )
{

/* contiguous x: feed the kernels straight from x_ptr */
for( i = 0; i < n1 ; i++)
{
dgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,alpha_r);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
x_ptr += 4;
}

if ( n2 & 2 )
{
dgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,alpha_r);
a_ptr += lda*2;
x_ptr += 2;
}


if ( n2 & 1 )
{
dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,alpha_r);
a_ptr += lda;
x_ptr += 1;
}


}
else
{

/* strided x: gather four entries into xbuffer per kernel call */
for( i = 0; i < n1 ; i++)
{
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
xbuffer[1] = x_ptr[0];
x_ptr += inc_x;
xbuffer[2] = x_ptr[0];
x_ptr += inc_x;
xbuffer[3] = x_ptr[0];
x_ptr += inc_x;
dgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha_r);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
}

for( i = 0; i < n2 ; i++)
{
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha_r);
a_ptr += lda;
}

}

a += NB;
if ( inc_y != 1 )
{
/* scatter the accumulated block back into strided y */
add_y(NB,ybuffer,y_ptr,inc_y);
y_ptr += NB * inc_y;
}
else
y_ptr += NB ;

}

/* scalar tails for the last m % 4 rows */
if ( m3 == 0 ) return(0);

if ( m3 == 3 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
FLOAT temp2 = 0.0;
/* lda == 3 && inc_x == 1: A tail is fully packed, unroll over columns */
if ( lda == 3 && inc_x ==1 )
{

for( i = 0; i < ( n & -4 ); i+=4 )
{

temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1];

temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9]  * x_ptr[3];
temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3];
temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3];

a_ptr += 12;
x_ptr += 4;
}

for( ; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0];
a_ptr += 3;
x_ptr ++;
}

}
else
{

for( i = 0; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}

}
y_ptr[0] += alpha * temp0;
y_ptr += inc_y;
y_ptr[0] += alpha * temp1;
y_ptr += inc_y;
y_ptr[0] += alpha * temp2;
return(0);
}

if ( m3 == 2 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
/* packed-tail fast path, as above but for two rows */
if ( lda == 2 && inc_x ==1 )
{

for( i = 0; i < (n & -4) ; i+=4 )
{
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
a_ptr += 8;
x_ptr += 4;

}


for( ; i < n; i++ )
{
temp0 += a_ptr[0]   * x_ptr[0];
temp1 += a_ptr[1]   * x_ptr[0];
a_ptr += 2;
x_ptr ++;
}

}
else
{

for( i = 0; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}

}
y_ptr[0] += alpha * temp0;
y_ptr += inc_y;
y_ptr[0] += alpha * temp1;
return(0);
}

if ( m3 == 1 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp = 0.0;
/* single-row tail: plain dot product */
if ( lda == 1 && inc_x ==1 )
{

for( i = 0; i < (n & -4); i+=4 )
{
temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];

}

for( ; i < n; i++ )
{
temp += a_ptr[i] * x_ptr[i];
}

}
else
{

for( i = 0; i < n; i++ )
{
temp += a_ptr[0] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}

}
y_ptr[0] += alpha * temp;
return(0);
}


return(0);
}

View File

@ -0,0 +1,301 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/30 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#define HAVE_KERNEL_4x4 1
/* POWER8 VSX micro-kernel for the dgemv_n 4x4 update:
 *   y[0..n) += xs[0]*A0 + xs[1]*A1 + xs[2]*A2 + xs[3]*A3
 * where xs[] = alpha * xo[0..3] and A0..A3 are the columns in ap[].
 * n is a multiple of 4; each unrolled section handles 4 rows of y.
 * Pointer trick: the C code advances each pointer by one element and
 * the asm subtracts 8 bytes again, a net no-op (NOTE(review): relies on
 * sizeof(FLOAT) == 8 — confirm for any reuse).
 * noinline keeps the asm's fixed operand layout out of the caller. */
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));

static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
BLASLONG i=n;        // row countdown (asm operand %0), -4 per section
// byte offsets used by the indexed vector loads
BLASLONG o8  = 8;
BLASLONG o16 = 16;
BLASLONG o24 = 24;
BLASLONG pre = 384;  // dcbt prefetch distance in bytes

FLOAT *a0,*a1,*a2,*a3;
FLOAT *y1=y+1;       // +1 element here, -8 bytes in the asm (net zero)
FLOAT x[4] __attribute__ ((aligned (16)));;

a0 = ap[0]+1;
a1 = ap[1]+1;
a2 = ap[2]+1;
a3 = ap[3]+1;

// pre-scale x by alpha; the asm broadcasts each entry with lxvdsx
x[0]=xo[0] * *alpha;
x[1]=xo[1] * *alpha;
x[2]=xo[2] * *alpha;
x[3]=xo[3] * *alpha;


__asm__ __volatile__
(
// broadcast the four scaled x values into vs32..vs35
"lxvdsx 32, 0 , %1 \n\t" // x0
"lxvdsx 33,%3 , %1 \n\t" // x1
"lxvdsx 34,%4 , %1 \n\t" // x2
"lxvdsx 35,%5 , %1 \n\t" // x3
// undo the +1-element offset applied in C (see header note)
"addi %2 , %2 , -8 \n\t"
"addi %6 , %6 , -8 \n\t"
"addi %7 , %7 , -8 \n\t"
"addi %8 , %8 , -8 \n\t"
"addi %9 , %9 , -8 \n\t"

// preload the first 4 rows of each of the four columns
"lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1]
"lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3]
"lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1]
"lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3]
"lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1]
"lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3]
"lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1]
"lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3]

"addi %6, %6, 32 \n\t"
"addi %7, %7, 32 \n\t"
"addi %8, %8, 32 \n\t"
"addi %9, %9, 32 \n\t"

"addic. %0 , %0 , -4 \n\t"
"ble 2f \n\t"                 // only one 4-row group: go to drain

".align 5 \n\t"
"1: \n\t"
// unrolled x4 loop: each section updates 4 rows of y with the tile
// loaded by the previous section while preloading the next tile
"dcbt %2, %10 \n\t"

"lxvd2x 40, 0, %2 \n\t" // y0, y1
"lxvd2x 41,%4, %2 \n\t" // y2, y3

"dcbt %6, %10 \n\t"
"dcbt %7, %10 \n\t"
"dcbt %8, %10 \n\t"
"dcbt %9, %10 \n\t"

"xvmaddadp 40, 48, 32 \n\t"
"xvmaddadp 41, 49, 32 \n\t"

"lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1]
"lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3]

"xvmaddadp 40, 50, 33 \n\t"
"addi %6, %6, 32 \n\t"
"xvmaddadp 41, 51, 33 \n\t"

"lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1]
"lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3]

"xvmaddadp 40, 52, 34 \n\t"
"addi %7, %7, 32 \n\t"
"xvmaddadp 41, 53, 34 \n\t"

"lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1]
"lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3]

"xvmaddadp 40, 54, 35 \n\t"
"addi %8, %8, 32 \n\t"
"xvmaddadp 41, 55, 35 \n\t"

"stxvd2x 40, 0, %2 \n\t" // y0, y1
"stxvd2x 41,%4, %2 \n\t" // y2, y3

"lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1]
"lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3]

"addi %9, %9, 32 \n\t"
"addi %2, %2, 32 \n\t"

"addic. %0 , %0 , -4 \n\t"
"ble 2f \n\t"

// second copy of the unrolled section
"lxvd2x 40, 0, %2 \n\t" // y0, y1
"lxvd2x 41,%4, %2 \n\t" // y2, y3

"xvmaddadp 40, 48, 32 \n\t"
"xvmaddadp 41, 49, 32 \n\t"

"lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1]
"lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3]

"xvmaddadp 40, 50, 33 \n\t"
"addi %6, %6, 32 \n\t"
"xvmaddadp 41, 51, 33 \n\t"

"lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1]
"lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3]

"xvmaddadp 40, 52, 34 \n\t"
"addi %7, %7, 32 \n\t"
"xvmaddadp 41, 53, 34 \n\t"

"lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1]
"lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3]

"xvmaddadp 40, 54, 35 \n\t"
"addi %8, %8, 32 \n\t"
"xvmaddadp 41, 55, 35 \n\t"

"stxvd2x 40, 0, %2 \n\t" // y0, y1
"stxvd2x 41,%4, %2 \n\t" // y2, y3

"lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1]
"lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3]

"addi %9, %9, 32 \n\t"
"addi %2, %2, 32 \n\t"

"addic. %0 , %0 , -4 \n\t"
"ble 2f \n\t"

// third copy of the unrolled section
"lxvd2x 40, 0, %2 \n\t" // y0, y1
"lxvd2x 41,%4, %2 \n\t" // y2, y3

"xvmaddadp 40, 48, 32 \n\t"
"xvmaddadp 41, 49, 32 \n\t"

"lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1]
"lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3]

"xvmaddadp 40, 50, 33 \n\t"
"addi %6, %6, 32 \n\t"
"xvmaddadp 41, 51, 33 \n\t"

"lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1]
"lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3]

"xvmaddadp 40, 52, 34 \n\t"
"addi %7, %7, 32 \n\t"
"xvmaddadp 41, 53, 34 \n\t"

"lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1]
"lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3]

"xvmaddadp 40, 54, 35 \n\t"
"addi %8, %8, 32 \n\t"
"xvmaddadp 41, 55, 35 \n\t"

"stxvd2x 40, 0, %2 \n\t" // y0, y1
"stxvd2x 41,%4, %2 \n\t" // y2, y3

"lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1]
"lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3]

"addi %9, %9, 32 \n\t"
"addi %2, %2, 32 \n\t"

"addic. %0 , %0 , -4 \n\t"
"ble 2f \n\t"

// fourth copy of the unrolled section; loops back while rows remain
"lxvd2x 40, 0, %2 \n\t" // y0, y1
"lxvd2x 41,%4, %2 \n\t" // y2, y3

"xvmaddadp 40, 48, 32 \n\t"
"xvmaddadp 41, 49, 32 \n\t"

"lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1]
"lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3]

"xvmaddadp 40, 50, 33 \n\t"
"addi %6, %6, 32 \n\t"
"xvmaddadp 41, 51, 33 \n\t"

"lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1]
"lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3]

"xvmaddadp 40, 52, 34 \n\t"
"addi %7, %7, 32 \n\t"
"xvmaddadp 41, 53, 34 \n\t"

"lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1]
"lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3]

"xvmaddadp 40, 54, 35 \n\t"
"addi %8, %8, 32 \n\t"
"xvmaddadp 41, 55, 35 \n\t"

"stxvd2x 40, 0, %2 \n\t" // y0, y1
"stxvd2x 41,%4, %2 \n\t" // y2, y3

"lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1]
"lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3]

"addi %9, %9, 32 \n\t"
"addi %2, %2, 32 \n\t"

"addic. %0 , %0 , -4 \n\t"
"bgt 1b \n\t"

"2: \n\t"
// drain: update the final 4 rows with the last preloaded tile
"lxvd2x 40, 0, %2 \n\t" // y0, y1
"lxvd2x 41,%4, %2 \n\t" // y2, y3

"xvmaddadp 40, 48, 32 \n\t"
"xvmaddadp 41, 49, 32 \n\t"
"xvmaddadp 40, 50, 33 \n\t"
"xvmaddadp 41, 51, 33 \n\t"
"xvmaddadp 40, 52, 34 \n\t"
"xvmaddadp 41, 53, 34 \n\t"
"xvmaddadp 40, 54, 35 \n\t"
"xvmaddadp 41, 55, 35 \n\t"

"stxvd2x 40, 0, %2 \n\t" // y0, y1
"stxvd2x 41,%4, %2 \n\t" // y2, y3

:
:
"r" (i), // 0
"r" (x), // 1
"r" (y1), // 2
"r" (o8), // 3
"r" (o16), // 4
"r" (o24), // 5
"r" (a0), // 6
"r" (a1), // 7
"r" (a2), // 8
"r" (a3), // 9
"r" (pre) // 10
: "cr0", "%0", "%2" , "%6", "%7", "%8", "%9", "memory"
);

}

167
kernel/power/drot.c Normal file
View File

@ -0,0 +1,167 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/27 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#include "common.h"
#pragma GCC optimize "O1"
#if defined(POWER8)
#include "drot_microk_power8.c"
#endif
#ifndef HAVE_KERNEL_16
/* Portable fallback Givens-rotation kernel: applies
 *   x[k] = c*x[k] + s*y[k]
 *   y[k] = c*y[k] - s*x[k]
 * to n elements (the caller passes n as a multiple of 16; four
 * elements are consumed per trip).  c and s arrive as pointers because
 * the optimized kernel wants broadcastable 4-element arrays. */
static void drot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{
	BLASLONG k;
	FLOAT cv = *c;
	FLOAT sv = *s;

	for ( k = 0; k < n; k += 4 )
	{
		/* read all eight operands before any store, so the result
		 * is independent of store order */
		FLOAT xa = x[k],     ya = y[k];
		FLOAT xb = x[k + 1], yb = y[k + 1];
		FLOAT xc = x[k + 2], yc = y[k + 2];
		FLOAT xd = x[k + 3], yd = y[k + 3];

		x[k]     = cv * xa + sv * ya;
		y[k]     = cv * ya - sv * xa;
		x[k + 1] = cv * xb + sv * yb;
		y[k + 1] = cv * yb - sv * xb;
		x[k + 2] = cv * xc + sv * yc;
		y[k + 2] = cv * yc - sv * xc;
		x[k + 3] = cv * xd + sv * yd;
		y[k + 3] = cv * yd - sv * xd;
	}

	return;
}
#endif
/* drot driver: applies the plane rotation (c, s) to the vector pair
 * (x, y) with strides inc_x / inc_y.  Contiguous inputs are handed to
 * the unrolled kernel in 16-element groups; everything else is scalar. */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
	BLASLONG k = 0;
	FLOAT tmp;

	if ( n <= 0 ) return(0);

	if ( (inc_x == 1) && (inc_y == 1) )
	{
		BLASLONG head = n & -16;

		if ( head > 0 )
		{
			/* the optimized kernel reads c and s from 16-byte
			 * aligned 4-element broadcast arrays */
			FLOAT cbuf[4] __attribute__ ((aligned (16)));
			FLOAT sbuf[4] __attribute__ ((aligned (16)));
			BLASLONG q;

			for ( q = 0; q < 4; q++ )
			{
				cbuf[q] = c;
				sbuf[q] = s;
			}

			drot_kernel_16(head, x, y, cbuf, sbuf);
			k = head;
		}

		/* scalar cleanup for the remaining n % 16 elements */
		for ( ; k < n; k++ )
		{
			tmp  = c * x[k] + s * y[k];
			y[k] = c * y[k] - s * x[k];
			x[k] = tmp;
		}
	}
	else
	{
		BLASLONG px = 0;
		BLASLONG py = 0;

		for ( ; k < n; k++ )
		{
			tmp   = c * x[px] + s * y[py];
			y[py] = c * y[py] - s * x[px];
			x[px] = tmp;
			px += inc_x;
			py += inc_y;
		}
	}

	return(0);
}

View File

@ -0,0 +1,211 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/27 Werner Saar (wernsaar@googlemail.com)
*
* I don't use fused multiply-add ( precision problems with lapack )
*
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#define HAVE_KERNEL_16 1
/* POWER8 VSX micro-kernel for drot: applies the plane rotation
 *   x[k] = c*x[k] + s*y[k],  y[k] = c*y[k] - s*x[k]
 * to n elements, 8 per loop trip (four 2-wide VSX vectors per stream).
 * Deliberately built from separate multiplies and adds — no fused
 * multiply-add — per the file header note about LAPACK precision.
 * c and s point at 4-element broadcast arrays; the scalar is read from
 * byte offset 16 (element 2 — all four entries hold the same value).
 * Store pointers x2/y2 start one element past x/y and the asm subtracts
 * 8 bytes, netting the original base (NOTE(review): relies on
 * sizeof(FLOAT) == 8 — confirm for any reuse). */
static void drot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) __attribute__ ((noinline));

static void drot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{
BLASLONG i = n;      // element countdown (asm operand %0), -8 per trip
// byte offsets for the indexed vector loads/stores
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
FLOAT *x1=x;         // load pointers
FLOAT *y1=y;
FLOAT *x2=x+1;       // store pointers (see header note)
FLOAT *y2=y+1;

__asm__ __volatile__
(
// broadcast c and s into vs36 / vs37
"lxsdx 36 , %5, %3 \n\t" // load c
"lxsdx 37 , %5, %4 \n\t" // load s
"addi %8 , %8, -8 \n\t"
"addi %9 , %9, -8 \n\t"

"xxspltd 36 , 36, 0 \n\t"
"xxspltd 37 , 37, 0 \n\t"

// preload the first 8 doubles of x (vs32-35) and y (vs40-43)
"lxvd2x 32, 0, %1 \n\t" // load x
"lxvd2x 33, %5, %1 \n\t"
"lxvd2x 34, %6, %1 \n\t"
"lxvd2x 35, %7, %1 \n\t"

"lxvd2x 40, 0, %2 \n\t" // load y
"lxvd2x 41, %5, %2 \n\t"
"lxvd2x 42, %6, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t"

"addi %1, %1, 64 \n\t"
"addi %2, %2, 64 \n\t"

"addic. %0 , %0 , -8 \n\t"
"ble 2f \n\t"                 // only one 8-element batch: go to drain

".align 5 \n\t"
"1: \n\t"
// rotate the current batch while reloading the next one
"xvmuldp 48, 32, 36 \n\t" // c * x
"xvmuldp 49, 33, 36 \n\t"
"xvmuldp 50, 34, 36 \n\t"
"xvmuldp 51, 35, 36 \n\t"

"xvmuldp 56, 40, 36 \n\t" // c * y
"xvmuldp 57, 41, 36 \n\t"
"xvmuldp 58, 42, 36 \n\t"
"xvmuldp 59, 43, 36 \n\t"

"xvmuldp 52, 32, 37 \n\t" // s * x
"xvmuldp 53, 33, 37 \n\t"

"lxvd2x 32, 0, %1 \n\t" // load x
"lxvd2x 33, %5, %1 \n\t"

"xvmuldp 54, 34, 37 \n\t"
"xvmuldp 55, 35, 37 \n\t"

"lxvd2x 34, %6, %1 \n\t"
"lxvd2x 35, %7, %1 \n\t"

"xvmuldp 60, 40, 37 \n\t" // s * y
"xvmuldp 61, 41, 37 \n\t"

"lxvd2x 40, 0, %2 \n\t" // load y
"lxvd2x 41, %5, %2 \n\t"

"xvmuldp 62, 42, 37 \n\t"
"xvmuldp 63, 43, 37 \n\t"

"lxvd2x 42, %6, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t"

"xvadddp 48, 48 , 60 \n\t" // c * x + s * y
"xvadddp 49, 49 , 61 \n\t" // c * x + s * y

"addi %1, %1, 64 \n\t"
"addi %2, %2, 64 \n\t"

"xvadddp 50, 50 , 62 \n\t" // c * x + s * y
"xvadddp 51, 51 , 63 \n\t" // c * x + s * y

"xvsubdp 56, 56 , 52 \n\t" // c * y - s * x
"xvsubdp 57, 57 , 53 \n\t" // c * y - s * x
"xvsubdp 58, 58 , 54 \n\t" // c * y - s * x
"xvsubdp 59, 59 , 55 \n\t" // c * y - s * x

"stxvd2x 48, 0, %8 \n\t" // store x
"stxvd2x 49, %5, %8 \n\t"
"stxvd2x 50, %6, %8 \n\t"
"stxvd2x 51, %7, %8 \n\t"

"stxvd2x 56, 0, %9 \n\t" // store y
"stxvd2x 57, %5, %9 \n\t"
"stxvd2x 58, %6, %9 \n\t"
"stxvd2x 59, %7, %9 \n\t"

"addi %8, %8, 64 \n\t"
"addi %9, %9, 64 \n\t"

"addic. %0 , %0 , -8 \n\t"
"bgt 1b \n\t"

"2: \n\t"
// drain: rotate and store the final preloaded batch
"xvmuldp 48, 32, 36 \n\t" // c * x
"xvmuldp 49, 33, 36 \n\t"
"xvmuldp 50, 34, 36 \n\t"
"xvmuldp 51, 35, 36 \n\t"

"xvmuldp 56, 40, 36 \n\t" // c * y
"xvmuldp 57, 41, 36 \n\t"
"xvmuldp 58, 42, 36 \n\t"
"xvmuldp 59, 43, 36 \n\t"

"xvmuldp 52, 32, 37 \n\t" // s * x
"xvmuldp 53, 33, 37 \n\t"
"xvmuldp 54, 34, 37 \n\t"
"xvmuldp 55, 35, 37 \n\t"

"xvmuldp 60, 40, 37 \n\t" // s * y
"xvmuldp 61, 41, 37 \n\t"
"xvmuldp 62, 42, 37 \n\t"
"xvmuldp 63, 43, 37 \n\t"

"xvadddp 48, 48 , 60 \n\t" // c * x + s * y
"xvadddp 49, 49 , 61 \n\t" // c * x + s * y
"xvadddp 50, 50 , 62 \n\t" // c * x + s * y
"xvadddp 51, 51 , 63 \n\t" // c * x + s * y

"xvsubdp 56, 56 , 52 \n\t" // c * y - s * x
"xvsubdp 57, 57 , 53 \n\t" // c * y - s * x
"xvsubdp 58, 58 , 54 \n\t" // c * y - s * x
"xvsubdp 59, 59 , 55 \n\t" // c * y - s * x

"stxvd2x 48, 0, %8 \n\t" // store x
"stxvd2x 49, %5, %8 \n\t"
"stxvd2x 50, %6, %8 \n\t"
"stxvd2x 51, %7, %8 \n\t"

"stxvd2x 56, 0, %9 \n\t" // store y
"stxvd2x 57, %5, %9 \n\t"
"stxvd2x 58, %6, %9 \n\t"
"stxvd2x 59, %7, %9 \n\t"

:
:
"r" (i), // 0
"r" (x1), // 1
"r" (y1), // 2
"r" (c), // 3
"r" (s), // 4
"r" (o16), // 5
"r" (o32), // 6
"r" (o48), // 7
"r" (x2), // 8
"r" (y2) // 9
: "cr0", "%0", "%1" , "%2", "%8", "%9", "memory"
);

}

174
kernel/power/dscal.c Normal file
View File

@ -0,0 +1,174 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#include "common.h"
#if defined(POWER8)
#include "dscal_microk_power8.c"
#endif
#if !defined(HAVE_KERNEL_8)
/* Scalar fallback: multiply n elements of x in place by the scalar
 * pointed to by da.  n must be a multiple of 8; eight elements are
 * handled per outer step to mirror the vector kernel's stride. */
static void dscal_kernel_8( BLASLONG n, FLOAT *da , FLOAT *x )
{
BLASLONG i, j;
FLOAT scale = *da;

for ( i = 0; i < n; i += 8 )
{
	for ( j = 0; j < 8; j++ )
		x[j] *= scale;
	x += 8;
}
}
/* Scalar fallback: store 0.0 into n elements of x.  n must be a
 * multiple of 8.  The da argument is accepted only for signature
 * parity with dscal_kernel_8 and is not read. */
static void dscal_kernel_8_zero( BLASLONG n, FLOAT *da , FLOAT *x )
{
BLASLONG i, j;

for ( i = 0; i < n; i += 8 )
{
	for ( j = 0; j < 8; j++ )
		x[j] = 0.0;
	x += 8;
}
}
#endif
/* DSCAL entry point: x := da * x over n elements with stride inc_x.
 * The dummy*, y and inc_y arguments exist only to match the generic
 * kernel calling convention and are ignored.  When da == 0.0 the
 * elements are stored as 0.0 directly rather than multiplied.
 * Returns 0; does nothing for non-positive n or inc_x. */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG j;

	if ( n <= 0 || inc_x <= 0 )
		return(0);

	if ( inc_x != 1 )
	{
		/* strided case: plain scalar loops */
		BLASLONG idx = 0;
		if ( da == 0.0 )
		{
			for ( j = 0; j < n; j++, idx += inc_x )
				x[idx] = 0.0;
		}
		else
		{
			for ( j = 0; j < n; j++, idx += inc_x )
				x[idx] = da * x[idx];
		}
		return(0);
	}

	{
		/* contiguous case: hand multiples of 16 to the unrolled kernel,
		 * finish the remainder with scalar code */
		BLASLONG n1 = n & -16;
		if ( n1 > 0 )
		{
			FLOAT alpha[2];   /* kernel reads the scalar through a pointer */
			alpha[0] = da;
			alpha[1] = da;
			if ( da == 0.0 )
				dscal_kernel_8_zero(n1 , alpha , x);
			else
				dscal_kernel_8(n1 , alpha , x);
		}
		if ( da == 0.0 )
		{
			for ( j = n1; j < n; j++ )
				x[j] = 0.0;
		}
		else
		{
			for ( j = n1; j < n; j++ )
				x[j] = da * x[j];
		}
	}
	return 0;
}

View File

@ -0,0 +1,219 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#define HAVE_KERNEL_8 1
/* POWER8 VSX kernel: x[0:n] *= *alpha for double precision, with n a
 * positive multiple of 16.  Splats the scalar into vs32, then each loop
 * pass multiplies sixteen doubles (eight 128-bit registers, vs40-vs47)
 * while software-pipelining the next loads and prefetching 384 bytes
 * ahead (dcbt).  Stores go through x2 = x + 1 rewound by 8 bytes, i.e.
 * the same addresses as the loads, using a separate base register.
 * NOTE(review): the template advances %0, %1 and %2 (addi/addic.)
 * although they are declared as plain "r" inputs and merely listed in
 * the clobbers; GCC extended asm does not support modifying input
 * operands — confirm against the later OpenBLAS rework of these
 * kernels. */
static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline));
static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
{
BLASLONG i = n; /* remaining element count, 16 consumed per pass */
BLASLONG o16 = 16; /* o16..o112: byte offsets kept in registers and */
BLASLONG o32 = 32; /* used as index operands of lxvd2x / stxvd2x */
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x; /* load cursor */
FLOAT *x2=x+1; /* store cursor; the asm rewinds it 8 bytes to x[0] */
BLASLONG pre = 384; /* dcbt prefetch distance in bytes */
__asm__ __volatile__
(
	/* broadcast *alpha into both doubleword lanes of vs32 */
	"lxsdx 33, 0, %3 \n\t"
	"xxspltd 32, 33, 0 \n\t"
	/* bias the store pointer back onto x[0] */
	"addi %1, %1, -8 \n\t"
	/* prologue: prefetch and preload the first 16 doubles */
	"dcbt %2, %4 \n\t"
	"lxvd2x 40, 0, %2 \n\t"
	"lxvd2x 41, %5, %2 \n\t"
	"lxvd2x 42, %6, %2 \n\t"
	"lxvd2x 43, %7, %2 \n\t"
	"lxvd2x 44, %8, %2 \n\t"
	"lxvd2x 45, %9, %2 \n\t"
	"lxvd2x 46, %10, %2 \n\t"
	"lxvd2x 47, %11, %2 \n\t"
	"addi %2, %2, 128 \n\t"
	"addic. %0 , %0 , -16 \n\t"
	"ble 2f \n\t"
	".align 5 \n\t"
	"1: \n\t"
	/* steady state: multiply the current batch while loading the next */
	"dcbt %2, %4 \n\t"
	"xvmuldp 48, 40, 32 \n\t"
	"xvmuldp 49, 41, 32 \n\t"
	"lxvd2x 40, 0, %2 \n\t"
	"lxvd2x 41, %5, %2 \n\t"
	"xvmuldp 50, 42, 32 \n\t"
	"xvmuldp 51, 43, 32 \n\t"
	"lxvd2x 42, %6, %2 \n\t"
	"lxvd2x 43, %7, %2 \n\t"
	"xvmuldp 52, 44, 32 \n\t"
	"xvmuldp 53, 45, 32 \n\t"
	"lxvd2x 44, %8, %2 \n\t"
	"lxvd2x 45, %9, %2 \n\t"
	"xvmuldp 54, 46, 32 \n\t"
	"xvmuldp 55, 47, 32 \n\t"
	"lxvd2x 46, %10, %2 \n\t"
	"lxvd2x 47, %11, %2 \n\t"
	"stxvd2x 48, 0, %1 \n\t"
	"stxvd2x 49, %5, %1 \n\t"
	"stxvd2x 50, %6, %1 \n\t"
	"stxvd2x 51, %7, %1 \n\t"
	"stxvd2x 52, %8, %1 \n\t"
	"stxvd2x 53, %9, %1 \n\t"
	"stxvd2x 54, %10, %1 \n\t"
	"stxvd2x 55, %11, %1 \n\t"
	"addi %1, %1, 128 \n\t"
	"addi %2, %2, 128 \n\t"
	"addic. %0 , %0 , -16 \n\t"
	"bgt 1b \n\t"
	"2: \n\t"
	/* epilogue: scale and store the final preloaded batch */
	"xvmuldp 48, 40, 32 \n\t"
	"xvmuldp 49, 41, 32 \n\t"
	"xvmuldp 50, 42, 32 \n\t"
	"xvmuldp 51, 43, 32 \n\t"
	"xvmuldp 52, 44, 32 \n\t"
	"xvmuldp 53, 45, 32 \n\t"
	"xvmuldp 54, 46, 32 \n\t"
	"xvmuldp 55, 47, 32 \n\t"
	"stxvd2x 48, 0, %1 \n\t"
	"stxvd2x 49, %5, %1 \n\t"
	"stxvd2x 50, %6, %1 \n\t"
	"stxvd2x 51, %7, %1 \n\t"
	"stxvd2x 52, %8, %1 \n\t"
	"stxvd2x 53, %9, %1 \n\t"
	"stxvd2x 54, %10, %1 \n\t"
	"stxvd2x 55, %11, %1 \n\t"
	:
	:
	"r" (i), // 0
	"r" (x2), // 1
	"r" (x1), // 2
	"r" (alpha), // 3
	"r" (pre), // 4
	"r" (o16), // 5
	"r" (o32), // 6
	"r" (o48), // 7
	"r" (o64), // 8
	"r" (o80), // 9
	"r" (o96), // 10
	"r" (o112) // 11
	: "cr0", "%0", "%2" , "%1", "memory"
);
}
/* POWER8 kernel: store 0.0 into x[0:n] for double precision, n a
 * positive multiple of 16.  vs32 is cleared once with xxlxor and then
 * written out 16 doubles (128 bytes) per loop pass.  The alpha
 * argument is passed as an operand but never referenced; it exists for
 * signature parity with dscal_kernel_8.
 * NOTE(review): %0 and %1 are advanced by the asm yet declared as "r"
 * inputs; see the matching note on dscal_kernel_8. */
static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline));
static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
{
BLASLONG i = n; /* remaining elements, 16 consumed per pass */
BLASLONG o16 = 16; /* byte offsets for the stxvd2x index operands */
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x; /* passed as %2 but not referenced in the template */
FLOAT *x2=x+1; /* store cursor; rewound 8 bytes onto x[0] by the asm */
BLASLONG pre = 384; /* passed as %4 but not referenced in the template */
__asm__ __volatile__
(
	/* vs32 := 0 */
	"xxlxor 32 , 32 , 32 \n\t"
	"addi %1, %1, -8 \n\t"
	".align 5 \n\t"
	"1: \n\t"
	/* write eight zero vectors (16 doubles) per iteration */
	"stxvd2x 32, 0, %1 \n\t"
	"stxvd2x 32, %5, %1 \n\t"
	"stxvd2x 32, %6, %1 \n\t"
	"stxvd2x 32, %7, %1 \n\t"
	"stxvd2x 32, %8, %1 \n\t"
	"stxvd2x 32, %9, %1 \n\t"
	"stxvd2x 32, %10, %1 \n\t"
	"stxvd2x 32, %11, %1 \n\t"
	"addi %1, %1, 128 \n\t"
	"addic. %0 , %0 , -16 \n\t"
	"bgt 1b \n\t"
	"2: \n\t"
	:
	:
	"r" (i), // 0
	"r" (x2), // 1
	"r" (x1), // 2
	"r" (alpha), // 3
	"r" (pre), // 4
	"r" (o16), // 5
	"r" (o32), // 6
	"r" (o48), // 7
	"r" (o64), // 8
	"r" (o80), // 9
	"r" (o96), // 10
	"r" (o112) // 11
	: "cr0", "%0", "%2" , "%1", "memory"
);
}

154
kernel/power/dswap.c Normal file
View File

@ -0,0 +1,154 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#include "common.h"
#if defined(POWER8)
#include "dswap_microk_power8.c"
#endif
#ifndef HAVE_KERNEL_32
/* Scalar fallback: exchange the first n elements of x and y, eight at
 * a time (n must be a multiple of 8).  Mirrors the vector kernel's
 * ordering: read a block of x, read a block of y, write y, write x. */
static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
	BLASLONG i, j;
	FLOAT bufx[8], bufy[8];
	FLOAT *px = x;
	FLOAT *py = y;

	for ( i = 0; i < n; i += 8 )
	{
		for ( j = 0; j < 8; j++ )
			bufx[j] = px[j];
		for ( j = 0; j < 8; j++ )
			bufy[j] = py[j];
		for ( j = 0; j < 8; j++ )
			py[j] = bufx[j];
		for ( j = 0; j < 8; j++ )
			px[j] = bufy[j];
		px += 8;
		py += 8;
	}
	return;
}
#endif
/* DSWAP entry point: exchange n elements of x and y with strides
 * inc_x / inc_y.  The dummy* arguments match the generic kernel
 * calling convention and are ignored.  Returns 0; no-op for n <= 0. */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG i = 0;

	if ( n <= 0 ) return(0);

	if ( (inc_x == 1) && (inc_y == 1) )
	{
		/* contiguous: bulk-swap multiples of 32, then the remainder */
		BLASLONG n1 = n & -32;
		if ( n1 > 0 )
		{
			dswap_kernel_32(n1, x, y);
			i = n1;
		}
		for ( ; i < n; i++ )
		{
			FLOAT t = y[i];
			y[i] = x[i];
			x[i] = t;
		}
	}
	else
	{
		/* strided: element-by-element exchange */
		BLASLONG ix = 0, iy = 0;
		for ( i = 0; i < n; i++ )
		{
			FLOAT t = y[iy];
			y[iy] = x[ix];
			x[ix] = t;
			ix += inc_x;
			iy += inc_y;
		}
	}
	return(0);
}

View File

@ -0,0 +1,180 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#define HAVE_KERNEL_32 1
/* POWER8 kernel: exchange x[0:n] and y[0:n] for double precision, n a
 * positive multiple of 32.  Each pass loads 32 doubles of x into
 * vs32-vs47 and 32 doubles of y into vs48-vs63, then stores the x data
 * through y2 and the y data through x2 (both rewound by 8 bytes so
 * they address the same elements as the load cursors).
 * NOTE(review): %0-%4 are all advanced by the asm although declared as
 * "r" inputs; see the note on the dscal microkernels. */
static void dswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
static void dswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
{
BLASLONG i = n; /* remaining elements, 32 consumed per pass */
BLASLONG o16 = 16; /* byte offsets used as lxvd2x/stxvd2x indices */
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x; /* load cursor for x */
FLOAT *y1=y; /* load cursor for y */
FLOAT *x2=x+1; /* store cursor for x; rewound 8 bytes by the asm */
FLOAT *y2=y+1; /* store cursor for y; rewound 8 bytes by the asm */
BLASLONG pre = 384; /* unused: not passed to the asm template */
BLASLONG alpha=0; /* unused leftover local */
__asm__ __volatile__
(
	"addi %3, %3, -8 \n\t"
	"addi %4, %4, -8 \n\t"
	".align 5 \n\t"
	"1: \n\t"
	/* load 32 doubles of x into vs32-vs47 */
	"lxvd2x 32, 0, %2 \n\t"
	"lxvd2x 33, %5, %2 \n\t"
	"lxvd2x 34, %6, %2 \n\t"
	"lxvd2x 35, %7, %2 \n\t"
	"lxvd2x 36, %8, %2 \n\t"
	"lxvd2x 37, %9, %2 \n\t"
	"lxvd2x 38, %10, %2 \n\t"
	"lxvd2x 39, %11, %2 \n\t"
	"addi %2, %2, 128 \n\t"
	"lxvd2x 40, 0, %2 \n\t"
	"lxvd2x 41, %5, %2 \n\t"
	"lxvd2x 42, %6, %2 \n\t"
	"lxvd2x 43, %7, %2 \n\t"
	"lxvd2x 44, %8, %2 \n\t"
	"lxvd2x 45, %9, %2 \n\t"
	"lxvd2x 46, %10, %2 \n\t"
	"lxvd2x 47, %11, %2 \n\t"
	"addi %2, %2, 128 \n\t"
	/* load 32 doubles of y into vs48-vs63 */
	"lxvd2x 48, 0, %1 \n\t"
	"lxvd2x 49, %5, %1 \n\t"
	"lxvd2x 50, %6, %1 \n\t"
	"lxvd2x 51, %7, %1 \n\t"
	"lxvd2x 52, %8, %1 \n\t"
	"lxvd2x 53, %9, %1 \n\t"
	"lxvd2x 54, %10, %1 \n\t"
	"lxvd2x 55, %11, %1 \n\t"
	"addi %1, %1, 128 \n\t"
	"lxvd2x 56, 0, %1 \n\t"
	"lxvd2x 57, %5, %1 \n\t"
	"lxvd2x 58, %6, %1 \n\t"
	"lxvd2x 59, %7, %1 \n\t"
	"lxvd2x 60, %8, %1 \n\t"
	"lxvd2x 61, %9, %1 \n\t"
	"lxvd2x 62, %10, %1 \n\t"
	"lxvd2x 63, %11, %1 \n\t"
	"addi %1, %1, 128 \n\t"
	/* write the x data to y */
	"stxvd2x 32, 0, %3 \n\t"
	"stxvd2x 33, %5, %3 \n\t"
	"stxvd2x 34, %6, %3 \n\t"
	"stxvd2x 35, %7, %3 \n\t"
	"stxvd2x 36, %8, %3 \n\t"
	"stxvd2x 37, %9, %3 \n\t"
	"stxvd2x 38, %10, %3 \n\t"
	"stxvd2x 39, %11, %3 \n\t"
	"addi %3, %3, 128 \n\t"
	"stxvd2x 40, 0, %3 \n\t"
	"stxvd2x 41, %5, %3 \n\t"
	"stxvd2x 42, %6, %3 \n\t"
	"stxvd2x 43, %7, %3 \n\t"
	"stxvd2x 44, %8, %3 \n\t"
	"stxvd2x 45, %9, %3 \n\t"
	"stxvd2x 46, %10, %3 \n\t"
	"stxvd2x 47, %11, %3 \n\t"
	"addi %3, %3, 128 \n\t"
	/* write the y data to x */
	"stxvd2x 48, 0, %4 \n\t"
	"stxvd2x 49, %5, %4 \n\t"
	"stxvd2x 50, %6, %4 \n\t"
	"stxvd2x 51, %7, %4 \n\t"
	"stxvd2x 52, %8, %4 \n\t"
	"stxvd2x 53, %9, %4 \n\t"
	"stxvd2x 54, %10, %4 \n\t"
	"stxvd2x 55, %11, %4 \n\t"
	"addi %4, %4, 128 \n\t"
	"stxvd2x 56, 0, %4 \n\t"
	"stxvd2x 57, %5, %4 \n\t"
	"stxvd2x 58, %6, %4 \n\t"
	"stxvd2x 59, %7, %4 \n\t"
	"stxvd2x 60, %8, %4 \n\t"
	"stxvd2x 61, %9, %4 \n\t"
	"stxvd2x 62, %10, %4 \n\t"
	"stxvd2x 63, %11, %4 \n\t"
	"addi %4, %4, 128 \n\t"
	"addic. %0 , %0 , -32 \n\t"
	"bgt 1b \n\t"
	"2: \n\t"
	:
	:
	"r" (i), // 0
	"r" (y1), // 1
	"r" (x1), // 2
	"r" (y2), // 3
	"r" (x2), // 4
	"r" (o16), // 5
	"r" (o32), // 6
	"r" (o48), // 7
	"r" (o64), // 8
	"r" (o80), // 9
	"r" (o96), // 10
	"r" (o112) // 11
	: "cr0", "%0", "%2" , "%1", "%3", "%4", "memory"
);
}

146
kernel/power/sasum.c Normal file
View File

@ -0,0 +1,146 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/28 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#if defined(POWER8)
#include "sasum_microk_power8.c"
#endif
#ifndef HAVE_KERNEL_32
/* Scalar fallback: accumulate |x1[0]| + ... + |x1[n-1]| (n a multiple
 * of 8) into four partial sums and write their total to svec[0];
 * svec[1..3] are zeroed so the caller can always add all four lanes.
 * Accumulation order per partial sum matches the original unrolled
 * code (lane j sees |x[j]| then |x[j+4]| each iteration). */
static void sasum_kernel_32(BLASLONG n, FLOAT *x1, FLOAT *svec)
{
	BLASLONG i, j;
	FLOAT *p = x1;
	FLOAT acc[4] = { 0.0, 0.0, 0.0, 0.0 };

	for ( i = 0; i < n; i += 8 )
	{
		for ( j = 0; j < 4; j++ )
			acc[j] += ABS(p[j]);
		for ( j = 0; j < 4; j++ )
			acc[j] += ABS(p[j+4]);
		p += 8;
	}

	svec[0] = acc[0] + acc[1] + acc[2] + acc[3];
	svec[1] = 0.0;
	svec[2] = 0.0;
	svec[3] = 0.0;
}
#endif
/* SASUM entry point: return the sum of absolute values of n elements
 * of x with stride inc_x.  Returns 0.0 for non-positive n or inc_x.
 * The contiguous path hands multiples of 32 to the vector kernel,
 * which leaves four partial sums in the aligned svec buffer. */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	FLOAT sumf = 0.0;
	BLASLONG i = 0;

	if (n <= 0 || inc_x <= 0) return(sumf);

	if ( inc_x == 1 )
	{
		FLOAT svec[4] __attribute__ ((aligned (16)));
		BLASLONG n1 = n & -32;
		if ( n1 > 0 )
		{
			sasum_kernel_32(n1, x, svec);
			sumf = svec[0] + svec[1] + svec[2] + svec[3];
			i = n1;
		}
		for ( ; i < n; i++ )
			sumf += ABS(x[i]);
	}
	else
	{
		/* strided: iterate index space of size n*inc_x */
		BLASLONG limit = n * inc_x;
		for ( ; i < limit; i += inc_x )
			sumf += ABS(x[i]);
	}
	return(sumf);
}

View File

@ -0,0 +1,177 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/28 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#define HAVE_KERNEL_32 1
/* POWER8 VSX kernel: compute partial absolute sums of x[0:n] for
 * single precision, n a positive multiple of 32.  vs32-vs39 accumulate
 * |x| for 32 floats per pass (xvabssp + xvaddsp) with a dcbt prefetch
 * 384 bytes ahead; at the end the eight accumulators are reduced into
 * vs32 and its four float lanes are stored to svec.  The caller adds
 * the four lanes of svec to obtain the scalar result.
 * NOTE(review): %0 and %2 are advanced by the asm although declared as
 * "r" inputs; operand %1 (n) is never referenced in the template.  See
 * the note on the dscal microkernels. */
static void sasum_kernel_32( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline));
static void sasum_kernel_32( BLASLONG n, FLOAT *x, FLOAT *svec)
{
BLASLONG i = n; /* remaining elements, 32 consumed per pass */
BLASLONG o16 = 16; /* byte offsets used as lxvw4x index operands */
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x; /* load cursor */
BLASLONG pre = 384; /* dcbt prefetch distance in bytes */
__asm__ __volatile__
(
	"dcbt %2 , %4 \n\t"
	/* clear the eight accumulators vs32-vs39 */
	"xxlxor 32,32,32 \n\t"
	"xxlxor 33,33,33 \n\t"
	"xxlxor 34,34,34 \n\t"
	"xxlxor 35,35,35 \n\t"
	"xxlxor 36,36,36 \n\t"
	"xxlxor 37,37,37 \n\t"
	"xxlxor 38,38,38 \n\t"
	"xxlxor 39,39,39 \n\t"
	/* prologue: preload 32 floats into vs40-vs47 */
	"lxvw4x 40, 0, %2 \n\t"
	"lxvw4x 41, %5, %2 \n\t"
	"lxvw4x 42, %6, %2 \n\t"
	"lxvw4x 43, %7, %2 \n\t"
	"lxvw4x 44, %8, %2 \n\t"
	"lxvw4x 45, %9, %2 \n\t"
	"lxvw4x 46, %10, %2 \n\t"
	"lxvw4x 47, %11, %2 \n\t"
	"addi %2, %2, 128 \n\t"
	"addic. %0 , %0 , -32 \n\t"
	"ble 2f \n\t"
	".align 5 \n\t"
	"1: \n\t"
	/* steady state: abs+accumulate current batch, load the next */
	"dcbt %2 , %4 \n\t"
	"xvabssp 48, 40 \n\t"
	"xvabssp 49, 41 \n\t"
	"xvabssp 50, 42 \n\t"
	"xvabssp 51, 43 \n\t"
	"lxvw4x 40, 0, %2 \n\t"
	"lxvw4x 41, %5, %2 \n\t"
	"xvabssp 52, 44 \n\t"
	"xvabssp 53, 45 \n\t"
	"lxvw4x 42, %6, %2 \n\t"
	"lxvw4x 43, %7, %2 \n\t"
	"xvabssp 54, 46 \n\t"
	"xvabssp 55, 47 \n\t"
	"lxvw4x 44, %8, %2 \n\t"
	"lxvw4x 45, %9, %2 \n\t"
	"xvaddsp 32, 32, 48 \n\t"
	"xvaddsp 33, 33, 49 \n\t"
	"lxvw4x 46, %10, %2 \n\t"
	"lxvw4x 47, %11, %2 \n\t"
	"xvaddsp 34, 34, 50 \n\t"
	"xvaddsp 35, 35, 51 \n\t"
	"addi %2, %2, 128 \n\t"
	"xvaddsp 36, 36, 52 \n\t"
	"xvaddsp 37, 37, 53 \n\t"
	"addic. %0 , %0 , -32 \n\t"
	"xvaddsp 38, 38, 54 \n\t"
	"xvaddsp 39, 39, 55 \n\t"
	"bgt 1b \n\t"
	"2: \n\t"
	/* epilogue: fold in the last batch */
	"xvabssp 48, 40 \n\t"
	"xvabssp 49, 41 \n\t"
	"xvabssp 50, 42 \n\t"
	"xvabssp 51, 43 \n\t"
	"xvabssp 52, 44 \n\t"
	"xvabssp 53, 45 \n\t"
	"xvabssp 54, 46 \n\t"
	"xvabssp 55, 47 \n\t"
	"xvaddsp 32, 32, 48 \n\t"
	"xvaddsp 33, 33, 49 \n\t"
	"xvaddsp 34, 34, 50 \n\t"
	"xvaddsp 35, 35, 51 \n\t"
	"xvaddsp 36, 36, 52 \n\t"
	"xvaddsp 37, 37, 53 \n\t"
	"xvaddsp 38, 38, 54 \n\t"
	"xvaddsp 39, 39, 55 \n\t"
	/* tree-reduce the eight accumulators into vs32 */
	"xvaddsp 32, 32, 33 \n\t"
	"xvaddsp 34, 34, 35 \n\t"
	"xvaddsp 36, 36, 37 \n\t"
	"xvaddsp 38, 38, 39 \n\t"
	"xvaddsp 32, 32, 34 \n\t"
	"xvaddsp 36, 36, 38 \n\t"
	"xvaddsp 32, 32, 36 \n\t"
	/* write the four partial sums to svec */
	"stxvw4x 32, 0, %3 \n\t"
	:
	:
	"r" (i), // 0
	"r" (n), // 1
	"r" (x1), // 2
	"r" (svec), // 3
	"r" (pre), // 4
	"r" (o16), // 5
	"r" (o32), // 6
	"r" (o48), // 7
	"r" (o64), // 8
	"r" (o80), // 9
	"r" (o96), // 10
	"r" (o112) // 11
	: "cr0", "%0", "%2", "memory"
);
}

131
kernel/power/scopy.c Normal file
View File

@ -0,0 +1,131 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#include "common.h"
#if defined(POWER8)
#include "scopy_microk_power8.c"
#endif
#ifndef HAVE_KERNEL_32
/* Scalar fallback: copy n elements from x to y, eight at a time
 * (n must be a multiple of 8).  Reads a block into temporaries before
 * writing, matching the original load-then-store ordering. */
static void scopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
	BLASLONG i, j;
	FLOAT buf[8];
	FLOAT *src = x;
	FLOAT *dst = y;

	for ( i = 0; i < n; i += 8 )
	{
		for ( j = 0; j < 8; j++ )
			buf[j] = src[j];
		for ( j = 0; j < 8; j++ )
			dst[j] = buf[j];
		src += 8;
		dst += 8;
	}
	return;
}
#endif
/* SCOPY entry point: copy n elements from x to y with strides
 * inc_x / inc_y.  Returns 0; no-op for n <= 0. */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	BLASLONG i = 0;

	if ( n <= 0 ) return(0);

	if ( (inc_x == 1) && (inc_y == 1 ))
	{
		/* contiguous: bulk-copy multiples of 32, then the remainder */
		BLASLONG n1 = n & -32;
		if ( n1 > 0 )
		{
			scopy_kernel_32(n1, x, y);
			i = n1;
		}
		for ( ; i < n; i++ )
			y[i] = x[i];
	}
	else
	{
		/* strided: element-by-element copy */
		BLASLONG ix = 0, iy = 0;
		for ( i = 0; i < n; i++ )
		{
			y[iy] = x[ix];
			ix += inc_x;
			iy += inc_y;
		}
	}
	return(0);
}

View File

@ -0,0 +1,131 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#define HAVE_KERNEL_32 1
/* POWER8 kernel: copy x[0:n] to y[0:n] for single precision, n a
 * positive multiple of 32.  Software-pipelined: each loop pass stores
 * the 32 floats held in vs40-vs47 while loading the next 32.
 * NOTE(review): %0, %1 and %2 are advanced by the asm although
 * declared as "r" inputs; operands %3 (alpha) and %4 (pre) are never
 * referenced in the template.  See the note on the dscal
 * microkernels. */
static void scopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
static void scopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
{
BLASLONG i = n; /* remaining elements, 32 consumed per pass */
BLASLONG o16 = 16; /* byte offsets used as lxvw4x/stxvw4x indices */
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x; /* load cursor */
FLOAT *y1=y; /* store cursor */
BLASLONG pre = 384; /* passed as %4 but not referenced in the template */
BLASLONG alpha=0; /* passed as %3 but not referenced in the template */
__asm__ __volatile__
(
	/* prologue: preload 32 floats into vs40-vs47 */
	"lxvw4x 40, 0, %2 \n\t"
	"lxvw4x 41, %5, %2 \n\t"
	"lxvw4x 42, %6, %2 \n\t"
	"lxvw4x 43, %7, %2 \n\t"
	"lxvw4x 44, %8, %2 \n\t"
	"lxvw4x 45, %9, %2 \n\t"
	"lxvw4x 46, %10, %2 \n\t"
	"lxvw4x 47, %11, %2 \n\t"
	"addi %2, %2, 128 \n\t"
	"addic. %0 , %0 , -32 \n\t"
	"ble 2f \n\t"
	".align 5 \n\t"
	"1: \n\t"
	/* steady state: store current batch while loading the next */
	"stxvw4x 40, 0, %1 \n\t"
	"stxvw4x 41, %5, %1 \n\t"
	"lxvw4x 40, 0, %2 \n\t"
	"lxvw4x 41, %5, %2 \n\t"
	"stxvw4x 42, %6, %1 \n\t"
	"stxvw4x 43, %7, %1 \n\t"
	"lxvw4x 42, %6, %2 \n\t"
	"lxvw4x 43, %7, %2 \n\t"
	"stxvw4x 44, %8, %1 \n\t"
	"stxvw4x 45, %9, %1 \n\t"
	"lxvw4x 44, %8, %2 \n\t"
	"lxvw4x 45, %9, %2 \n\t"
	"stxvw4x 46, %10, %1 \n\t"
	"stxvw4x 47, %11, %1 \n\t"
	"lxvw4x 46, %10, %2 \n\t"
	"lxvw4x 47, %11, %2 \n\t"
	"addi %1, %1, 128 \n\t"
	"addi %2, %2, 128 \n\t"
	"addic. %0 , %0 , -32 \n\t"
	"bgt 1b \n\t"
	"2: \n\t"
	/* epilogue: store the final preloaded batch */
	"stxvw4x 40, 0, %1 \n\t"
	"stxvw4x 41, %5, %1 \n\t"
	"stxvw4x 42, %6, %1 \n\t"
	"stxvw4x 43, %7, %1 \n\t"
	"stxvw4x 44, %8, %1 \n\t"
	"stxvw4x 45, %9, %1 \n\t"
	"stxvw4x 46, %10, %1 \n\t"
	"stxvw4x 47, %11, %1 \n\t"
	:
	:
	"r" (i), // 0
	"r" (y1), // 1
	"r" (x1), // 2
	"r" (alpha), // 3
	"r" (pre), // 4
	"r" (o16), // 5
	"r" (o32), // 6
	"r" (o48), // 7
	"r" (o64), // 8
	"r" (o80), // 9
	"r" (o96), // 10
	"r" (o112) // 11
	: "cr0", "%0", "%2" , "%1", "memory"
);
}

126
kernel/power/sdot.c Normal file
View File

@ -0,0 +1,126 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/21 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#include "common.h"
#if defined(POWER8)
#include "sdot_microk_power8.c"
#endif
#ifndef HAVE_KERNEL_16

/* Portable fallback dot-product kernel.
 * Accumulates x[k]*y[k] over k in [0, n), eight products per strip;
 * n must be a multiple of 8 (the caller passes n & -32).
 * The partial sum is ADDED into *d, not assigned. */
static void sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
{
	FLOAT acc = 0.0;
	BLASLONG k;

	for (k = 0; k < n; k += 8)
	{
		/* One 8-wide strip, summed in the same order as the
		 * original expression so the rounding is unchanged. */
		acc += y[k]   * x[k]
		     + y[k+1] * x[k+1]
		     + y[k+2] * x[k+2]
		     + y[k+3] * x[k+3]
		     + y[k+4] * x[k+4]
		     + y[k+5] * x[k+5]
		     + y[k+6] * x[k+6]
		     + y[k+7] * x[k+7] ;
	}

	*d += acc;
}

#endif
/* SDOT: returns the dot product of the n-element vectors x and y,
 * read with strides inc_x and inc_y.  Returns 0.0 for n <= 0.
 * Contiguous inputs are handed to sdot_kernel_16 in multiples of 32
 * elements; the remainder (and all strided inputs) use scalar code. */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	FLOAT dot = 0.0;
	BLASLONG i;

	if (n <= 0) return (dot);

	if ((inc_x == 1) && (inc_y == 1))
	{
		/* Fast path: both vectors are unit stride. */
		BLASLONG chunked = n & -32;   /* largest multiple of 32 <= n */

		if (chunked)
			sdot_kernel_16(chunked, x, y, &dot);

		for (i = chunked; i < n; i++)
			dot += y[i] * x[i];

		return (dot);
	}

	/* General strided path, two elements per iteration, then tail. */
	{
		BLASLONG ix = 0, iy = 0;
		BLASLONG paired = n & -2;

		for (i = 0; i < paired; i += 2)
		{
			dot += y[iy] * x[ix] + y[iy + inc_y] * x[ix + inc_x];
			ix += inc_x * 2;
			iy += inc_y * 2;
		}

		for (; i < n; i++)
		{
			dot += y[iy] * x[ix];
			ix += inc_x;
			iy += inc_y;
		}
	}

	return (dot);
}

View File

@ -0,0 +1,179 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/21 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#define HAVE_KERNEL_16 1
/* POWER8 VSX dot-product kernel: computes *dot = sum(x[k]*y[k], k in [0,n)).
 * Eight vector accumulators (vs32..vs39) each hold 4 float partial sums;
 * 32 elements are consumed per loop iteration, so n is expected to be a
 * multiple of 32 and at least 32 (the caller passes n & -32 — TODO confirm
 * at every call site).  noinline keeps the asm's register assumptions
 * intact regardless of the caller. */
static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
{
/* Loop counter, byte offsets for the indexed vector loads, and a
 * prefetch distance of 384 bytes. */
BLASLONG i = n;
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x;
FLOAT *y1=y;
BLASLONG pre = 384;
/* 4-lane landing area for the final vector accumulator.
 * NOTE(review): stxvw4x assumes 16-byte alignment on POWER8 targets
 * this kernel is built for; tempdot carries no explicit aligned
 * attribute — confirm ABI stack alignment suffices. */
FLOAT tempdot[4];
__asm__ __volatile__
(
/* Zero the eight vector accumulators vs32..vs39. */
"xxlxor 32,32,32 \n\t"
"xxlxor 33,33,33 \n\t"
"xxlxor 34,34,34 \n\t"
"xxlxor 35,35,35 \n\t"
"xxlxor 36,36,36 \n\t"
"xxlxor 37,37,37 \n\t"
"xxlxor 38,38,38 \n\t"
"xxlxor 39,39,39 \n\t"
/* Prefetch ahead, then preload the first 32 elements of x (vs40..47)
 * and y (vs48..55). */
"dcbt %2, %12 \n\t"
"dcbt %3, %12 \n\t"
"lxvw4x 40, 0, %2 \n\t"
"lxvw4x 48, 0, %3 \n\t"
"lxvw4x 41, %5, %2 \n\t"
"lxvw4x 49, %5, %3 \n\t"
"lxvw4x 42, %6, %2 \n\t"
"lxvw4x 50, %6, %3 \n\t"
"lxvw4x 43, %7, %2 \n\t"
"lxvw4x 51, %7, %3 \n\t"
"lxvw4x 44, %8, %2 \n\t"
"lxvw4x 52, %8, %3 \n\t"
"lxvw4x 45, %9, %2 \n\t"
"lxvw4x 53, %9, %3 \n\t"
"lxvw4x 46, %10, %2 \n\t"
"lxvw4x 54, %10, %3 \n\t"
"lxvw4x 47, %11, %2 \n\t"
"lxvw4x 55, %11, %3 \n\t"
"addi %2, %2, 128 \n\t"
"addi %3, %3, 128 \n\t"
"addic. %0 , %0 , -32 \n\t"
"ble 2f \n\t"
/* Main loop: multiply-accumulate the current 32 elements while
 * loading the next 32 (software pipelining). */
".align 5 \n\t"
"1: \n\t"
"dcbt %2, %12 \n\t"
"dcbt %3, %12 \n\t"
"xvmaddasp 32, 40, 48 \n\t"
"lxvw4x 40, 0, %2 \n\t"
"lxvw4x 48, 0, %3 \n\t"
"xvmaddasp 33, 41, 49 \n\t"
"lxvw4x 41, %5, %2 \n\t"
"lxvw4x 49, %5, %3 \n\t"
"xvmaddasp 34, 42, 50 \n\t"
"lxvw4x 42, %6, %2 \n\t"
"lxvw4x 50, %6, %3 \n\t"
"xvmaddasp 35, 43, 51 \n\t"
"lxvw4x 43, %7, %2 \n\t"
"lxvw4x 51, %7, %3 \n\t"
"xvmaddasp 36, 44, 52 \n\t"
"lxvw4x 44, %8, %2 \n\t"
"lxvw4x 52, %8, %3 \n\t"
"xvmaddasp 37, 45, 53 \n\t"
"lxvw4x 45, %9, %2 \n\t"
"lxvw4x 53, %9, %3 \n\t"
"xvmaddasp 38, 46, 54 \n\t"
"lxvw4x 46, %10, %2 \n\t"
"lxvw4x 54, %10, %3 \n\t"
"xvmaddasp 39, 47, 55 \n\t"
"lxvw4x 47, %11, %2 \n\t"
"lxvw4x 55, %11, %3 \n\t"
"addi %2, %2, 128 \n\t"
"addi %3, %3, 128 \n\t"
"addic. %0 , %0 , -32 \n\t"
"bgt 1b \n\t"
/* Epilogue: accumulate the final 32 preloaded elements, then reduce
 * the eight accumulators pairwise into vs32 and store its 4 lanes. */
"2: \n\t"
"xvmaddasp 32, 40, 48 \n\t"
"xvmaddasp 33, 41, 49 \n\t"
"xvmaddasp 34, 42, 50 \n\t"
"xvmaddasp 35, 43, 51 \n\t"
"xvmaddasp 36, 44, 52 \n\t"
"xvmaddasp 37, 45, 53 \n\t"
"xvmaddasp 38, 46, 54 \n\t"
"xvmaddasp 39, 47, 55 \n\t"
"xvaddsp 32, 32 , 33 \n\t"
"xvaddsp 34, 34 , 35 \n\t"
"xvaddsp 36, 36 , 37 \n\t"
"xvaddsp 38, 38 , 39 \n\t"
"xvaddsp 32, 32 , 34 \n\t"
"xvaddsp 36, 36 , 38 \n\t"
"xvaddsp 32, 32 , 36 \n\t"
"stxvw4x 32, 0 , %4 \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x1), // 2
"r" (y1), // 3
"r" (tempdot), // 4
"r" (o16), // 5
"r" (o32), // 6
"r" (o48), // 7
"r" (o64), // 8
"r" (o80), // 9
"r" (o96), // 10
"r" (o112), // 11
"r" (pre) // 12
/* NOTE(review): %0, %2, %3 are declared as inputs yet modified by the
 * asm, with the mutation declared via the clobber list.  Modern GCC
 * documentation recommends "+r" input/output operands instead —
 * confirm this form is accepted by all supported compilers. */
: "cr0", "%0", "%2" , "%3", "memory"
);
/* Horizontal reduction of the 4 surviving lanes. */
*dot = tempdot[0] + tempdot[1] + tempdot[2] + tempdot[3];
}

View File

@ -0,0 +1,371 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/02 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
// Register- and layout-name definitions for the POWER8 SGEMM kernel.
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"
// LOAD: pointer-sized load (word on 32-bit, doubleword on 64-bit).
#ifndef __64BIT__
#define LOAD lwz
#else
#define LOAD ld
#endif
// Stack frame size and fixed slots for alpha / the FP zero constant.
#ifdef __64BIT__
#define STACKSIZE 32752
#define ALPHA_SP 296(SP)
#define FZERO 304(SP)
#else
#define STACKSIZE 240
#define ALPHA_SP 224(SP)
#define FZERO 232(SP)
#endif
// GEMM problem dimensions arrive in r3..r5 on every supported ABI.
#define M r3
#define N r4
#define K r5
// Remaining argument registers differ per ABI: alpha occupies FPRs, so
// the integer argument slots shift between 32/64-bit and OS variants.
#ifdef linux
#ifndef __64BIT__
#define A r6
#define B r7
#define C r8
#define LDC r9
#define OFFSET r10
#else
#define A r7
#define B r8
#define C r9
#define LDC r10
#define OFFSET r6
#endif
#endif
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define A r8
#define B r9
#define C r10
#define LDC r7
#define OFFSET r6
#else
#define A r7
#define B r8
#define C r9
#define LDC r10
#define OFFSET r6
#endif
#endif
// alpha broadcast: scalar copy and 4-lane vector copy.
#define alpha_r vs30
#define alpha_vr vs31
#define o0 0
// Callee-saved GPRs used as scratch by the inner-loop macros.
#define FRAMEPOINTER r12
#define BBUFFER r14
#define o4 r15
#define o12 r16
#define o8 r17
#define L r18
#define T1 r19
#define KK r20
#define BBO r21
#define I r22
#define J r23
#define AO r24
#define BO r25
#define CO r26
#define o16 r27
#define o32 r28
#define o48 r29
#define PRE r30
#define T2 r31
// Inner-loop compute macros (16x8 micro-kernel) live here.
#include "sgemm_macros_16x8_power8.S"
#ifndef NEEDPARAM

// SGEMM kernel entry: saves callee-saved state, broadcasts alpha,
// then falls into the generated 16x8 blocking logic.
PROLOGUE
PROFCODE

// Keep the incoming SP for argument access, then reserve 4*STACKSIZE
// bytes (the B buffer is carved out of this frame below).
mr FRAMEPOINTER, SP
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
li r0, 0

// Save callee-saved FPRs f14..f31.
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)

stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)

stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)

stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)

stfd f30, 128(SP)
stfd f31, 136(SP)

// Save callee-saved GPRs r14..r31 (width depends on the ABI).
#ifdef __64BIT__
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
#else
stw r31, 144(SP)
stw r30, 148(SP)
stw r29, 152(SP)
stw r28, 156(SP)
stw r27, 160(SP)
stw r26, 164(SP)
stw r25, 168(SP)
stw r24, 172(SP)
stw r23, 176(SP)
stw r22, 180(SP)
stw r21, 184(SP)
stw r20, 188(SP)
stw r19, 192(SP)
stw r18, 196(SP)
stw r17, 200(SP)
stw r16, 204(SP)
stw r15, 208(SP)
stw r14, 212(SP)
#endif

// stfd f1, ALPHA_SP
// stw r0, FZERO

// ABIs that pass LDC on the caller's stack fetch it here.
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif
#endif

// LDC is in elements; scale to bytes (x4 for single precision).
slwi LDC, LDC, 2

// TRMM variant: the OFFSET argument also arrives on the stack.
#if defined(TRMMKERNEL)
#if defined(linux) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif
#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER)
#else
#ifdef DOUBLE
lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#else
lwz OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif
#endif
#endif
#endif

// Degenerate problems (any dimension <= 0) skip straight to cleanup.
cmpwi cr0, M, 0
ble L999_H1
cmpwi cr0, N, 0
ble L999_H1
cmpwi cr0, K, 0
ble L999_H1

// Prefetch distance and small byte-offset constants for indexed
// vector loads/stores in the macros.
li PRE, 256
li o4 , 4
li o8 , 8
li o12, 12
li o16, 16
li o32, 32
li o48, 48

// Carve a 4 KiB-aligned B buffer out of the reserved stack space.
addi BBUFFER, SP, 512+4096
li T1, -4096
and BBUFFER, BBUFFER, T1

// Broadcast alpha (f1) through a stack scratch slot into the scalar
// register alpha_r and the 4-lane vector register alpha_vr.
addi T1, SP, 300
stxsspx f1, o0 , T1
stxsspx f1, o4 , T1
stxsspx f1, o8 , T1
stxsspx f1, o12 , T1
lxsspx alpha_r, o0, T1
lxvw4x alpha_vr, o0, T1

// Main blocking/compute logic (defines L999_H1 and falls through
// to L999 when done).
#include "sgemm_logic_16x8_power8.S"

// Epilogue: return 0 and restore all saved registers.
L999:
addi r3, 0, 0

lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)

lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)

lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)

lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)

lfd f30, 128(SP)
lfd f31, 136(SP)

#ifdef __64BIT__
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
#else
lwz r31, 144(SP)
lwz r30, 148(SP)
lwz r29, 152(SP)
lwz r28, 156(SP)
lwz r27, 160(SP)
lwz r26, 164(SP)
lwz r25, 168(SP)
lwz r24, 172(SP)
lwz r23, 176(SP)
lwz r22, 180(SP)
lwz r21, 184(SP)
lwz r20, 188(SP)
lwz r19, 192(SP)
lwz r18, 196(SP)
lwz r17, 200(SP)
lwz r16, 204(SP)
lwz r15, 208(SP)
lwz r14, 212(SP)
#endif

// Release the four STACKSIZE chunks reserved in the prologue.
addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
blr

EPILOGUE
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

167
kernel/power/srot.c Normal file
View File

@ -0,0 +1,167 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/26 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#include "common.h"
#pragma GCC optimize "O1"
#if defined(POWER8)
#include "srot_microk_power8.c"
#endif
#ifndef HAVE_KERNEL_16

/* Portable fallback Givens-rotation kernel.  For each k:
 *   x[k] = c*x[k] + s*y[k]
 *   y[k] = c*y[k] - s*x[k]   (using the original x[k])
 * Four elements are handled per strip; n must be a multiple of 4
 * (the caller passes n & -16).  c and s point to broadcast copies of
 * the scalars; only element 0 is read here. */
static void srot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{
	FLOAT cs = *c;
	FLOAT sn = *s;
	BLASLONG k;

	for (k = 0; k < n; k += 4)
	{
		/* Compute all new values before storing, so each pair
		 * uses the pre-rotation x and y. */
		FLOAT nx0 = cs * x[k]   + sn * y[k];
		FLOAT ny0 = cs * y[k]   - sn * x[k];
		FLOAT nx1 = cs * x[k+1] + sn * y[k+1];
		FLOAT ny1 = cs * y[k+1] - sn * x[k+1];
		FLOAT nx2 = cs * x[k+2] + sn * y[k+2];
		FLOAT ny2 = cs * y[k+2] - sn * x[k+2];
		FLOAT nx3 = cs * x[k+3] + sn * y[k+3];
		FLOAT ny3 = cs * y[k+3] - sn * x[k+3];

		x[k]   = nx0;  y[k]   = ny0;
		x[k+1] = nx1;  y[k+1] = ny1;
		x[k+2] = nx2;  y[k+2] = ny2;
		x[k+3] = nx3;  y[k+3] = ny3;
	}

	return;
}

#endif
/* SROT: applies the plane rotation (c, s) to the n-element vectors x
 * and y with strides inc_x / inc_y.  Unit-stride inputs are processed
 * 16 elements at a time by srot_kernel_16, which receives the scalars
 * broadcast into 16-byte-aligned 4-element arrays; remainders and
 * strided inputs use scalar code.  Always returns 0. */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
	BLASLONG i = 0;
	FLOAT cbuf[4] __attribute__ ((aligned (16)));
	FLOAT sbuf[4] __attribute__ ((aligned (16)));

	if (n <= 0) return (0);

	if ((inc_x == 1) && (inc_y == 1))
	{
		BLASLONG chunked = n & -16;  /* largest multiple of 16 <= n */

		if (chunked > 0)
		{
			/* Broadcast the scalars for the vector kernel. */
			cbuf[0] = cbuf[1] = cbuf[2] = cbuf[3] = c;
			sbuf[0] = sbuf[1] = sbuf[2] = sbuf[3] = s;
			srot_kernel_16(chunked, x, y, cbuf, sbuf);
			i = chunked;
		}

		for (; i < n; i++)
		{
			FLOAT rotated = c * x[i] + s * y[i];
			y[i] = c * y[i] - s * x[i];
			x[i] = rotated;
		}
	}
	else
	{
		BLASLONG ix = 0, iy = 0;

		for (; i < n; i++)
		{
			FLOAT rotated = c * x[ix] + s * y[iy];
			y[iy] = c * y[iy] - s * x[ix];
			x[ix] = rotated;
			ix += inc_x;
			iy += inc_y;
		}
	}

	return (0);
}

View File

@ -0,0 +1,208 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/27 Werner Saar (wernsaar@googlemail.com)
*
* Fused multiply-add is deliberately not used here (precision problems with LAPACK)
*
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#define HAVE_KERNEL_16 1

/* POWER8 VSX Givens-rotation kernel: for k in [0, n)
 *   x[k] = c*x[k] + s*y[k]
 *   y[k] = c*y[k] - s*x[k]
 * 16 elements per iteration, so n must be a multiple of 16 and >= 16
 * (the caller passes n & -16 and only calls when it is positive).
 * c and s point to 16-byte-aligned 4-element broadcast arrays.
 * Separate multiply + add/sub is used instead of fused multiply-add
 * on purpose (see file header). */
static void srot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) __attribute__ ((noinline));

static void srot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{
/* Loop counter and byte offsets for indexed vector loads/stores. */
BLASLONG i = n;
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
/* x1/y1 are the load pointers; x2/y2 start at x+1/y+1 and are rewound
 * by 4 bytes (one float) inside the asm, giving distinct store-pointer
 * registers that alias x/y. */
FLOAT *x1=x;
FLOAT *y1=y;
FLOAT *x2=x+1;
FLOAT *y2=y+1;

__asm__ __volatile__
(

"lxvw4x 36 , 0, %3 \n\t" // load c
"lxvw4x 37 , 0, %4 \n\t" // load s
"addi %8 , %8, -4 \n\t"
"addi %9 , %9, -4 \n\t"

// Preload the first 16 x values (vs32..35) and y values (vs40..43).
"lxvw4x 32, 0, %1 \n\t" // load x
"lxvw4x 33, %5, %1 \n\t"
"lxvw4x 34, %6, %1 \n\t"
"lxvw4x 35, %7, %1 \n\t"

"lxvw4x 40, 0, %2 \n\t" // load y
"lxvw4x 41, %5, %2 \n\t"
"lxvw4x 42, %6, %2 \n\t"
"lxvw4x 43, %7, %2 \n\t"

"addi %1, %1, 64 \n\t"
"addi %2, %2, 64 \n\t"

"addic. %0 , %0 , -16 \n\t"
"ble 2f \n\t"

// Main loop: rotate the current 16 elements while loading the next 16.
".align 5 \n\t"
"1: \n\t"

"xvmulsp 48, 32, 36 \n\t" // c * x
"xvmulsp 49, 33, 36 \n\t"
"xvmulsp 50, 34, 36 \n\t"
"xvmulsp 51, 35, 36 \n\t"

"xvmulsp 56, 40, 36 \n\t" // c * y
"xvmulsp 57, 41, 36 \n\t"
"xvmulsp 58, 42, 36 \n\t"
"xvmulsp 59, 43, 36 \n\t"

"xvmulsp 52, 32, 37 \n\t" // s * x
"xvmulsp 53, 33, 37 \n\t"

"lxvw4x 32, 0, %1 \n\t" // load x
"lxvw4x 33, %5, %1 \n\t"

"xvmulsp 54, 34, 37 \n\t"
"xvmulsp 55, 35, 37 \n\t"

"lxvw4x 34, %6, %1 \n\t"
"lxvw4x 35, %7, %1 \n\t"

"xvmulsp 60, 40, 37 \n\t" // s * y
"xvmulsp 61, 41, 37 \n\t"

"lxvw4x 40, 0, %2 \n\t" // load y
"lxvw4x 41, %5, %2 \n\t"

"xvmulsp 62, 42, 37 \n\t"
"xvmulsp 63, 43, 37 \n\t"

"lxvw4x 42, %6, %2 \n\t"
"lxvw4x 43, %7, %2 \n\t"

"xvaddsp 48, 48 , 60 \n\t" // c * x + s * y
"xvaddsp 49, 49 , 61 \n\t" // c * x + s * y

"addi %1, %1, 64 \n\t"
"addi %2, %2, 64 \n\t"

"xvaddsp 50, 50 , 62 \n\t" // c * x + s * y
"xvaddsp 51, 51 , 63 \n\t" // c * x + s * y

"xvsubsp 56, 56 , 52 \n\t" // c * y - s * x
"xvsubsp 57, 57 , 53 \n\t" // c * y - s * x
"xvsubsp 58, 58 , 54 \n\t" // c * y - s * x
"xvsubsp 59, 59 , 55 \n\t" // c * y - s * x

"stxvw4x 48, 0, %8 \n\t" // store x
"stxvw4x 49, %5, %8 \n\t"
"stxvw4x 50, %6, %8 \n\t"
"stxvw4x 51, %7, %8 \n\t"

"stxvw4x 56, 0, %9 \n\t" // store y
"stxvw4x 57, %5, %9 \n\t"
"stxvw4x 58, %6, %9 \n\t"
"stxvw4x 59, %7, %9 \n\t"

"addi %8, %8, 64 \n\t"
"addi %9, %9, 64 \n\t"

"addic. %0 , %0 , -16 \n\t"
"bgt 1b \n\t"

// Epilogue: rotate and store the last preloaded 16 elements.
"2: \n\t"

"xvmulsp 48, 32, 36 \n\t" // c * x
"xvmulsp 49, 33, 36 \n\t"
"xvmulsp 50, 34, 36 \n\t"
"xvmulsp 51, 35, 36 \n\t"

"xvmulsp 56, 40, 36 \n\t" // c * y
"xvmulsp 57, 41, 36 \n\t"
"xvmulsp 58, 42, 36 \n\t"
"xvmulsp 59, 43, 36 \n\t"

"xvmulsp 52, 32, 37 \n\t" // s * x
"xvmulsp 53, 33, 37 \n\t"
"xvmulsp 54, 34, 37 \n\t"
"xvmulsp 55, 35, 37 \n\t"

"xvmulsp 60, 40, 37 \n\t" // s * y
"xvmulsp 61, 41, 37 \n\t"
"xvmulsp 62, 42, 37 \n\t"
"xvmulsp 63, 43, 37 \n\t"

"xvaddsp 48, 48 , 60 \n\t" // c * x + s * y
"xvaddsp 49, 49 , 61 \n\t" // c * x + s * y
"xvaddsp 50, 50 , 62 \n\t" // c * x + s * y
"xvaddsp 51, 51 , 63 \n\t" // c * x + s * y

"xvsubsp 56, 56 , 52 \n\t" // c * y - s * x
"xvsubsp 57, 57 , 53 \n\t" // c * y - s * x
"xvsubsp 58, 58 , 54 \n\t" // c * y - s * x
"xvsubsp 59, 59 , 55 \n\t" // c * y - s * x

"stxvw4x 48, 0, %8 \n\t" // store x
"stxvw4x 49, %5, %8 \n\t"
"stxvw4x 50, %6, %8 \n\t"
"stxvw4x 51, %7, %8 \n\t"

"stxvw4x 56, 0, %9 \n\t" // store y
"stxvw4x 57, %5, %9 \n\t"
"stxvw4x 58, %6, %9 \n\t"
"stxvw4x 59, %7, %9 \n\t"

:
:
"r" (i), // 0
"r" (x1), // 1
"r" (y1), // 2
"r" (c), // 3
"r" (s), // 4
"r" (o16), // 5
"r" (o32), // 6
"r" (o48), // 7
"r" (x2), // 8
"r" (y2) // 9
/* NOTE(review): %0, %1, %2, %8, %9 are declared as inputs yet modified
 * inside the asm, with the mutation declared via the clobber list —
 * "+r" operands are the documented modern form; confirm compiler
 * support. */
: "cr0", "%0", "%1" , "%2", "%8", "%9", "memory"
);

}

179
kernel/power/sscal.c Normal file
View File

@ -0,0 +1,179 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/27 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#include "common.h"
#if defined(POWER8)
#include "sscal_microk_power8.c"
#endif
#if !defined(HAVE_KERNEL_16)

/* Portable fallback scale kernel: x[k] *= *da for k in [0, n).
 * n must be a multiple of 8 (the caller passes n & -32). */
static void sscal_kernel_16( BLASLONG n, FLOAT *da , FLOAT *x )
{
	FLOAT factor = *da;
	BLASLONG k;

	for (k = 0; k < n; k += 8)
	{
		x[k]   *= factor;
		x[k+1] *= factor;
		x[k+2] *= factor;
		x[k+3] *= factor;
		x[k+4] *= factor;
		x[k+5] *= factor;
		x[k+6] *= factor;
		x[k+7] *= factor;
	}
}

/* Portable fallback zero kernel: sets the first n elements of x to
 * 0.0; da is ignored.  n must be a multiple of 8. */
static void sscal_kernel_16_zero( BLASLONG n, FLOAT *da , FLOAT *x )
{
	BLASLONG k;

	for (k = 0; k < n; k++)
		x[k] = 0.0;
}

#endif
/* SSCAL: x := da * x over n elements with stride inc_x.  The dummy
 * arguments exist only to match the shared scal kernel signature.
 * da == 0 takes a dedicated store-only path; the vector kernels take
 * the scalar broadcast into a 16-byte-aligned 4-element array.
 * Returns 0; no-op when n <= 0 or inc_x <= 0. */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	FLOAT factor[4] __attribute__ ((aligned (16)));
	BLASLONG j;

	if (n <= 0 || inc_x <= 0)
		return (0);

	if (inc_x == 1)
	{
		BLASLONG chunked = n & -32;  /* largest multiple of 32 <= n */

		/* Broadcast the scalar for the vector kernels. */
		factor[0] = factor[1] = factor[2] = factor[3] = da;

		if (da == 0.0)
		{
			if (chunked > 0)
				sscal_kernel_16_zero(chunked, factor, x);

			for (j = chunked; j < n; j++)
				x[j] = 0.0;
		}
		else
		{
			if (chunked > 0)
				sscal_kernel_16(chunked, factor, x);

			for (j = chunked; j < n; j++)
				x[j] = da * x[j];
		}
	}
	else
	{
		/* Strided path: walk the vector one element at a time. */
		BLASLONG pos = 0;

		if (da == 0.0)
		{
			for (j = 0; j < n; j++)
			{
				x[pos] = 0.0;
				pos += inc_x;
			}
		}
		else
		{
			for (j = 0; j < n; j++)
			{
				x[pos] = da * x[pos];
				pos += inc_x;
			}
		}
	}

	return 0;
}

View File

@ -0,0 +1,218 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/27 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#define HAVE_KERNEL_16 1

/* POWER8 VSX scale kernel: x[k] *= alpha[0] for k in [0, n).
 * alpha points to a 16-byte-aligned 4-element broadcast of the scalar.
 * 32 elements are consumed per iteration, so n must be a multiple of
 * 32 and >= 32 (the caller passes n & -32 and only calls when it is
 * positive). */
static void sscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline));

static void sscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
{

/* Loop counter, byte offsets for indexed vector accesses, and a
 * 384-byte prefetch distance. */
BLASLONG i = n;
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
/* x1 is the load pointer; x2 starts at x+1 and is rewound by 4 bytes
 * (one float) inside the asm, giving a distinct store-pointer register
 * that aliases x. */
FLOAT *x1=x;
FLOAT *x2=x+1;
BLASLONG pre = 384;

__asm__ __volatile__
(

"lxvw4x 32, 0, %3 \n\t" // load the broadcast alpha
"addi %1, %1, -4 \n\t"

// Preload the first 32 elements into vs40..vs47.
"dcbt %2, %4 \n\t"

"lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %5, %2 \n\t"
"lxvw4x 42, %6, %2 \n\t"
"lxvw4x 43, %7, %2 \n\t"
"lxvw4x 44, %8, %2 \n\t"
"lxvw4x 45, %9, %2 \n\t"
"lxvw4x 46, %10, %2 \n\t"
"lxvw4x 47, %11, %2 \n\t"

"addi %2, %2, 128 \n\t"

"addic. %0 , %0 , -32 \n\t"
"ble 2f \n\t"

// Main loop: scale the current 32 elements while loading the next 32.
".align 5 \n\t"
"1: \n\t"

"dcbt %2, %4 \n\t"

"xvmulsp 48, 40, 32 \n\t"
"xvmulsp 49, 41, 32 \n\t"
"lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %5, %2 \n\t"
"xvmulsp 50, 42, 32 \n\t"
"xvmulsp 51, 43, 32 \n\t"
"lxvw4x 42, %6, %2 \n\t"
"lxvw4x 43, %7, %2 \n\t"
"xvmulsp 52, 44, 32 \n\t"
"xvmulsp 53, 45, 32 \n\t"
"lxvw4x 44, %8, %2 \n\t"
"lxvw4x 45, %9, %2 \n\t"
"xvmulsp 54, 46, 32 \n\t"
"xvmulsp 55, 47, 32 \n\t"
"lxvw4x 46, %10, %2 \n\t"
"lxvw4x 47, %11, %2 \n\t"

"stxvw4x 48, 0, %1 \n\t"
"stxvw4x 49, %5, %1 \n\t"
"stxvw4x 50, %6, %1 \n\t"
"stxvw4x 51, %7, %1 \n\t"
"stxvw4x 52, %8, %1 \n\t"
"stxvw4x 53, %9, %1 \n\t"
"stxvw4x 54, %10, %1 \n\t"
"stxvw4x 55, %11, %1 \n\t"

"addi %1, %1, 128 \n\t"
"addi %2, %2, 128 \n\t"

"addic. %0 , %0 , -32 \n\t"
"bgt 1b \n\t"

// Epilogue: scale and store the last preloaded 32 elements.
"2: \n\t"

"xvmulsp 48, 40, 32 \n\t"
"xvmulsp 49, 41, 32 \n\t"
"xvmulsp 50, 42, 32 \n\t"
"xvmulsp 51, 43, 32 \n\t"
"xvmulsp 52, 44, 32 \n\t"
"xvmulsp 53, 45, 32 \n\t"
"xvmulsp 54, 46, 32 \n\t"
"xvmulsp 55, 47, 32 \n\t"

"stxvw4x 48, 0, %1 \n\t"
"stxvw4x 49, %5, %1 \n\t"
"stxvw4x 50, %6, %1 \n\t"
"stxvw4x 51, %7, %1 \n\t"
"stxvw4x 52, %8, %1 \n\t"
"stxvw4x 53, %9, %1 \n\t"
"stxvw4x 54, %10, %1 \n\t"
"stxvw4x 55, %11, %1 \n\t"


:
:
"r" (i), // 0
"r" (x2), // 1
"r" (x1), // 2
"r" (alpha), // 3
"r" (pre), // 4
"r" (o16), // 5
"r" (o32), // 6
"r" (o48), // 7
"r" (o64), // 8
"r" (o80), // 9
"r" (o96), // 10
"r" (o112) // 11
/* NOTE(review): %0, %1, %2 are inputs modified inside the asm,
 * declared via the clobber list rather than "+r" operands — confirm
 * compiler support. */
: "cr0", "%0", "%2" , "%1", "memory"
);

}

/* POWER8 VSX zero kernel: sets the first n elements of x to 0.0.
 * alpha and the prefetch distance are passed for signature symmetry
 * but are not referenced by the asm body.  n must be a multiple of 32
 * and >= 32 (the loop body runs at least once before the counter
 * check). */
static void sscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline));

static void sscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
{

BLASLONG i = n;
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x;
/* x2 = x+1 rewound by one float inside the asm: store pointer aliasing x. */
FLOAT *x2=x+1;
BLASLONG pre = 384;

__asm__ __volatile__
(

// vs32 := 0; stream 32 zeros (128 bytes) per iteration.
"xxlxor 32 , 32 , 32 \n\t"
"addi %1, %1, -4 \n\t"


".align 5 \n\t"
"1: \n\t"

"stxvw4x 32, 0, %1 \n\t"
"stxvw4x 32, %5, %1 \n\t"
"stxvw4x 32, %6, %1 \n\t"
"stxvw4x 32, %7, %1 \n\t"
"stxvw4x 32, %8, %1 \n\t"
"stxvw4x 32, %9, %1 \n\t"
"stxvw4x 32, %10, %1 \n\t"
"stxvw4x 32, %11, %1 \n\t"

"addi %1, %1, 128 \n\t"

"addic. %0 , %0 , -32 \n\t"
"bgt 1b \n\t"

"2: \n\t"

:
:
"r" (i), // 0
"r" (x2), // 1
"r" (x1), // 2
"r" (alpha), // 3
"r" (pre), // 4
"r" (o16), // 5
"r" (o32), // 6
"r" (o48), // 7
"r" (o64), // 8
"r" (o80), // 9
"r" (o96), // 10
"r" (o112) // 11
: "cr0", "%0", "%2" , "%1", "memory"
);

}

154
kernel/power/sswap.c Normal file
View File

@ -0,0 +1,154 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#include "common.h"
#if defined(POWER8)
#include "sswap_microk_power8.c"
#endif
#ifndef HAVE_KERNEL_32
/* Scalar fallback: exchange the first n elements of x and y.
 * The caller passes n rounded to a multiple of 32 (n & -32), so eight
 * elements are swapped per outer-loop pass with no tail handling here. */
static void sswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
	BLASLONG idx = 0;
	BLASLONG lane;
	FLOAT tmp;

	while ( idx < n )
	{
		/* swap one group of eight consecutive elements */
		for ( lane = 0; lane < 8; lane++ )
		{
			tmp           = x[idx + lane];
			x[idx + lane] = y[idx + lane];
			y[idx + lane] = tmp;
		}
		idx += 8;
	}
	return;
}
#endif
/* BLAS SWAP entry point: exchange vectors x and y element-wise.
 * The dummy arguments exist only to match the common OpenBLAS kernel
 * signature and are ignored.  Always returns 0. */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG i;
	FLOAT tmp;

	if ( n <= 0 ) return(0);

	if ( (inc_x == 1) && (inc_y == 1 ))
	{
		/* contiguous case: bulk handled 32 elements at a time */
		BLASLONG n1 = n & -32;
		if ( n1 > 0 ) sswap_kernel_32(n1, x, y);

		/* scalar tail for the remaining n - n1 elements */
		for ( i = n1; i < n; i++ )
		{
			tmp  = y[i];
			y[i] = x[i];
			x[i] = tmp;
		}
	}
	else
	{
		/* strided case: walk both vectors with their own increments */
		BLASLONG ix = 0, iy = 0;
		for ( i = 0; i < n; i++ )
		{
			tmp   = y[iy];
			y[iy] = x[ix];
			x[ix] = tmp;
			ix += inc_x;
			iy += inc_y;
		}
	}
	return(0);
}

View File

@ -0,0 +1,136 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#define HAVE_KERNEL_32 1
// VSX swap kernel: exchanges x[0..n-1] and y[0..n-1], 32 single-precision
// elements (128 bytes) per loop pass.  n is assumed to be a positive
// multiple of 32.
//
// x2/y2 start at x+1/y+1 and are rewound by 4 bytes in the asm
// ("addi %3/%4, -4"), which gives the asm independent register copies of
// the base pointers: %1/%2 advance through the loads, %3/%4 through the
// stores.
//
// Fix: removed the unused locals `pre` and `alpha` — neither appeared in
// the asm operand list, so they were dead code.
static void sswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
static void sswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
{
BLASLONG i = n;          // elements remaining; decremented by 32 per pass
BLASLONG o16 = 16;       // byte offsets for the indexed vector loads/stores
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x;
FLOAT *y1=y;
FLOAT *x2=x+1;           // rewound by 4 bytes in the asm -> equals x
FLOAT *y2=y+1;           // rewound by 4 bytes in the asm -> equals y
__asm__ __volatile__
(
"addi %3, %3, -4 \n\t"         // %3 = y2 - 4 bytes = y (store pointer)
"addi %4, %4, -4 \n\t"         // %4 = x2 - 4 bytes = x (store pointer)
".align 5 \n\t"
"1: \n\t"
"lxvw4x 32, 0, %2 \n\t"        // vs32..vs39 <- 32 floats from x
"lxvw4x 33, %5, %2 \n\t"
"lxvw4x 34, %6, %2 \n\t"
"lxvw4x 35, %7, %2 \n\t"
"lxvw4x 36, %8, %2 \n\t"
"lxvw4x 37, %9, %2 \n\t"
"lxvw4x 38, %10, %2 \n\t"
"lxvw4x 39, %11, %2 \n\t"
"addi %2, %2, 128 \n\t"
"lxvw4x 48, 0, %1 \n\t"        // vs48..vs55 <- 32 floats from y
"lxvw4x 49, %5, %1 \n\t"
"lxvw4x 50, %6, %1 \n\t"
"lxvw4x 51, %7, %1 \n\t"
"lxvw4x 52, %8, %1 \n\t"
"lxvw4x 53, %9, %1 \n\t"
"lxvw4x 54, %10, %1 \n\t"
"lxvw4x 55, %11, %1 \n\t"
"addi %1, %1, 128 \n\t"
"stxvw4x 32, 0, %3 \n\t"       // old x values -> y
"stxvw4x 33, %5, %3 \n\t"
"stxvw4x 34, %6, %3 \n\t"
"stxvw4x 35, %7, %3 \n\t"
"stxvw4x 36, %8, %3 \n\t"
"stxvw4x 37, %9, %3 \n\t"
"stxvw4x 38, %10, %3 \n\t"
"stxvw4x 39, %11, %3 \n\t"
"addi %3, %3, 128 \n\t"
"stxvw4x 48, 0, %4 \n\t"       // old y values -> x
"stxvw4x 49, %5, %4 \n\t"
"stxvw4x 50, %6, %4 \n\t"
"stxvw4x 51, %7, %4 \n\t"
"stxvw4x 52, %8, %4 \n\t"
"stxvw4x 53, %9, %4 \n\t"
"stxvw4x 54, %10, %4 \n\t"
"stxvw4x 55, %11, %4 \n\t"
"addi %4, %4, 128 \n\t"
"addic. %0 , %0 , -32 \n\t"    // i -= 32, updating cr0
"bgt 1b \n\t"
"2: \n\t"
:
:
"r" (i), // 0
"r" (y1), // 1
"r" (x1), // 2
"r" (y2), // 3
"r" (x2), // 4
"r" (o16), // 5
"r" (o32), // 6
"r" (o48), // 7
"r" (o64), // 8
"r" (o80), // 9
"r" (o96), // 10
"r" (o112) // 11
: "cr0", "%0", "%2" , "%1", "%3", "%4", "memory"
);
}

View File

@ -0,0 +1,369 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/02 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"
#ifndef __64BIT__
#define LOAD lwz
#else
#define LOAD ld
#endif
#ifdef __64BIT__
#define STACKSIZE 340
#define ALPHA_SP 296(SP)
#define FZERO 304(SP)
#else
#define STACKSIZE 240
#define ALPHA_SP 224(SP)
#define FZERO 232(SP)
#endif
#define M r3
#define N r4
#define K r5
#ifdef linux
#ifndef __64BIT__
#define A r6
#define B r7
#define C r8
#define LDC r9
#define OFFSET r10
#else
#define A r7
#define B r8
#define C r9
#define LDC r10
#define OFFSET r6
#endif
#endif
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define A r8
#define B r9
#define C r10
#define LDC r7
#define OFFSET r6
#else
#define A r7
#define B r8
#define C r9
#define LDC r10
#define OFFSET r6
#endif
#endif
#define alpha_r vs30
#define alpha_vr vs31
#define o0 0
#define TBUFFER r13
#define o12 r14
#define o4 r15
#define K1 r16
#define o8 r17
#define L r18
#define T1 r19
#define KK r20
#define KKK r21
#define I r22
#define J r23
#define AO r24
#define BO r25
#define CO r26
#define o16 r27
#define o32 r28
#define o48 r29
#define PRE r30
#define T2 r31
#include "strmm_macros_16x8_power8.S"
#ifndef NEEDPARAM
// Function body: save non-volatile state, fetch stack-passed parameters,
// broadcast alpha, then fall into the generated TRMM logic.
PROLOGUE
PROFCODE
// Reserve the frame and save non-volatile FPRs f14-f31.
addi SP, SP, -STACKSIZE
li r0, 0
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
// Save non-volatile GPRs r13-r31 (64-bit and 32-bit layouts differ).
#ifdef __64BIT__
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
std r13, 288(SP)
#else
stw r31, 144(SP)
stw r30, 148(SP)
stw r29, 152(SP)
stw r28, 156(SP)
stw r27, 160(SP)
stw r26, 164(SP)
stw r25, 168(SP)
stw r24, 172(SP)
stw r23, 176(SP)
stw r22, 180(SP)
stw r21, 184(SP)
stw r20, 188(SP)
stw r19, 192(SP)
stw r18, 196(SP)
stw r17, 200(SP)
stw r16, 204(SP)
stw r15, 208(SP)
stw r14, 212(SP)
stw r13, 216(SP)
#endif
// stfd f1, ALPHA_SP
// stw r0, FZERO
// On 32-bit AIX/Darwin with DOUBLE, LDC arrives on the stack.
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
#endif
// Convert LDC from an element count to a byte stride.
slwi LDC, LDC, BASE_SHIFT
// TRMM only: fetch the OFFSET parameter from the caller's frame.
#if defined(TRMMKERNEL)
#if defined(linux) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#else
#ifdef DOUBLE
lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#else
lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
#endif
#endif
#endif
mr KK, OFFSET
#if defined(TRMMKERNEL) && !defined(LEFT)
neg KK, KK
#endif
// Nothing to do when any problem dimension is zero.
cmpwi cr0, M, 0
ble L999_H1
cmpwi cr0, N, 0
ble L999_H1
cmpwi cr0, K, 0
ble L999_H1
// Constant offset registers used by the vector loads/stores in the logic file.
li PRE, 256
li o4 , 4
li o8 , 8
li o12, 12
li o16, 16
li o32, 32
li o48, 48
addi TBUFFER, SP, 320
// Broadcast alpha (f1) into four consecutive words at SP+300, then
// reload it both as a scalar (alpha_r) and as a 4-float vector (alpha_vr).
addi T1, SP, 300
stxsspx f1, o0 , T1
stxsspx f1, o4 , T1
stxsspx f1, o8 , T1
stxsspx f1, o12 , T1
lxsspx alpha_r, o0, T1
lxvw4x alpha_vr, o0, T1
// The actual 16x8 TRMM compute loops live in the included logic file.
#include "strmm_logic_16x8_power8.S"
L999:
// Return 0 and restore all saved FPRs/GPRs before releasing the frame.
addi r3, 0, 0
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
#ifdef __64BIT__
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
ld r13, 288(SP)
#else
lwz r31, 144(SP)
lwz r30, 148(SP)
lwz r29, 152(SP)
lwz r28, 156(SP)
lwz r27, 160(SP)
lwz r26, 164(SP)
lwz r25, 168(SP)
lwz r24, 172(SP)
lwz r23, 176(SP)
lwz r22, 180(SP)
lwz r21, 184(SP)
lwz r20, 188(SP)
lwz r19, 192(SP)
lwz r18, 196(SP)
lwz r17, 200(SP)
lwz r16, 204(SP)
lwz r15, 208(SP)
lwz r14, 212(SP)
lwz r13, 216(SP)
#endif
addi SP, SP, STACKSIZE
blr
EPILOGUE
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

149
kernel/power/zasum.c Normal file
View File

@ -0,0 +1,149 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/28 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#if defined(POWER8)
#include "zasum_microk_power8.c"
#endif
#ifndef HAVE_KERNEL_8
/* Scalar fallback for complex ASUM: accumulates |re| + |im| over the
 * first n complex elements of x1 (2*n FLOATs) into svec[0]; svec[1] is
 * cleared.  n is assumed to be a multiple of 4; four complex values
 * (eight scalars) are folded per pass into four partial sums to expose
 * instruction-level parallelism. */
static void zasum_kernel_8(BLASLONG n, FLOAT *x1, FLOAT *svec)
{
	BLASLONG k;
	FLOAT *p = x1;
	FLOAT acc0 = 0.0;
	FLOAT acc1 = 0.0;
	FLOAT acc2 = 0.0;
	FLOAT acc3 = 0.0;

	for ( k = 0; k < n; k += 4 )
	{
		acc0 += ABS(p[0]);
		acc1 += ABS(p[1]);
		acc2 += ABS(p[2]);
		acc3 += ABS(p[3]);
		acc0 += ABS(p[4]);
		acc1 += ABS(p[5]);
		acc2 += ABS(p[6]);
		acc3 += ABS(p[7]);
		p += 8;
	}

	svec[0] = acc0 + acc1 + acc2 + acc3;
	svec[1] = 0.0;
}
#endif
/* Complex ASUM: returns sum over the first n complex elements of
 * |Re x_k| + |Im x_k| (the BLAS 1-norm surrogate for complex vectors).
 * Returns 0.0 for n <= 0 or inc_x <= 0.
 * Fix: removed a stray second semicolon after the svec declaration. */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	BLASLONG i = 0;
	BLASLONG ip = 0;                               /* scalar index: two FLOATs per complex element */
	FLOAT sumf = 0.0;
	FLOAT svec[2] __attribute__ ((aligned (16)));  /* result slot for the vector kernel */
	BLASLONG n1;
	BLASLONG inc_x2;

	if (n <= 0 || inc_x <= 0) return(sumf);

	if ( inc_x == 1 )
	{
		/* contiguous case: multiple-of-8 head goes through the kernel */
		n1 = n & -8;
		if ( n1 > 0 )
		{
			zasum_kernel_8(n1, x, svec);
			sumf = svec[0] + svec[1];
			i = n1;
			ip = 2 * n1;
		}

		/* scalar tail */
		while(i < n)
		{
			sumf += ABS(x[ip]) + ABS(x[ip+1]);
			i++;
			ip += 2;
		}
	}
	else
	{
		/* strided case: step by 2*inc_x FLOATs per complex element */
		inc_x2 = 2 * inc_x;

		while(i < n)
		{
			sumf += ABS(x[ip]) + ABS(x[ip+1]);
			ip += inc_x2;
			i++;
		}
	}
	return(sumf);
}

View File

@ -0,0 +1,177 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/28 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#define HAVE_KERNEL_8 1
// VSX complex-ASUM kernel: svec[0..1] <- partial sums of |component| over
// the first n complex (double) elements of x.  n is assumed to be a
// positive multiple of 8; each pass consumes 8 complex values (16 doubles,
// 128 bytes) into eight accumulators (vs32-vs39), software-pipelining the
// loads against the abs/add work of the previous block.  The caller adds
// svec[0] + svec[1] (the two lanes of vs32) for the final result.
static void zasum_kernel_8( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline));
static void zasum_kernel_8( BLASLONG n, FLOAT *x, FLOAT *svec)
{
BLASLONG i = n;          // elements remaining; decremented by 8 per pass
BLASLONG o16 = 16;       // byte offsets for the indexed vector loads
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x;
BLASLONG pre = 384;      // prefetch distance in bytes for dcbt
__asm__ __volatile__
(
"dcbt %2 , %4 \n\t"            // prefetch ahead of the stream
"xxlxor 32,32,32 \n\t"         // clear the eight accumulators
"xxlxor 33,33,33 \n\t"
"xxlxor 34,34,34 \n\t"
"xxlxor 35,35,35 \n\t"
"xxlxor 36,36,36 \n\t"
"xxlxor 37,37,37 \n\t"
"xxlxor 38,38,38 \n\t"
"xxlxor 39,39,39 \n\t"
"lxvd2x 40, 0, %2 \n\t"        // prime vs40-vs47 with the first 128 bytes
"lxvd2x 41, %5, %2 \n\t"
"lxvd2x 42, %6, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t"
"lxvd2x 44, %8, %2 \n\t"
"lxvd2x 45, %9, %2 \n\t"
"lxvd2x 46, %10, %2 \n\t"
"lxvd2x 47, %11, %2 \n\t"
"addi %2, %2, 128 \n\t"
"addic. %0 , %0 , -8 \n\t"     // i -= 8; skip loop if that was the only block
"ble 2f \n\t"
".align 5 \n\t"
"1: \n\t"
"dcbt %2 , %4 \n\t"
"xvabsdp 48, 40 \n\t"          // |previous block| while the next one loads
"xvabsdp 49, 41 \n\t"
"xvabsdp 50, 42 \n\t"
"xvabsdp 51, 43 \n\t"
"lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %5, %2 \n\t"
"xvabsdp 52, 44 \n\t"
"xvabsdp 53, 45 \n\t"
"lxvd2x 42, %6, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t"
"xvabsdp 54, 46 \n\t"
"xvabsdp 55, 47 \n\t"
"lxvd2x 44, %8, %2 \n\t"
"lxvd2x 45, %9, %2 \n\t"
"xvadddp 32, 32, 48 \n\t"      // accumulate the abs values
"xvadddp 33, 33, 49 \n\t"
"lxvd2x 46, %10, %2 \n\t"
"lxvd2x 47, %11, %2 \n\t"
"xvadddp 34, 34, 50 \n\t"
"xvadddp 35, 35, 51 \n\t"
"addi %2, %2, 128 \n\t"
"xvadddp 36, 36, 52 \n\t"
"xvadddp 37, 37, 53 \n\t"
"addic. %0 , %0 , -8 \n\t"
"xvadddp 38, 38, 54 \n\t"
"xvadddp 39, 39, 55 \n\t"
"bgt 1b \n\t"
"2: \n\t"
"xvabsdp 48, 40 \n\t"          // drain the final in-flight block
"xvabsdp 49, 41 \n\t"
"xvabsdp 50, 42 \n\t"
"xvabsdp 51, 43 \n\t"
"xvabsdp 52, 44 \n\t"
"xvabsdp 53, 45 \n\t"
"xvabsdp 54, 46 \n\t"
"xvabsdp 55, 47 \n\t"
"xvadddp 32, 32, 48 \n\t"
"xvadddp 33, 33, 49 \n\t"
"xvadddp 34, 34, 50 \n\t"
"xvadddp 35, 35, 51 \n\t"
"xvadddp 36, 36, 52 \n\t"
"xvadddp 37, 37, 53 \n\t"
"xvadddp 38, 38, 54 \n\t"
"xvadddp 39, 39, 55 \n\t"
"xvadddp 32, 32, 33 \n\t"      // tree-reduce the eight accumulators into vs32
"xvadddp 34, 34, 35 \n\t"
"xvadddp 36, 36, 37 \n\t"
"xvadddp 38, 38, 39 \n\t"
"xvadddp 32, 32, 34 \n\t"
"xvadddp 36, 36, 38 \n\t"
"xvadddp 32, 32, 36 \n\t"
"stxvd2x 32, 0, %3 \n\t"       // both lanes out to svec[0..1]
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x1), // 2
"r" (svec), // 3
"r" (pre), // 4
"r" (o16), // 5
"r" (o32), // 6
"r" (o48), // 7
"r" (o64), // 8
"r" (o80), // 9
"r" (o96), // 10
"r" (o112) // 11
: "cr0", "%0", "%2", "memory"
);
}

140
kernel/power/zaxpy.c Normal file
View File

@ -0,0 +1,140 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/23 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#include "common.h"
#if defined(POWER8)
#include "zaxpy_microk_power8.c"
#endif
#ifndef HAVE_KERNEL_4
/* Scalar fallback for complex AXPY: y += alpha * x over the first n
 * complex elements, where alpha = (alpha[0], alpha[1]).  With CONJ the
 * conjugate of alpha is applied.  n is assumed to be a multiple of 2;
 * two complex elements (four scalars) are processed per pass. */
static void zaxpy_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
	BLASLONG k;
	FLOAT ar = alpha[0];
	FLOAT ai = alpha[1];

	for ( k = 0; k < 2 * n; k += 4 )
	{
#if !defined(CONJ)
		y[k]   += ( ar * x[k]   - ai * x[k+1] ) ;
		y[k+1] += ( ar * x[k+1] + ai * x[k]   ) ;
		y[k+2] += ( ar * x[k+2] - ai * x[k+3] ) ;
		y[k+3] += ( ar * x[k+3] + ai * x[k+2] ) ;
#else
		y[k]   += ( ar * x[k]   + ai * x[k+1] ) ;
		y[k+1] -= ( ar * x[k+1] - ai * x[k]   ) ;
		y[k+2] += ( ar * x[k+2] + ai * x[k+3] ) ;
		y[k+3] -= ( ar * x[k+3] - ai * x[k+2] ) ;
#endif
	}
}
#endif
/* Complex AXPY entry point: y += (da_r + i*da_i) * x (conjugated alpha
 * when CONJ is defined).  Dummy arguments keep the common OpenBLAS
 * kernel signature.  Always returns 0. */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG i;
	BLASLONG ix, iy;
	FLOAT da[4];

	if ( n <= 0 ) return(0);

	if ( (inc_x == 1) && (inc_y == 1) )
	{
		/* contiguous case: multiple-of-16 head through the vector kernel */
		BLASLONG n1 = n & -16;
		if ( n1 )
		{
			da[0] = da_r;
			da[1] = da_r;
			da[2] = da_i;
			da[3] = da_i;
			zaxpy_kernel_4(n1, x, y, da);
		}

		/* scalar tail; ix indexes FLOATs, two per complex element */
		for ( i = n1, ix = 2 * n1; i < n; i++, ix += 2 )
		{
#if !defined(CONJ)
			y[ix]   += ( da_r * x[ix]   - da_i * x[ix+1] ) ;
			y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix]   ) ;
#else
			y[ix]   += ( da_r * x[ix]   + da_i * x[ix+1] ) ;
			y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix]   ) ;
#endif
		}
		return(0);
	}

	/* strided case: convert increments to FLOAT units (two per element) */
	inc_x *= 2;
	inc_y *= 2;
	ix = 0;
	iy = 0;

	for ( i = 0; i < n; i++ )
	{
#if !defined(CONJ)
		y[iy]   += ( da_r * x[ix]   - da_i * x[ix+1] ) ;
		y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix]   ) ;
#else
		y[iy]   += ( da_r * x[ix]   + da_i * x[ix+1] ) ;
		y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix]   ) ;
#endif
		ix += inc_x;
		iy += inc_y;
	}
	return(0);
}

View File

@ -0,0 +1,250 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/23 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#define HAVE_KERNEL_4 1
// VSX complex-AXPY kernel: y += alpha * x over the first n complex
// (double) elements; with CONJ, conjugate(alpha) is applied.  n is
// assumed to be a positive multiple of 8; eight complex values (128
// bytes) are processed per pass, with loads software-pipelined against
// the multiply-adds of the previous block.
//
// Complex multiply trick: each y vector gets alpha_r*(x_r, x_i) plus
// alpha_i*(x_i, x_r) (the swapped copy), where mvec = {-1,1} (or {1,-1}
// for CONJ) pre-negates one lane of alpha_i (or alpha_r) so a plain
// xvmaddadp yields the +/- pattern of complex multiplication.
//
// y2 = y+1 is rewound by 8 bytes in the asm ("addi %8, -8"), giving an
// independent store pointer (%8) that trails the load pointer (%3).
static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
BLASLONG i = n;          // elements remaining; decremented by 8 per pass
BLASLONG o16 = 16;       // byte offsets for the indexed loads/stores
BLASLONG o32 = 32;
BLASLONG o48 = 48;
FLOAT *x1=x;
FLOAT *y1=y;
FLOAT *y2=y+1;           // rewound by 8 bytes in the asm -> equals y
BLASLONG pre = 384;      // prefetch distance in bytes for dcbt
#if !defined(CONJ)
FLOAT mvec[2] = { -1.0, 1.0 };
#else
FLOAT mvec[2] = { 1.0, -1.0 };
#endif
__asm__ __volatile__
(
"lxsdx 34, 0 , %4 \n\t" // alpha_r
"lxsdx 35, %5, %4 \n\t" // alpha_i
"xxspltd 32, 34, 0 \n\t"       // vs32 = (alpha_r, alpha_r)
"xxspltd 33, 35, 0 \n\t"       // vs33 = (alpha_i, alpha_i)
"lxvd2x 36, 0, %9 \n\t" // mvec
#if !defined(CONJ)
"xvmuldp 33, 33 , 36 \n\t" // alpha_i * mvec
#else
"xvmuldp 32, 32 , 36 \n\t" // alpha_r * mvec
#endif
"addi %8, %8, -8 \n\t"         // %8 = y2 - 8 bytes = y (store pointer)
"dcbt %2, %10 \n\t"
"dcbt %3, %10 \n\t"
"lxvd2x 40, 0, %2 \n\t" // x0
"lxvd2x 41, %5, %2 \n\t" // x1
"lxvd2x 42, %6, %2 \n\t" // x2
"lxvd2x 43, %7, %2 \n\t" // x3
"lxvd2x 48, 0, %3 \n\t" // y0
"lxvd2x 49, %5, %3 \n\t" // y1
"lxvd2x 50, %6, %3 \n\t" // y2
"lxvd2x 51, %7, %3 \n\t" // y3
"xxswapd 56, 40 \n\t" // exchange real and imag part
"xxswapd 57, 41 \n\t" // exchange real and imag part
"xxswapd 58, 42 \n\t" // exchange real and imag part
"xxswapd 59, 43 \n\t" // exchange real and imag part
"addi %2, %2, 64 \n\t"
"addi %3, %3, 64 \n\t"
"lxvd2x 44, 0, %2 \n\t" // x4
"lxvd2x 45, %5, %2 \n\t" // x5
"lxvd2x 46, %6, %2 \n\t" // x6
"lxvd2x 47, %7, %2 \n\t" // x7
"lxvd2x 52, 0, %3 \n\t" // y4
"lxvd2x 53, %5, %3 \n\t" // y5
"lxvd2x 54, %6, %3 \n\t" // y6
"lxvd2x 55, %7, %3 \n\t" // y7
"xxswapd 60, 44 \n\t" // exchange real and imag part
"xxswapd 61, 45 \n\t" // exchange real and imag part
"xxswapd 62, 46 \n\t" // exchange real and imag part
"xxswapd 63, 47 \n\t" // exchange real and imag part
"addi %2, %2, 64 \n\t"
"addi %3, %3, 64 \n\t"
"addic. %0 , %0 , -8 \n\t"     // i -= 8; skip loop if only one block
"ble 2f \n\t"
".align 5 \n\t"
"1: \n\t"
"dcbt %2, %10 \n\t"
"dcbt %3, %10 \n\t"
"xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
"xvmaddadp 49, 41, 32 \n\t"
"lxvd2x 40, 0, %2 \n\t" // x0
"lxvd2x 41, %5, %2 \n\t" // x1
"xvmaddadp 50, 42, 32 \n\t"
"xvmaddadp 51, 43, 32 \n\t"
"lxvd2x 42, %6, %2 \n\t" // x2
"lxvd2x 43, %7, %2 \n\t" // x3
"xvmaddadp 52, 44, 32 \n\t"
"addi %2, %2, 64 \n\t"
"xvmaddadp 53, 45, 32 \n\t"
"lxvd2x 44, 0, %2 \n\t" // x4
"lxvd2x 45, %5, %2 \n\t" // x5
"xvmaddadp 54, 46, 32 \n\t"
"xvmaddadp 55, 47, 32 \n\t"
"lxvd2x 46, %6, %2 \n\t" // x6
"lxvd2x 47, %7, %2 \n\t" // x7
"xvmaddadp 48, 56, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r
"addi %2, %2, 64 \n\t"
"xvmaddadp 49, 57, 33 \n\t"
"xvmaddadp 50, 58, 33 \n\t"
"xvmaddadp 51, 59, 33 \n\t"
"xvmaddadp 52, 60, 33 \n\t"
"xvmaddadp 53, 61, 33 \n\t"
"xvmaddadp 54, 62, 33 \n\t"
"xvmaddadp 55, 63, 33 \n\t"
"stxvd2x 48, 0, %8 \n\t"       // write back the eight updated y vectors
"stxvd2x 49, %5, %8 \n\t"
"stxvd2x 50, %6, %8 \n\t"
"stxvd2x 51, %7, %8 \n\t"
"addi %8, %8, 64 \n\t"
"stxvd2x 52, 0, %8 \n\t"
"stxvd2x 53, %5, %8 \n\t"
"stxvd2x 54, %6, %8 \n\t"
"stxvd2x 55, %7, %8 \n\t"
"addi %8, %8, 64 \n\t"
"xxswapd 56, 40 \n\t" // exchange real and imag part
"xxswapd 57, 41 \n\t" // exchange real and imag part
"lxvd2x 48, 0, %3 \n\t" // y0
"lxvd2x 49, %5, %3 \n\t" // y1
"xxswapd 58, 42 \n\t" // exchange real and imag part
"xxswapd 59, 43 \n\t" // exchange real and imag part
"lxvd2x 50, %6, %3 \n\t" // y2
"lxvd2x 51, %7, %3 \n\t" // y3
"xxswapd 60, 44 \n\t" // exchange real and imag part
"addi %3, %3, 64 \n\t"
"xxswapd 61, 45 \n\t" // exchange real and imag part
"lxvd2x 52, 0, %3 \n\t" // y4
"lxvd2x 53, %5, %3 \n\t" // y5
"xxswapd 62, 46 \n\t" // exchange real and imag part
"xxswapd 63, 47 \n\t" // exchange real and imag part
"lxvd2x 54, %6, %3 \n\t" // y6
"lxvd2x 55, %7, %3 \n\t" // y7
"addi %3, %3, 64 \n\t"
"addic. %0 , %0 , -8 \n\t"
"bgt 1b \n\t"
"2: \n\t"
"xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
"xvmaddadp 49, 41, 32 \n\t"
"xvmaddadp 50, 42, 32 \n\t"
"xvmaddadp 51, 43, 32 \n\t"
"xvmaddadp 52, 44, 32 \n\t"
"xvmaddadp 53, 45, 32 \n\t"
"xvmaddadp 54, 46, 32 \n\t"
"xvmaddadp 55, 47, 32 \n\t"
"xvmaddadp 48, 56, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r
"xvmaddadp 49, 57, 33 \n\t"
"xvmaddadp 50, 58, 33 \n\t"
"xvmaddadp 51, 59, 33 \n\t"
"xvmaddadp 52, 60, 33 \n\t"
"xvmaddadp 53, 61, 33 \n\t"
"xvmaddadp 54, 62, 33 \n\t"
"xvmaddadp 55, 63, 33 \n\t"
"stxvd2x 48, 0, %8 \n\t"       // drain the final block
"stxvd2x 49, %5, %8 \n\t"
"stxvd2x 50, %6, %8 \n\t"
"stxvd2x 51, %7, %8 \n\t"
"addi %8, %8, 64 \n\t"
"stxvd2x 52, 0, %8 \n\t"
"stxvd2x 53, %5, %8 \n\t"
"stxvd2x 54, %6, %8 \n\t"
"stxvd2x 55, %7, %8 \n\t"
"addi %8, %8, 64 \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x1), // 2
"r" (y1), // 3
"r" (alpha), // 4
"r" (o16), // 5
"r" (o32), // 6
"r" (o48), // 7
"r" (y2), // 8
"r" (mvec), // 9
"r" (pre) // 10
: "cr0", "%0", "%2" , "%3", "%8", "memory"
);
}

140
kernel/power/zcopy.c Normal file
View File

@ -0,0 +1,140 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#include "common.h"
#if defined(POWER8)
#include "zcopy_microk_power8.c"
#endif
#ifndef HAVE_KERNEL_16
static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
{
	/* Plain-C fallback for the vectorized copy kernel: copies n
	 * double-complex elements (2*n scalars) from x to y.
	 * Processes 4 complex elements (8 scalars) per pass, so n is
	 * assumed to be a multiple of 4 — the caller passes n & -16.
	 */
	FLOAT *src = x;
	FLOAT *dst = y;
	FLOAT tmp[8];
	BLASLONG done = 0;
	BLASLONG k;

	while (done < n)
	{
		/* Read the whole 8-scalar chunk before writing it, matching
		 * the load-all-then-store-all order of the original code. */
		for (k = 0; k < 8; k++)
			tmp[k] = src[k];
		for (k = 0; k < 8; k++)
			dst[k] = tmp[k];

		src += 8;
		dst += 8;
		done += 4;
	}

	return;
}
#endif
/* zcopy: y := x for n double-complex elements.
 * x/y are interleaved (re, im) arrays; inc_x/inc_y are strides counted in
 * complex elements. Returns 0 unconditionally (BLAS convention).
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	BLASLONG i = 0;
	BLASLONG ix = 0, iy = 0;

	if (n <= 0) return (0);

	if ((inc_x == 1) && (inc_y == 1))
	{
		/* Contiguous case: hand the largest multiple of 16 elements
		 * to the (possibly vectorized) bulk kernel, then finish the
		 * tail with scalar copies. */
		BLASLONG n1 = n & -16;

		if (n1 > 0)
		{
			zcopy_kernel_16(n1, x, y);
			i = n1;
			ix = n1 * 2;	/* scalar index: 2 FLOATs per complex element */
			iy = n1 * 2;
		}

		while (i < n)
		{
			/* Fix: real part read through ix (was x[iy]); ix == iy in
			 * this branch so results are unchanged, but this matches
			 * the imaginary-part read and the strided branch below. */
			y[iy]     = x[ix];
			y[iy + 1] = x[ix + 1];
			ix += 2;
			iy += 2;
			i++;
		}
	}
	else
	{
		/* Strided case: advance by 2*inc so the stride is counted in
		 * scalars rather than complex elements. */
		BLASLONG inc_x2 = 2 * inc_x;
		BLASLONG inc_y2 = 2 * inc_y;

		while (i < n)
		{
			y[iy]     = x[ix];
			y[iy + 1] = x[ix + 1];
			ix += inc_x2;
			iy += inc_y2;
			i++;
		}
	}

	return (0);
}

View File

@ -0,0 +1,174 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#define HAVE_KERNEL_16 1
static void zcopy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
/* POWER8 VSX bulk copy of n double-complex elements from x to y.
 * Uses lxvd2x/stxvd2x on 16-byte vectors; each 128-byte group moves
 * 8 complex elements, so the loop handles 16 per iteration and n is
 * assumed to be a multiple of 16 (caller passes n & -16).
 * NOTE(review): `alpha` is passed as operand %3 but never referenced in
 * the asm, and VSX regs 40-47/50-57 are modified without appearing in
 * the clobber list — presumably acceptable under the ABI assumptions
 * this project makes; verify before reuse elsewhere. */
static void zcopy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y)
{
/* loop counter, counted in complex elements */
BLASLONG i = n;
/* byte offsets used as index registers for the vector loads/stores */
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x;
FLOAT *y1=y;
/* prefetch distance in bytes (unused in this asm body) */
BLASLONG pre = 384;
BLASLONG alpha=0;
__asm__ __volatile__
(
/* prologue: preload 256 bytes (16 complex elements) from x */
"lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %5, %2 \n\t"
"lxvd2x 42, %6, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t"
"lxvd2x 44, %8, %2 \n\t"
"lxvd2x 45, %9, %2 \n\t"
"lxvd2x 46, %10, %2 \n\t"
"lxvd2x 47, %11, %2 \n\t"
"addi %2, %2, 128 \n\t"
"lxvd2x 50, 0, %2 \n\t"
"lxvd2x 51, %5, %2 \n\t"
"lxvd2x 52, %6, %2 \n\t"
"lxvd2x 53, %7, %2 \n\t"
"lxvd2x 54, %8, %2 \n\t"
"lxvd2x 55, %9, %2 \n\t"
"lxvd2x 56, %10, %2 \n\t"
"lxvd2x 57, %11, %2 \n\t"
"addi %2, %2, 128 \n\t"
"addic. %0 , %0 , -16 \n\t"
"ble 2f \n\t"
".align 5 \n\t"
"1: \n\t"
/* steady state: store the previously loaded group while loading the
   next one, interleaved to hide load latency */
"stxvd2x 40, 0, %1 \n\t"
"stxvd2x 41, %5, %1 \n\t"
"lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %5, %2 \n\t"
"stxvd2x 42, %6, %1 \n\t"
"stxvd2x 43, %7, %1 \n\t"
"lxvd2x 42, %6, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t"
"stxvd2x 44, %8, %1 \n\t"
"stxvd2x 45, %9, %1 \n\t"
"lxvd2x 44, %8, %2 \n\t"
"lxvd2x 45, %9, %2 \n\t"
"stxvd2x 46, %10, %1 \n\t"
"stxvd2x 47, %11, %1 \n\t"
"lxvd2x 46, %10, %2 \n\t"
"lxvd2x 47, %11, %2 \n\t"
"addi %1, %1, 128 \n\t"
"addi %2, %2, 128 \n\t"
"stxvd2x 50, 0, %1 \n\t"
"stxvd2x 51, %5, %1 \n\t"
"lxvd2x 50, 0, %2 \n\t"
"lxvd2x 51, %5, %2 \n\t"
"stxvd2x 52, %6, %1 \n\t"
"stxvd2x 53, %7, %1 \n\t"
"lxvd2x 52, %6, %2 \n\t"
"lxvd2x 53, %7, %2 \n\t"
"stxvd2x 54, %8, %1 \n\t"
"stxvd2x 55, %9, %1 \n\t"
"lxvd2x 54, %8, %2 \n\t"
"lxvd2x 55, %9, %2 \n\t"
"stxvd2x 56, %10, %1 \n\t"
"stxvd2x 57, %11, %1 \n\t"
"lxvd2x 56, %10, %2 \n\t"
"lxvd2x 57, %11, %2 \n\t"
"addi %1, %1, 128 \n\t"
"addi %2, %2, 128 \n\t"
"addic. %0 , %0 , -16 \n\t"
"bgt 1b \n\t"
"2: \n\t"
/* epilogue: flush the final preloaded group to y */
"stxvd2x 40, 0, %1 \n\t"
"stxvd2x 41, %5, %1 \n\t"
"stxvd2x 42, %6, %1 \n\t"
"stxvd2x 43, %7, %1 \n\t"
"stxvd2x 44, %8, %1 \n\t"
"stxvd2x 45, %9, %1 \n\t"
"stxvd2x 46, %10, %1 \n\t"
"stxvd2x 47, %11, %1 \n\t"
"addi %1, %1, 128 \n\t"
"stxvd2x 50, 0, %1 \n\t"
"stxvd2x 51, %5, %1 \n\t"
"stxvd2x 52, %6, %1 \n\t"
"stxvd2x 53, %7, %1 \n\t"
"stxvd2x 54, %8, %1 \n\t"
"stxvd2x 55, %9, %1 \n\t"
"stxvd2x 56, %10, %1 \n\t"
"stxvd2x 57, %11, %1 \n\t"
:
:
"r" (i), // 0
"r" (y1), // 1
"r" (x1), // 2
"r" (alpha), // 3
"r" (pre), // 4
"r" (o16), // 5
"r" (o32), // 6
"r" (o48), // 7
"r" (o64), // 8
"r" (o80), // 9
"r" (o96), // 10
"r" (o112) // 11
: "cr0", "%0", "%2" , "%1", "memory"
);
}

167
kernel/power/zdot.c Normal file
View File

@ -0,0 +1,167 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/21 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#include "common.h"
#include <complex.h>
#if defined(POWER8)
#include "zdot_microk_power8.c"
#endif
#ifndef HAVE_KERNEL_8
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) __attribute__ ((noinline));
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
{
	/* Plain-C fallback for the vectorized dot kernel.
	 * Accumulates the four partial sums of a complex dot product over n
	 * complex elements of x and y (interleaved re/im) into d:
	 *   d[0] = sum re(x)*re(y)   d[1] = sum im(x)*im(y)
	 *   d[2] = sum re(x)*im(y)   d[3] = sum im(x)*re(y)
	 * Four complex pairs are consumed per outer pass, so n is assumed
	 * to be a multiple of 4 — the caller passes n & -8.
	 */
	FLOAT acc[4] = { 0.0, 0.0, 0.0, 0.0 };
	BLASLONG i = 0;
	BLASLONG j = 0;
	BLASLONG p;

	while (i < n)
	{
		/* one unrolled group of four complex pairs */
		for (p = 0; p < 4; p++)
		{
			acc[0] += x[j]     * y[j];
			acc[1] += x[j + 1] * y[j + 1];
			acc[2] += x[j]     * y[j + 1];
			acc[3] += x[j + 1] * y[j];
			j += 2;
		}
		i += 4;
	}

	d[0] = acc[0];
	d[1] = acc[1];
	d[2] = acc[2];
	d[3] = acc[3];
}
#endif
/* zdot: complex dot product of x and y over n double-complex elements.
 * Without CONJ: returns sum x[i]*y[i]; with CONJ: sum conj(x[i])*y[i].
 * inc_x/inc_y are strides counted in complex elements.
 */
FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	FLOAT _Complex result;
	/* partial sums: re*re, im*im, re*im, im*re */
	FLOAT sums[4] = { 0.0, 0.0, 0.0, 0.0 };

	if (n <= 0)
	{
		__real__ result = 0.0;
		__imag__ result = 0.0;
		return (result);
	}

	if ((inc_x == 1) && (inc_y == 1))
	{
		/* Contiguous: bulk kernel on the largest multiple of 8
		 * elements, scalar loop for the remainder. */
		BLASLONG n1 = n & -8;
		BLASLONG i  = n1;
		BLASLONG j  = n1 * 2;	/* scalar index: 2 FLOATs per element */

		if (n1)
			zdot_kernel_8(n1, x, y, sums);

		for (; i < n; i++, j += 2)
		{
			sums[0] += x[j]     * y[j];
			sums[1] += x[j + 1] * y[j + 1];
			sums[2] += x[j]     * y[j + 1];
			sums[3] += x[j + 1] * y[j];
		}
	}
	else
	{
		/* Strided: advance by 2*inc so strides count scalars. */
		BLASLONG step_x = inc_x << 1;
		BLASLONG step_y = inc_y << 1;
		BLASLONG ix = 0;
		BLASLONG iy = 0;
		BLASLONG i;

		for (i = 0; i < n; i++)
		{
			sums[0] += x[ix]     * y[iy];
			sums[1] += x[ix + 1] * y[iy + 1];
			sums[2] += x[ix]     * y[iy + 1];
			sums[3] += x[ix + 1] * y[iy];
			ix += step_x;
			iy += step_y;
		}
	}

#if !defined(CONJ)
	__real__ result = sums[0] - sums[1];
	__imag__ result = sums[2] + sums[3];
#else
	__real__ result = sums[0] + sums[1];
	__imag__ result = sums[2] - sums[3];
#endif

	return (result);
}

View File

@ -0,0 +1,219 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/21 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#define HAVE_KERNEL_8 1
static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
/* POWER8 VSX kernel for the complex dot product partial sums over n
 * double-complex elements. On exit dot[0..3] hold, as vector lanes of
 * vs32/vs33: sum re*re, sum im*im, sum re*im, sum im*re — combined by
 * the caller into the final complex result.
 * Eight complex elements are consumed per loop iteration, so n is
 * assumed to be a multiple of 8 (caller passes n & -8).
 * NOTE(review): VSX regs 32-63 are modified but do not appear in the
 * clobber list — presumably tolerated by this project's build
 * assumptions; verify before reusing this pattern. */
static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
{
/* loop counter, counted in complex elements */
BLASLONG i = n;
/* byte offsets used as index registers for the vector loads */
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
FLOAT *x1=x;
FLOAT *y1=y;
/* prefetch distance in bytes for the dcbt hints */
BLASLONG pre = 384;
__asm__ __volatile__
(
/* clear the eight accumulators vs32-vs39 */
"xxlxor 32,32,32 \n\t"
"xxlxor 33,33,33 \n\t"
"xxlxor 34,34,34 \n\t"
"xxlxor 35,35,35 \n\t"
"xxlxor 36,36,36 \n\t"
"xxlxor 37,37,37 \n\t"
"xxlxor 38,38,38 \n\t"
"xxlxor 39,39,39 \n\t"
"dcbt %2, %8 \n\t"
"dcbt %3, %8 \n\t"
/* prologue: preload 8 complex elements of x and y; y is also loaded
   swapped (xxswapd) so im/re products can use the same fma form */
"lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
"lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i
"lxvd2x 41, %5, %2 \n\t" // x1_r, x1_i
"lxvd2x 49, %5, %3 \n\t" // y1_r, y1_i
"lxvd2x 42, %6, %2 \n\t" // x2_r, x2_i
"lxvd2x 50, %6, %3 \n\t" // y2_r, y2_i
"lxvd2x 43, %7, %2 \n\t" // x3_r, x3_i
"lxvd2x 51, %7, %3 \n\t" // y3_r, y3_i
"xxswapd 52,48 \n\t" // y0_i, y0_r
"xxswapd 53,49 \n\t" // y1_i, y1_r
"xxswapd 54,50 \n\t" // y2_i, y2_r
"xxswapd 55,51 \n\t" // y3_i, y3_r
"addi %2, %2, 64 \n\t"
"addi %3, %3, 64 \n\t"
"lxvd2x 44, 0, %2 \n\t" // x0_r, x0_i
"lxvd2x 56, 0, %3 \n\t" // y0_r, y0_i
"lxvd2x 45, %5, %2 \n\t" // x1_r, x1_i
"lxvd2x 57, %5, %3 \n\t" // y1_r, y1_i
"lxvd2x 46, %6, %2 \n\t" // x2_r, x2_i
"lxvd2x 58, %6, %3 \n\t" // y2_r, y2_i
"lxvd2x 47, %7, %2 \n\t" // x3_r, x3_i
"lxvd2x 59, %7, %3 \n\t" // y3_r, y3_i
"xxswapd 60,56 \n\t" // y0_i, y0_r
"xxswapd 61,57 \n\t" // y1_i, y1_r
"xxswapd 62,58 \n\t" // y2_i, y2_r
"xxswapd 63,59 \n\t" // y3_i, y3_r
"addi %2, %2, 64 \n\t"
"addi %3, %3, 64 \n\t"
"addic. %0 , %0 , -8 \n\t"
"ble 2f \n\t"
".align 5 \n\t"
"1: \n\t"
/* steady state: fma the previously loaded group into the accumulators
   while the next group streams in */
"dcbt %2, %8 \n\t"
"dcbt %3, %8 \n\t"
"xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
"lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i
"xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i
"lxvd2x 49, %5, %3 \n\t" // y1_r, y1_i
"xvmaddadp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i
"lxvd2x 50, %6, %3 \n\t" // y2_r, y2_i
"xvmaddadp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i
"lxvd2x 51, %7, %3 \n\t" // y3_r, y3_i
"xvmaddadp 33, 40, 52 \n\t" // x0_r * y0_i , x0_i * y0_r
"lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
"xvmaddadp 35, 41, 53 \n\t" // x1_r * y1_i , x1_i * y1_r
"lxvd2x 41, %5, %2 \n\t" // x1_r, x1_i
"xvmaddadp 37, 42, 54 \n\t" // x2_r * y2_i , x2_i * y2_r
"lxvd2x 42, %6, %2 \n\t" // x2_r, x2_i
"xvmaddadp 39, 43, 55 \n\t" // x3_r * y3_i , x3_i * y3_r
"lxvd2x 43, %7, %2 \n\t" // x3_r, x3_i
"xxswapd 52,48 \n\t" // y0_i, y0_r
"xxswapd 53,49 \n\t" // y1_i, y1_r
"addi %2, %2, 64 \n\t"
"addi %3, %3, 64 \n\t"
"xxswapd 54,50 \n\t" // y2_i, y2_r
"xxswapd 55,51 \n\t" // y3_i, y3_r
"xvmaddadp 32, 44, 56 \n\t" // x0_r * y0_r , x0_i * y0_i
"lxvd2x 56, 0, %3 \n\t" // y0_r, y0_i
"xvmaddadp 34, 45, 57 \n\t" // x1_r * y1_r , x1_i * y1_i
"lxvd2x 57, %5, %3 \n\t" // y1_r, y1_i
"xvmaddadp 36, 46, 58 \n\t" // x2_r * y2_r , x2_i * y2_i
"lxvd2x 58, %6, %3 \n\t" // y2_r, y2_i
"xvmaddadp 38, 47, 59 \n\t" // x3_r * y3_r , x3_i * y3_i
"lxvd2x 59, %7, %3 \n\t" // y3_r, y3_i
"xvmaddadp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r
"lxvd2x 44, 0, %2 \n\t" // x0_r, x0_i
"xvmaddadp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r
"lxvd2x 45, %5, %2 \n\t" // x1_r, x1_i
"xvmaddadp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r
"lxvd2x 46, %6, %2 \n\t" // x2_r, x2_i
"xvmaddadp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r
"lxvd2x 47, %7, %2 \n\t" // x3_r, x3_i
"xxswapd 60,56 \n\t" // y0_i, y0_r
"xxswapd 61,57 \n\t" // y1_i, y1_r
"addi %2, %2, 64 \n\t"
"addi %3, %3, 64 \n\t"
"xxswapd 62,58 \n\t" // y2_i, y2_r
"xxswapd 63,59 \n\t" // y3_i, y3_r
"addic. %0 , %0 , -8 \n\t"
"bgt 1b \n\t"
"2: \n\t"
/* epilogue: fold in the final preloaded group, then reduce the eight
   accumulators into vs32 (re*re/im*im) and vs33 (re*im/im*re) */
"xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
"xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i
"xvmaddadp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i
"xvmaddadp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i
"xvmaddadp 33, 40, 52 \n\t" // x0_r * y0_i , x0_i * y0_r
"xvmaddadp 35, 41, 53 \n\t" // x1_r * y1_i , x1_i * y1_r
"xvmaddadp 37, 42, 54 \n\t" // x2_r * y2_i , x2_i * y2_r
"xvmaddadp 39, 43, 55 \n\t" // x3_r * y3_i , x3_i * y3_r
"xvmaddadp 32, 44, 56 \n\t" // x0_r * y0_r , x0_i * y0_i
"xvmaddadp 34, 45, 57 \n\t" // x1_r * y1_r , x1_i * y1_i
"xvmaddadp 36, 46, 58 \n\t" // x2_r * y2_r , x2_i * y2_i
"xvmaddadp 38, 47, 59 \n\t" // x3_r * y3_r , x3_i * y3_i
"xvmaddadp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r
"xvmaddadp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r
"xvmaddadp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r
"xvmaddadp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r
"xvadddp 32, 32, 34 \n\t"
"xvadddp 36, 36, 38 \n\t"
"xvadddp 33, 33, 35 \n\t"
"xvadddp 37, 37, 39 \n\t"
"xvadddp 32, 32, 36 \n\t"
"xvadddp 33, 33, 37 \n\t"
/* write the four partial sums out through the dot pointer (%4) */
"stxvd2x 32, 0, %4 \n\t"
"stxvd2x 33, %5, %4 \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x1), // 2
"r" (y1), // 3
"r" (dot), // 4
"r" (o16), // 5
"r" (o32), // 6
"r" (o48), // 7
"r" (pre) // 8
: "cr0", "%0", "%2" , "%3", "memory"
);
}

View File

@ -1,38 +1,3 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */ /* All rights reserved. */
@ -82,7 +47,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#ifdef __64BIT__ #ifdef __64BIT__
#define STACKSIZE 320 #define STACKSIZE 32000
#define ALPHA_R_SP 296(SP) #define ALPHA_R_SP 296(SP)
#define ALPHA_I_SP 304(SP) #define ALPHA_I_SP 304(SP)
#define FZERO 312(SP) #define FZERO 312(SP)
@ -133,11 +98,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define alpha_r vs30 #define alpha_r vs30
#define alpha_i vs31 #define alpha_i vs31
#define FRAMEPOINTER r12
#define BBUFFER r14
#define L r15 #define L r15
#define ALPHA r16 #define ALPHA r16
#define o24 r17 #define o24 r17
#define T2 r19 #define T2 r19
#define KK r20 #define BBO r20
#define o8 r21 #define o8 r21
#define I r22 #define I r22
#define J r23 #define J r23
@ -156,6 +126,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE
PROFCODE PROFCODE
mr FRAMEPOINTER, SP
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE addi SP, SP, -STACKSIZE
li r0, 0 li r0, 0
@ -200,6 +174,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
std r17, 256(SP) std r17, 256(SP)
std r16, 264(SP) std r16, 264(SP)
std r15, 272(SP) std r15, 272(SP)
std r14, 280(SP)
#else #else
stw r31, 144(SP) stw r31, 144(SP)
stw r30, 148(SP) stw r30, 148(SP)
@ -226,37 +201,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef linux #ifdef linux
#ifdef __64BIT__ #ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP) ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif #endif
#endif #endif
#if defined(_AIX) || defined(__APPLE__) #if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__ #ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP) ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#else #else
#ifdef DOUBLE #ifdef DOUBLE
lwz B, FRAMESLOT(0) + STACKSIZE(SP) lwz B, FRAMESLOT(0) + 0(FRAMEPOINTER)
lwz C, FRAMESLOT(1) + STACKSIZE(SP) lwz C, FRAMESLOT(1) + 0(FRAMEPOINTER)
lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) lwz LDC, FRAMESLOT(2) + 0(FRAMEPOINTER)
#else #else
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif #endif
#endif #endif
#endif #endif
#ifdef TRMMKERNEL #ifdef TRMMKERNEL
#if defined(linux) && defined(__64BIT__) #if defined(linux) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#endif #endif
#if defined(_AIX) || defined(__APPLE__) #if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__ #ifdef __64BIT__
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#else #else
#ifdef DOUBLE #ifdef DOUBLE
lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) lwz OFFSET, FRAMESLOT(3) + 0(FRAMEPOINTER)
#else #else
lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#endif #endif
#endif #endif
#endif #endif
@ -268,34 +243,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "zgemm_macros_8x2_power8.S" #include "zgemm_macros_8x2_power8.S"
cmpwi cr0, M, 0 cmpwi cr0, M, 0
ble .L999 ble L999
cmpwi cr0, N, 0 cmpwi cr0, N, 0
ble .L999 ble L999
cmpwi cr0, K, 0 cmpwi cr0, K, 0
ble .L999 ble L999
slwi LDC, LDC, ZBASE_SHIFT slwi LDC, LDC, ZBASE_SHIFT
li PRE, 256 li PRE, 384
li o8 , 8 li o8 , 8
li o16 , 16 li o16 , 16
li o24 , 24 li o24 , 24
li o32 , 32 li o32 , 32
li o48 , 48 li o48 , 48
addi BBUFFER, SP, 512+4096
li T1, -4096
and BBUFFER, BBUFFER, T1
#ifdef __64BIT__ #ifdef __64BIT__
addi ALPHA, SP, 296 addi ALPHA, SP, 296
#else #else
addi ALPHA, SP, 224 addi ALPHA, SP, 224
#endif #endif
lxvdsx alpha_r, 0, ALPHA lxsdx alpha_r, 0, ALPHA
lxvdsx alpha_i, o8, ALPHA lxsdx alpha_i, o8, ALPHA
.align 5 .align 4
#include "zgemm_logic_8x2_power8.S" #include "zgemm_logic_8x2_power8.S"
.L999: L999:
addi r3, 0, 0 addi r3, 0, 0
lfd f14, 0(SP) lfd f14, 0(SP)
@ -339,6 +318,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld r17, 256(SP) ld r17, 256(SP)
ld r16, 264(SP) ld r16, 264(SP)
ld r15, 272(SP) ld r15, 272(SP)
ld r14, 280(SP)
#else #else
lwz r31, 144(SP) lwz r31, 144(SP)
lwz r30, 148(SP) lwz r30, 148(SP)
@ -360,6 +340,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
addi SP, SP, STACKSIZE addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
blr blr

View File

@ -1,83 +1,111 @@
srawi. J, N, 1 srawi. J, N, 1
ble .LZGEMM_L2_END ble ZGEMM_L2_END
ZGEMM_L2_BEGIN:
mr BO, B
mr BBO, BBUFFER
slwi T1, K, 1
ZGEMM_L2_COPYB:
lxvdsx vs4, o0, BO // b0_r
lxvdsx vs5, o8, BO // b0_i
addi BO, BO, 16
stxvd2x vs4, o0, BBO
stxvd2x vs5, o16, BBO
addic. T1, T1, -1
addi BBO, BBO, 32
bge ZGEMM_L2_COPYB
.LZGEMM_L2_BEGIN:
mr CO, C mr CO, C
mr AO, A mr AO, A
slwi T1, LDC , 1 slwi T1, LDC , 1
add C, C, T1 add C, C, T1
srawi. I, M, 3 srawi. I, M, 3
ble .LZGEMM_L2x8_END ble ZGEMM_L2x8_END
.LZGEMM_L2x8_BEGIN: ZGEMM_L2x8_BEGIN:
mr BO, B mr BO, BBUFFER
srawi. L, K, 3 srawi. L, K, 3
ble .LZGEMM_L2x8_SUB0 ble ZGEMM_L2x8_SUB0
cmpwi cr0, L, 1 cmpwi cr0, L, 1
ble .LZGEMM_L2x8_SUB4 ble ZGEMM_L2x8_SUB4
.LZGEMM_L2x8_LOOP_START: ZGEMM_L2x8_LOOP_START:
dcbt AO, PRE dcbt AO, PRE
dcbt BO, PRE
LOAD2x8_1 LOAD2x8_1
dcbt AO, PRE dcbt AO, PRE
KERNEL2x8_I1 KERNEL2x8_I1
dcbt AO, PRE dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2 KERNEL2x8_2
dcbt AO, PRE dcbt AO, PRE
KERNEL2x8_1 KERNEL2x8_1
dcbt AO, PRE dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2 KERNEL2x8_2
dcbt AO, PRE dcbt AO, PRE
KERNEL2x8_1 KERNEL2x8_1
dcbt AO, PRE dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2 KERNEL2x8_2
dcbt AO, PRE dcbt AO, PRE
KERNEL2x8_1 KERNEL2x8_1
dcbt AO, PRE dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2 KERNEL2x8_2
addic. L, L, -2 addic. L, L, -2
ble .LZGEMM_L2x8_LOOP_END ble ZGEMM_L2x8_LOOP_END
.align 5 .align 5
.LZGEMM_L2x8_LOOP: ZGEMM_L2x8_LOOP:
dcbt AO, PRE dcbt AO, PRE
KERNEL2x8_1 KERNEL2x8_1
dcbt AO, PRE dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2 KERNEL2x8_2
dcbt AO, PRE dcbt AO, PRE
KERNEL2x8_1 KERNEL2x8_1
dcbt AO, PRE dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2 KERNEL2x8_2
dcbt AO, PRE dcbt AO, PRE
KERNEL2x8_1 KERNEL2x8_1
dcbt AO, PRE dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2 KERNEL2x8_2
dcbt AO, PRE dcbt AO, PRE
KERNEL2x8_1 KERNEL2x8_1
dcbt AO, PRE dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2 KERNEL2x8_2
addic. L, L, -1 addic. L, L, -1
bgt .LZGEMM_L2x8_LOOP bgt ZGEMM_L2x8_LOOP
.LZGEMM_L2x8_LOOP_END: ZGEMM_L2x8_LOOP_END:
dcbt AO, PRE dcbt AO, PRE
KERNEL2x8_1 KERNEL2x8_1
dcbt AO, PRE dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2 KERNEL2x8_2
dcbt AO, PRE dcbt AO, PRE
KERNEL2x8_1 KERNEL2x8_1
dcbt AO, PRE dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2 KERNEL2x8_2
dcbt AO, PRE dcbt AO, PRE
@ -88,9 +116,9 @@
KERNEL2x8_1 KERNEL2x8_1
KERNEL2x8_E2 KERNEL2x8_E2
b .LZGEMM_L2x8_SUB1 b ZGEMM_L2x8_SUB1
.LZGEMM_L2x8_SUB4: ZGEMM_L2x8_SUB4:
dcbt AO, PRE dcbt AO, PRE
KERNEL2x8_SUBI1 KERNEL2x8_SUBI1
@ -106,53 +134,53 @@
KERNEL2x8_SUB1 KERNEL2x8_SUB1
KERNEL2x8_SUB1 KERNEL2x8_SUB1
b .LZGEMM_L2x8_SUB1 b ZGEMM_L2x8_SUB1
.LZGEMM_L2x8_SUB0: ZGEMM_L2x8_SUB0:
andi. L, K, 7 andi. L, K, 7
KERNEL2x8_SUBI1 KERNEL2x8_SUBI1
addic. L, L, -1 addic. L, L, -1
ble .LZGEMM_L2x8_SAVE ble ZGEMM_L2x8_SAVE
b .LZGEMM_L2x8_SUB2 b ZGEMM_L2x8_SUB2
.LZGEMM_L2x8_SUB1: ZGEMM_L2x8_SUB1:
andi. L, K, 7 andi. L, K, 7
ble .LZGEMM_L2x8_SAVE ble ZGEMM_L2x8_SAVE
.LZGEMM_L2x8_SUB2: ZGEMM_L2x8_SUB2:
KERNEL2x8_SUB1 KERNEL2x8_SUB1
addic. L, L, -1 addic. L, L, -1
bgt .LZGEMM_L2x8_SUB2 bgt ZGEMM_L2x8_SUB2
.LZGEMM_L2x8_SAVE: ZGEMM_L2x8_SAVE:
SAVE2x8 SAVE2x8
addic. I, I, -1 addic. I, I, -1
bgt .LZGEMM_L2x8_BEGIN bgt ZGEMM_L2x8_BEGIN
.LZGEMM_L2x8_END: ZGEMM_L2x8_END:
.LZGEMM_L2x4_BEGIN: ZGEMM_L2x4_BEGIN:
andi. T2, M, 7 andi. T2, M, 7
ble .LZGEMM_L2x1_END ble ZGEMM_L2x1_END
andi. T1, M, 4 andi. T1, M, 4
ble .LZGEMM_L2x4_END ble ZGEMM_L2x4_END
mr BO, B mr BO, BBUFFER
srawi. L, K, 3 srawi. L, K, 3
ble .LZGEMM_L2x4_SUB0 ble ZGEMM_L2x4_SUB0
cmpwi cr0, L, 1 cmpwi cr0, L, 1
ble .LZGEMM_L2x4_SUB4 ble ZGEMM_L2x4_SUB4
.LZGEMM_L2x4_LOOP_START: ZGEMM_L2x4_LOOP_START:
LOAD2x4_1 LOAD2x4_1
KERNEL2x4_I1 KERNEL2x4_I1
@ -166,11 +194,11 @@
KERNEL2x4_2 KERNEL2x4_2
addic. L, L, -2 addic. L, L, -2
ble .LZGEMM_L2x4_LOOP_END ble ZGEMM_L2x4_LOOP_END
.align 5 .align 5
.LZGEMM_L2x4_LOOP: ZGEMM_L2x4_LOOP:
KERNEL2x4_1 KERNEL2x4_1
KERNEL2x4_2 KERNEL2x4_2
@ -183,9 +211,9 @@
KERNEL2x4_2 KERNEL2x4_2
addic. L, L, -1 addic. L, L, -1
bgt .LZGEMM_L2x4_LOOP bgt ZGEMM_L2x4_LOOP
.LZGEMM_L2x4_LOOP_END: ZGEMM_L2x4_LOOP_END:
KERNEL2x4_1 KERNEL2x4_1
KERNEL2x4_2 KERNEL2x4_2
@ -197,9 +225,9 @@
KERNEL2x4_1 KERNEL2x4_1
KERNEL2x4_E2 KERNEL2x4_E2
b .LZGEMM_L2x4_SUB1 b ZGEMM_L2x4_SUB1
.LZGEMM_L2x4_SUB4: ZGEMM_L2x4_SUB4:
KERNEL2x4_SUBI1 KERNEL2x4_SUBI1
KERNEL2x4_SUB1 KERNEL2x4_SUB1
@ -211,48 +239,48 @@
KERNEL2x4_SUB1 KERNEL2x4_SUB1
KERNEL2x4_SUB1 KERNEL2x4_SUB1
b .LZGEMM_L2x4_SUB1 b ZGEMM_L2x4_SUB1
.LZGEMM_L2x4_SUB0: ZGEMM_L2x4_SUB0:
andi. L, K, 7 andi. L, K, 7
KERNEL2x4_SUBI1 KERNEL2x4_SUBI1
addic. L, L, -1 addic. L, L, -1
ble .LZGEMM_L2x4_SAVE ble ZGEMM_L2x4_SAVE
b .LZGEMM_L2x4_SUB2 b ZGEMM_L2x4_SUB2
.LZGEMM_L2x4_SUB1: ZGEMM_L2x4_SUB1:
andi. L, K, 7 andi. L, K, 7
ble .LZGEMM_L2x4_SAVE ble ZGEMM_L2x4_SAVE
.LZGEMM_L2x4_SUB2: ZGEMM_L2x4_SUB2:
KERNEL2x4_SUB1 KERNEL2x4_SUB1
addic. L, L, -1 addic. L, L, -1
bgt .LZGEMM_L2x4_SUB2 bgt ZGEMM_L2x4_SUB2
.LZGEMM_L2x4_SAVE: ZGEMM_L2x4_SAVE:
SAVE2x4 SAVE2x4
.LZGEMM_L2x4_END: ZGEMM_L2x4_END:
.LZGEMM_L2x2_BEGIN: ZGEMM_L2x2_BEGIN:
andi. T1, M, 2 andi. T1, M, 2
ble .LZGEMM_L2x2_END ble ZGEMM_L2x2_END
mr BO, B mr BO, BBUFFER
srawi. L, K, 3 srawi. L, K, 3
ble .LZGEMM_L2x2_SUB0 ble ZGEMM_L2x2_SUB0
cmpwi cr0, L, 1 cmpwi cr0, L, 1
ble .LZGEMM_L2x2_SUB4 ble ZGEMM_L2x2_SUB4
.LZGEMM_L2x2_LOOP_START: ZGEMM_L2x2_LOOP_START:
LOAD2x2_1 LOAD2x2_1
KERNEL2x2_I1 KERNEL2x2_I1
@ -266,11 +294,11 @@
KERNEL2x2_2 KERNEL2x2_2
addic. L, L, -2 addic. L, L, -2
ble .LZGEMM_L2x2_LOOP_END ble ZGEMM_L2x2_LOOP_END
.align 5 .align 5
.LZGEMM_L2x2_LOOP: ZGEMM_L2x2_LOOP:
KERNEL2x2_1 KERNEL2x2_1
KERNEL2x2_2 KERNEL2x2_2
@ -283,9 +311,9 @@
KERNEL2x2_2 KERNEL2x2_2
addic. L, L, -1 addic. L, L, -1
bgt .LZGEMM_L2x2_LOOP bgt ZGEMM_L2x2_LOOP
.LZGEMM_L2x2_LOOP_END: ZGEMM_L2x2_LOOP_END:
KERNEL2x2_1 KERNEL2x2_1
KERNEL2x2_2 KERNEL2x2_2
@ -297,9 +325,9 @@
KERNEL2x2_1 KERNEL2x2_1
KERNEL2x2_E2 KERNEL2x2_E2
b .LZGEMM_L2x2_SUB1 b ZGEMM_L2x2_SUB1
.LZGEMM_L2x2_SUB4: ZGEMM_L2x2_SUB4:
KERNEL2x2_SUBI1 KERNEL2x2_SUBI1
KERNEL2x2_SUB1 KERNEL2x2_SUB1
@ -311,48 +339,48 @@
KERNEL2x2_SUB1 KERNEL2x2_SUB1
KERNEL2x2_SUB1 KERNEL2x2_SUB1
b .LZGEMM_L2x2_SUB1 b ZGEMM_L2x2_SUB1
.LZGEMM_L2x2_SUB0: ZGEMM_L2x2_SUB0:
andi. L, K, 7 andi. L, K, 7
KERNEL2x2_SUBI1 KERNEL2x2_SUBI1
addic. L, L, -1 addic. L, L, -1
ble .LZGEMM_L2x2_SAVE ble ZGEMM_L2x2_SAVE
b .LZGEMM_L2x2_SUB2 b ZGEMM_L2x2_SUB2
.LZGEMM_L2x2_SUB1: ZGEMM_L2x2_SUB1:
andi. L, K, 7 andi. L, K, 7
ble .LZGEMM_L2x2_SAVE ble ZGEMM_L2x2_SAVE
.LZGEMM_L2x2_SUB2: ZGEMM_L2x2_SUB2:
KERNEL2x2_SUB1 KERNEL2x2_SUB1
addic. L, L, -1 addic. L, L, -1
bgt .LZGEMM_L2x2_SUB2 bgt ZGEMM_L2x2_SUB2
.LZGEMM_L2x2_SAVE: ZGEMM_L2x2_SAVE:
SAVE2x2 SAVE2x2
.LZGEMM_L2x2_END: ZGEMM_L2x2_END:
.LZGEMM_L2x1_BEGIN: ZGEMM_L2x1_BEGIN:
andi. T1, M, 1 andi. T1, M, 1
ble .LZGEMM_L2x1_END ble ZGEMM_L2x1_END
mr BO, B mr BO, BBUFFER
srawi. L, K, 3 srawi. L, K, 3
ble .LZGEMM_L2x1_SUB0 ble ZGEMM_L2x1_SUB0
cmpwi cr0, L, 1 cmpwi cr0, L, 1
ble .LZGEMM_L2x1_SUB4 ble ZGEMM_L2x1_SUB4
.LZGEMM_L2x1_LOOP_START: ZGEMM_L2x1_LOOP_START:
LOAD2x1_1 LOAD2x1_1
KERNEL2x1_I1 KERNEL2x1_I1
@ -366,11 +394,11 @@
KERNEL2x1_2 KERNEL2x1_2
addic. L, L, -2 addic. L, L, -2
ble .LZGEMM_L2x1_LOOP_END ble ZGEMM_L2x1_LOOP_END
.align 5 .align 5
.LZGEMM_L2x1_LOOP: ZGEMM_L2x1_LOOP:
KERNEL2x1_1 KERNEL2x1_1
KERNEL2x1_2 KERNEL2x1_2
@ -383,9 +411,9 @@
KERNEL2x1_2 KERNEL2x1_2
addic. L, L, -1 addic. L, L, -1
bgt .LZGEMM_L2x1_LOOP bgt ZGEMM_L2x1_LOOP
.LZGEMM_L2x1_LOOP_END: ZGEMM_L2x1_LOOP_END:
KERNEL2x1_1 KERNEL2x1_1
KERNEL2x1_2 KERNEL2x1_2
@ -397,9 +425,9 @@
KERNEL2x1_1 KERNEL2x1_1
KERNEL2x1_E2 KERNEL2x1_E2
b .LZGEMM_L2x1_SUB1 b ZGEMM_L2x1_SUB1
.LZGEMM_L2x1_SUB4: ZGEMM_L2x1_SUB4:
KERNEL2x1_SUBI1 KERNEL2x1_SUBI1
KERNEL2x1_SUB1 KERNEL2x1_SUB1
@ -411,72 +439,89 @@
KERNEL2x1_SUB1 KERNEL2x1_SUB1
KERNEL2x1_SUB1 KERNEL2x1_SUB1
b .LZGEMM_L2x1_SUB1 b ZGEMM_L2x1_SUB1
.LZGEMM_L2x1_SUB0: ZGEMM_L2x1_SUB0:
andi. L, K, 7 andi. L, K, 7
KERNEL2x1_SUBI1 KERNEL2x1_SUBI1
addic. L, L, -1 addic. L, L, -1
ble .LZGEMM_L2x1_SAVE ble ZGEMM_L2x1_SAVE
b .LZGEMM_L2x1_SUB2 b ZGEMM_L2x1_SUB2
.LZGEMM_L2x1_SUB1: ZGEMM_L2x1_SUB1:
andi. L, K, 7 andi. L, K, 7
ble .LZGEMM_L2x1_SAVE ble ZGEMM_L2x1_SAVE
.LZGEMM_L2x1_SUB2: ZGEMM_L2x1_SUB2:
KERNEL2x1_SUB1 KERNEL2x1_SUB1
addic. L, L, -1 addic. L, L, -1
bgt .LZGEMM_L2x1_SUB2 bgt ZGEMM_L2x1_SUB2
.LZGEMM_L2x1_SAVE: ZGEMM_L2x1_SAVE:
SAVE2x1 SAVE2x1
.LZGEMM_L2x1_END: ZGEMM_L2x1_END:
slwi T1, K, 5 slwi T1, K, 5
add B, B, T1 add B, B, T1
addic. J, J, -1 addic. J, J, -1
bgt .LZGEMM_L2_BEGIN bgt ZGEMM_L2_BEGIN
andi. T2, N, 1 andi. T2, N, 1
ble .L999 ble L999
.LZGEMM_L2_END: ZGEMM_L2_END:
b .LZGEMM_L1_BEGIN b ZGEMM_L1_BEGIN
.L999_H1: L999_H1:
b .L999 b L999
ZGEMM_L1_BEGIN:
mr BO, B
mr BBO, BBUFFER
slwi T1, K, 0
ZGEMM_L1_COPYB:
lxvdsx vs4, o0, BO // b0_r
lxvdsx vs5, o8, BO // b0_i
addi BO, BO, 16
stxvd2x vs4, o0, BBO
stxvd2x vs5, o16, BBO
addic. T1, T1, -1
addi BBO, BBO, 32
bge ZGEMM_L1_COPYB
.LZGEMM_L1_BEGIN:
andi. T1, N, 1 andi. T1, N, 1
ble .LZGEMM_L1_END ble ZGEMM_L1_END
mr CO, C mr CO, C
mr AO, A mr AO, A
srawi. I, M, 3 srawi. I, M, 3
ble .LZGEMM_L1x8_END ble ZGEMM_L1x8_END
.LZGEMM_L1x8_BEGIN: ZGEMM_L1x8_BEGIN:
mr BO, B mr BO, BBUFFER
srawi. L, K, 3 srawi. L, K, 3
ble .LZGEMM_L1x8_SUB0 ble ZGEMM_L1x8_SUB0
cmpwi cr0, L, 1 cmpwi cr0, L, 1
ble .LZGEMM_L1x8_SUB4 ble ZGEMM_L1x8_SUB4
.LZGEMM_L1x8_LOOP_START: ZGEMM_L1x8_LOOP_START:
dcbt AO, PRE dcbt AO, PRE
LOAD1x8_1 LOAD1x8_1
@ -499,11 +544,11 @@
KERNEL1x8_2 KERNEL1x8_2
addic. L, L, -2 addic. L, L, -2
ble .LZGEMM_L1x8_LOOP_END ble ZGEMM_L1x8_LOOP_END
.align 5 .align 5
.LZGEMM_L1x8_LOOP: ZGEMM_L1x8_LOOP:
dcbt AO, PRE dcbt AO, PRE
KERNEL1x8_1 KERNEL1x8_1
@ -524,9 +569,9 @@
KERNEL1x8_2 KERNEL1x8_2
addic. L, L, -1 addic. L, L, -1
bgt .LZGEMM_L1x8_LOOP bgt ZGEMM_L1x8_LOOP
.LZGEMM_L1x8_LOOP_END: ZGEMM_L1x8_LOOP_END:
dcbt AO, PRE dcbt AO, PRE
KERNEL1x8_1 KERNEL1x8_1
@ -545,9 +590,9 @@
KERNEL1x8_1 KERNEL1x8_1
KERNEL1x8_E2 KERNEL1x8_E2
b .LZGEMM_L1x8_SUB1 b ZGEMM_L1x8_SUB1
.LZGEMM_L1x8_SUB4: ZGEMM_L1x8_SUB4:
dcbt AO, PRE dcbt AO, PRE
KERNEL1x8_SUBI1 KERNEL1x8_SUBI1
@ -563,53 +608,53 @@
KERNEL1x8_SUB1 KERNEL1x8_SUB1
KERNEL1x8_SUB1 KERNEL1x8_SUB1
b .LZGEMM_L1x8_SUB1 b ZGEMM_L1x8_SUB1
.LZGEMM_L1x8_SUB0: ZGEMM_L1x8_SUB0:
andi. L, K, 7 andi. L, K, 7
KERNEL1x8_SUBI1 KERNEL1x8_SUBI1
addic. L, L, -1 addic. L, L, -1
ble .LZGEMM_L1x8_SAVE ble ZGEMM_L1x8_SAVE
b .LZGEMM_L1x8_SUB2 b ZGEMM_L1x8_SUB2
.LZGEMM_L1x8_SUB1: ZGEMM_L1x8_SUB1:
andi. L, K, 7 andi. L, K, 7
ble .LZGEMM_L1x8_SAVE ble ZGEMM_L1x8_SAVE
.LZGEMM_L1x8_SUB2: ZGEMM_L1x8_SUB2:
KERNEL1x8_SUB1 KERNEL1x8_SUB1
addic. L, L, -1 addic. L, L, -1
bgt .LZGEMM_L1x8_SUB2 bgt ZGEMM_L1x8_SUB2
.LZGEMM_L1x8_SAVE: ZGEMM_L1x8_SAVE:
SAVE1x8 SAVE1x8
addic. I, I, -1 addic. I, I, -1
bgt .LZGEMM_L1x8_BEGIN bgt ZGEMM_L1x8_BEGIN
.LZGEMM_L1x8_END: ZGEMM_L1x8_END:
.LZGEMM_L1x4_BEGIN: ZGEMM_L1x4_BEGIN:
andi. T2, M, 7 andi. T2, M, 7
ble .LZGEMM_L1x1_END ble ZGEMM_L1x1_END
andi. T1, M, 4 andi. T1, M, 4
ble .LZGEMM_L1x4_END ble ZGEMM_L1x4_END
mr BO, B mr BO, BBUFFER
srawi. L, K, 3 srawi. L, K, 3
ble .LZGEMM_L1x4_SUB0 ble ZGEMM_L1x4_SUB0
cmpwi cr0, L, 1 cmpwi cr0, L, 1
ble .LZGEMM_L1x4_SUB4 ble ZGEMM_L1x4_SUB4
.LZGEMM_L1x4_LOOP_START: ZGEMM_L1x4_LOOP_START:
LOAD1x4_1 LOAD1x4_1
KERNEL1x4_I1 KERNEL1x4_I1
@ -623,11 +668,11 @@
KERNEL1x4_2 KERNEL1x4_2
addic. L, L, -2 addic. L, L, -2
ble .LZGEMM_L1x4_LOOP_END ble ZGEMM_L1x4_LOOP_END
.align 5 .align 5
.LZGEMM_L1x4_LOOP: ZGEMM_L1x4_LOOP:
KERNEL1x4_1 KERNEL1x4_1
KERNEL1x4_2 KERNEL1x4_2
@ -640,9 +685,9 @@
KERNEL1x4_2 KERNEL1x4_2
addic. L, L, -1 addic. L, L, -1
bgt .LZGEMM_L1x4_LOOP bgt ZGEMM_L1x4_LOOP
.LZGEMM_L1x4_LOOP_END: ZGEMM_L1x4_LOOP_END:
KERNEL1x4_1 KERNEL1x4_1
KERNEL1x4_2 KERNEL1x4_2
@ -654,9 +699,9 @@
KERNEL1x4_1 KERNEL1x4_1
KERNEL1x4_E2 KERNEL1x4_E2
b .LZGEMM_L1x4_SUB1 b ZGEMM_L1x4_SUB1
.LZGEMM_L1x4_SUB4: ZGEMM_L1x4_SUB4:
KERNEL1x4_SUBI1 KERNEL1x4_SUBI1
KERNEL1x4_SUB1 KERNEL1x4_SUB1
@ -668,48 +713,48 @@
KERNEL1x4_SUB1 KERNEL1x4_SUB1
KERNEL1x4_SUB1 KERNEL1x4_SUB1
b .LZGEMM_L1x4_SUB1 b ZGEMM_L1x4_SUB1
.LZGEMM_L1x4_SUB0: ZGEMM_L1x4_SUB0:
andi. L, K, 7 andi. L, K, 7
KERNEL1x4_SUBI1 KERNEL1x4_SUBI1
addic. L, L, -1 addic. L, L, -1
ble .LZGEMM_L1x4_SAVE ble ZGEMM_L1x4_SAVE
b .LZGEMM_L1x4_SUB2 b ZGEMM_L1x4_SUB2
.LZGEMM_L1x4_SUB1: ZGEMM_L1x4_SUB1:
andi. L, K, 7 andi. L, K, 7
ble .LZGEMM_L1x4_SAVE ble ZGEMM_L1x4_SAVE
.LZGEMM_L1x4_SUB2: ZGEMM_L1x4_SUB2:
KERNEL1x4_SUB1 KERNEL1x4_SUB1
addic. L, L, -1 addic. L, L, -1
bgt .LZGEMM_L1x4_SUB2 bgt ZGEMM_L1x4_SUB2
.LZGEMM_L1x4_SAVE: ZGEMM_L1x4_SAVE:
SAVE1x4 SAVE1x4
.LZGEMM_L1x4_END: ZGEMM_L1x4_END:
.LZGEMM_L1x2_BEGIN: ZGEMM_L1x2_BEGIN:
andi. T1, M, 2 andi. T1, M, 2
ble .LZGEMM_L1x2_END ble ZGEMM_L1x2_END
mr BO, B mr BO, BBUFFER
srawi. L, K, 3 srawi. L, K, 3
ble .LZGEMM_L1x2_SUB0 ble ZGEMM_L1x2_SUB0
cmpwi cr0, L, 1 cmpwi cr0, L, 1
ble .LZGEMM_L1x2_SUB4 ble ZGEMM_L1x2_SUB4
.LZGEMM_L1x2_LOOP_START: ZGEMM_L1x2_LOOP_START:
LOAD1x2_1 LOAD1x2_1
KERNEL1x2_I1 KERNEL1x2_I1
@ -723,11 +768,11 @@
KERNEL1x2_2 KERNEL1x2_2
addic. L, L, -2 addic. L, L, -2
ble .LZGEMM_L1x2_LOOP_END ble ZGEMM_L1x2_LOOP_END
.align 5 .align 5
.LZGEMM_L1x2_LOOP: ZGEMM_L1x2_LOOP:
KERNEL1x2_1 KERNEL1x2_1
KERNEL1x2_2 KERNEL1x2_2
@ -740,9 +785,9 @@
KERNEL1x2_2 KERNEL1x2_2
addic. L, L, -1 addic. L, L, -1
bgt .LZGEMM_L1x2_LOOP bgt ZGEMM_L1x2_LOOP
.LZGEMM_L1x2_LOOP_END: ZGEMM_L1x2_LOOP_END:
KERNEL1x2_1 KERNEL1x2_1
KERNEL1x2_2 KERNEL1x2_2
@ -754,9 +799,9 @@
KERNEL1x2_1 KERNEL1x2_1
KERNEL1x2_E2 KERNEL1x2_E2
b .LZGEMM_L1x2_SUB1 b ZGEMM_L1x2_SUB1
.LZGEMM_L1x2_SUB4: ZGEMM_L1x2_SUB4:
KERNEL1x2_SUBI1 KERNEL1x2_SUBI1
KERNEL1x2_SUB1 KERNEL1x2_SUB1
@ -768,48 +813,48 @@
KERNEL1x2_SUB1 KERNEL1x2_SUB1
KERNEL1x2_SUB1 KERNEL1x2_SUB1
b .LZGEMM_L1x2_SUB1 b ZGEMM_L1x2_SUB1
.LZGEMM_L1x2_SUB0: ZGEMM_L1x2_SUB0:
andi. L, K, 7 andi. L, K, 7
KERNEL1x2_SUBI1 KERNEL1x2_SUBI1
addic. L, L, -1 addic. L, L, -1
ble .LZGEMM_L1x2_SAVE ble ZGEMM_L1x2_SAVE
b .LZGEMM_L1x2_SUB2 b ZGEMM_L1x2_SUB2
.LZGEMM_L1x2_SUB1: ZGEMM_L1x2_SUB1:
andi. L, K, 7 andi. L, K, 7
ble .LZGEMM_L1x2_SAVE ble ZGEMM_L1x2_SAVE
.LZGEMM_L1x2_SUB2: ZGEMM_L1x2_SUB2:
KERNEL1x2_SUB1 KERNEL1x2_SUB1
addic. L, L, -1 addic. L, L, -1
bgt .LZGEMM_L1x2_SUB2 bgt ZGEMM_L1x2_SUB2
.LZGEMM_L1x2_SAVE: ZGEMM_L1x2_SAVE:
SAVE1x2 SAVE1x2
.LZGEMM_L1x2_END: ZGEMM_L1x2_END:
.LZGEMM_L1x1_BEGIN: ZGEMM_L1x1_BEGIN:
andi. T1, M, 1 andi. T1, M, 1
ble .LZGEMM_L1x1_END ble ZGEMM_L1x1_END
mr BO, B mr BO, BBUFFER
srawi. L, K, 3 srawi. L, K, 3
ble .LZGEMM_L1x1_SUB0 ble ZGEMM_L1x1_SUB0
cmpwi cr0, L, 1 cmpwi cr0, L, 1
ble .LZGEMM_L1x1_SUB4 ble ZGEMM_L1x1_SUB4
.LZGEMM_L1x1_LOOP_START: ZGEMM_L1x1_LOOP_START:
LOAD1x1_1 LOAD1x1_1
KERNEL1x1_I1 KERNEL1x1_I1
@ -823,11 +868,11 @@
KERNEL1x1_2 KERNEL1x1_2
addic. L, L, -2 addic. L, L, -2
ble .LZGEMM_L1x1_LOOP_END ble ZGEMM_L1x1_LOOP_END
.align 5 .align 5
.LZGEMM_L1x1_LOOP: ZGEMM_L1x1_LOOP:
KERNEL1x1_1 KERNEL1x1_1
KERNEL1x1_2 KERNEL1x1_2
@ -840,9 +885,9 @@
KERNEL1x1_2 KERNEL1x1_2
addic. L, L, -1 addic. L, L, -1
bgt .LZGEMM_L1x1_LOOP bgt ZGEMM_L1x1_LOOP
.LZGEMM_L1x1_LOOP_END: ZGEMM_L1x1_LOOP_END:
KERNEL1x1_1 KERNEL1x1_1
KERNEL1x1_2 KERNEL1x1_2
@ -854,9 +899,9 @@
KERNEL1x1_1 KERNEL1x1_1
KERNEL1x1_E2 KERNEL1x1_E2
b .LZGEMM_L1x1_SUB1 b ZGEMM_L1x1_SUB1
.LZGEMM_L1x1_SUB4: ZGEMM_L1x1_SUB4:
KERNEL1x1_SUBI1 KERNEL1x1_SUBI1
KERNEL1x1_SUB1 KERNEL1x1_SUB1
@ -868,34 +913,34 @@
KERNEL1x1_SUB1 KERNEL1x1_SUB1
KERNEL1x1_SUB1 KERNEL1x1_SUB1
b .LZGEMM_L1x1_SUB1 b ZGEMM_L1x1_SUB1
.LZGEMM_L1x1_SUB0: ZGEMM_L1x1_SUB0:
andi. L, K, 7 andi. L, K, 7
KERNEL1x1_SUBI1 KERNEL1x1_SUBI1
addic. L, L, -1 addic. L, L, -1
ble .LZGEMM_L1x1_SAVE ble ZGEMM_L1x1_SAVE
b .LZGEMM_L1x1_SUB2 b ZGEMM_L1x1_SUB2
.LZGEMM_L1x1_SUB1: ZGEMM_L1x1_SUB1:
andi. L, K, 7 andi. L, K, 7
ble .LZGEMM_L1x1_SAVE ble ZGEMM_L1x1_SAVE
.LZGEMM_L1x1_SUB2: ZGEMM_L1x1_SUB2:
KERNEL1x1_SUB1 KERNEL1x1_SUB1
addic. L, L, -1 addic. L, L, -1
bgt .LZGEMM_L1x1_SUB2 bgt ZGEMM_L1x1_SUB2
.LZGEMM_L1x1_SAVE: ZGEMM_L1x1_SAVE:
SAVE1x1 SAVE1x1
.LZGEMM_L1x1_END: ZGEMM_L1x1_END:
.LZGEMM_L1_END: ZGEMM_L1_END:

View File

@ -1,39 +1,3 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define XSFADD_R1 xsadddp #define XSFADD_R1 xsadddp
@ -70,12 +34,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD2x8_1 .macro LOAD2x8_1
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B lxvd2x vs18, o32, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A
@ -110,12 +74,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64 addi AO, AO, 64
lxvdsx vs20, o0, BO // load real part from B lxvd2x vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B lxvd2x vs21, o16, BO // load imag part from B
lxvdsx vs22, o16, BO // load real part from B lxvd2x vs22, o32, BO // load real part from B
lxvdsx vs23, o24, BO // load imag part from B lxvd2x vs23, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -156,36 +120,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL2x8_1 .macro KERNEL2x8_1
lxvd2x vs8, o0, AO // load real,imag from A
lxvd2x vs9, o16, AO // load real,imag from A
lxvd2x vs10, o32, AO // load real,imag from A
lxvd2x vs11, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs12, o0, AO // load real,imag from A
lxvd2x vs13, o16, AO // load real,imag from A
lxvd2x vs14, o32, AO // load real,imag from A
lxvd2x vs15, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
lxvd2x vs22, o32, BO // load real part from B
lxvd2x vs23, o48, BO // load imag part from B
addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
xvmaddadp vs34, vs1, vs16 // real*real, imag*real xvmaddadp vs34, vs1, vs16 // real*real, imag*real
xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
lxvdsx vs22, o16, BO // load real part from B
lxvdsx vs23, o24, BO // load imag part from B
xvmaddadp vs36, vs2, vs16 // real*real, imag*real xvmaddadp vs36, vs2, vs16 // real*real, imag*real
xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
xvmaddadp vs38, vs3, vs16 // real*real, imag*real xvmaddadp vs38, vs3, vs16 // real*real, imag*real
xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
lxvd2x vs8, o0, AO // load real,imag from A
lxvd2x vs9, o16, AO // load real,imag from A
xvmaddadp vs40, vs4, vs16 // real*real, imag*real xvmaddadp vs40, vs4, vs16 // real*real, imag*real
xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
xvmaddadp vs42, vs5, vs16 // real*real, imag*real xvmaddadp vs42, vs5, vs16 // real*real, imag*real
xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
lxvd2x vs10, o32, AO // load real,imag from A
lxvd2x vs11, o48, AO // load real,imag from A
xvmaddadp vs44, vs6, vs16 // real*real, imag*real xvmaddadp vs44, vs6, vs16 // real*real, imag*real
xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
addi AO, AO, 64
xvmaddadp vs46, vs7, vs16 // real*real, imag*real xvmaddadp vs46, vs7, vs16 // real*real, imag*real
xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
@ -193,101 +162,79 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag
xvmaddadp vs50, vs1, vs18 // real*real, imag*real xvmaddadp vs50, vs1, vs18 // real*real, imag*real
xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag
lxvd2x vs12, o0, AO // load real,imag from A
lxvd2x vs13, o16, AO // load real,imag from A
xvmaddadp vs52, vs2, vs18 // real*real, imag*real xvmaddadp vs52, vs2, vs18 // real*real, imag*real
xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag
xvmaddadp vs54, vs3, vs18 // real*real, imag*real xvmaddadp vs54, vs3, vs18 // real*real, imag*real
xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag
lxvd2x vs14, o32, AO // load real,imag from A
lxvd2x vs15, o48, AO // load real,imag from A
xvmaddadp vs56, vs4, vs18 // real*real, imag*real xvmaddadp vs56, vs4, vs18 // real*real, imag*real
xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag
xvmaddadp vs58, vs5, vs18 // real*real, imag*real xvmaddadp vs58, vs5, vs18 // real*real, imag*real
xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag
lxvdsx vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B
xvmaddadp vs60, vs6, vs18 // real*real, imag*real xvmaddadp vs60, vs6, vs18 // real*real, imag*real
xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag
xvmaddadp vs62, vs7, vs18 // real*real, imag*real xvmaddadp vs62, vs7, vs18 // real*real, imag*real
xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag
addi AO, AO, 64
addi BO, BO, 32
.endm .endm
.macro KERNEL2x8_2 .macro KERNEL2x8_2
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
lxvd2x vs2, o32, AO // load real,imag from A
lxvd2x vs3, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs4, o0, AO // load real,imag from A
lxvd2x vs5, o16, AO // load real,imag from A
lxvd2x vs6, o32, AO // load real,imag from A
lxvd2x vs7, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 64
xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
xvmaddadp vs34, vs9, vs20 // real*real, imag*real xvmaddadp vs34, vs9, vs20 // real*real, imag*real
xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
xvmaddadp vs36, vs10, vs20 // real*real, imag*real xvmaddadp vs36, vs10, vs20 // real*real, imag*real
xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
xvmaddadp vs38, vs11, vs20 // real*real, imag*real xvmaddadp vs38, vs11, vs20 // real*real, imag*real
xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
xvmaddadp vs40, vs12, vs20 // real*real, imag*real xvmaddadp vs40, vs12, vs20 // real*real, imag*real
xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
xvmaddadp vs42, vs13, vs20 // real*real, imag*real xvmaddadp vs42, vs13, vs20 // real*real, imag*real
xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
lxvd2x vs2, o32, AO // load real,imag from A
lxvd2x vs3, o48, AO // load real,imag from A
xvmaddadp vs44, vs14, vs20 // real*real, imag*real xvmaddadp vs44, vs14, vs20 // real*real, imag*real
xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
xvmaddadp vs46, vs15, vs20 // real*real, imag*real xvmaddadp vs46, vs15, vs20 // real*real, imag*real
xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
addi AO, AO, 64
xvmaddadp vs48, vs8, vs22 // real*real, imag*real xvmaddadp vs48, vs8, vs22 // real*real, imag*real
xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag
xvmaddadp vs50, vs9, vs22 // real*real, imag*real xvmaddadp vs50, vs9, vs22 // real*real, imag*real
xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag
lxvd2x vs4, o0, AO // load real,imag from A
lxvd2x vs5, o16, AO // load real,imag from A
xvmaddadp vs52, vs10, vs22 // real*real, imag*real xvmaddadp vs52, vs10, vs22 // real*real, imag*real
xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag
xvmaddadp vs54, vs11, vs22 // real*real, imag*real xvmaddadp vs54, vs11, vs22 // real*real, imag*real
xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag
lxvd2x vs6, o32, AO // load real,imag from A
lxvd2x vs7, o48, AO // load real,imag from A
xvmaddadp vs56, vs12, vs22 // real*real, imag*real xvmaddadp vs56, vs12, vs22 // real*real, imag*real
xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag
xvmaddadp vs58, vs13, vs22 // real*real, imag*real xvmaddadp vs58, vs13, vs22 // real*real, imag*real
xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag
lxvdsx vs18, o16, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B
xvmaddadp vs60, vs14, vs22 // real*real, imag*real xvmaddadp vs60, vs14, vs22 // real*real, imag*real
xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag
xvmaddadp vs62, vs15, vs22 // real*real, imag*real xvmaddadp vs62, vs15, vs22 // real*real, imag*real
xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag
addi AO, AO, 64
addi BO, BO, 32
.endm .endm
@ -347,12 +294,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64 addi AO, AO, 64
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B lxvd2x vs18, o32, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -407,12 +354,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64 addi AO, AO, 64
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B lxvd2x vs18, o32, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -927,12 +874,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD2x4_1 .macro LOAD2x4_1
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B lxvd2x vs18, o32, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A
@ -953,12 +900,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64 addi AO, AO, 64
lxvdsx vs20, o0, BO // load real part from B lxvd2x vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B lxvd2x vs21, o16, BO // load imag part from B
lxvdsx vs22, o16, BO // load real part from B lxvd2x vs22, o32, BO // load real part from B
lxvdsx vs23, o24, BO // load imag part from B lxvd2x vs23, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -990,12 +937,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64 addi AO, AO, 64
lxvdsx vs20, o0, BO // load real part from B lxvd2x vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B lxvd2x vs21, o16, BO // load imag part from B
lxvdsx vs22, o16, BO // load real part from B lxvd2x vs22, o32, BO // load real part from B
lxvdsx vs23, o24, BO // load imag part from B lxvd2x vs23, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -1027,12 +974,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64 addi AO, AO, 64
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B lxvd2x vs18, o32, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
@ -1088,12 +1035,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64 addi AO, AO, 64
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B lxvd2x vs18, o32, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -1125,12 +1072,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64 addi AO, AO, 64
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B lxvd2x vs18, o32, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -1410,12 +1357,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD2x2_1 .macro LOAD2x2_1
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B lxvd2x vs18, o32, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A
@ -1432,12 +1379,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 32 addi AO, AO, 32
lxvdsx vs20, o0, BO // load real part from B lxvd2x vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B lxvd2x vs21, o16, BO // load imag part from B
lxvdsx vs22, o16, BO // load real part from B lxvd2x vs22, o32, BO // load real part from B
lxvdsx vs23, o24, BO // load imag part from B lxvd2x vs23, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -1459,12 +1406,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 32 addi AO, AO, 32
lxvdsx vs20, o0, BO // load real part from B lxvd2x vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B lxvd2x vs21, o16, BO // load imag part from B
lxvdsx vs22, o16, BO // load real part from B lxvd2x vs22, o32, BO // load real part from B
lxvdsx vs23, o24, BO // load imag part from B lxvd2x vs23, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -1486,12 +1433,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 32 addi AO, AO, 32
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B lxvd2x vs18, o32, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
@ -1529,12 +1476,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 32 addi AO, AO, 32
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B lxvd2x vs18, o32, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -1556,12 +1503,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 32 addi AO, AO, 32
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B lxvd2x vs18, o32, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -1725,12 +1672,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD2x1_1 .macro LOAD2x1_1
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B lxvd2x vs18, o32, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs0, o0, AO // load real,imag from A
@ -1745,12 +1692,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 16 addi AO, AO, 16
lxvdsx vs20, o0, BO // load real part from B lxvd2x vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B lxvd2x vs21, o16, BO // load imag part from B
lxvdsx vs22, o16, BO // load real part from B lxvd2x vs22, o32, BO // load real part from B
lxvdsx vs23, o24, BO // load imag part from B lxvd2x vs23, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -1767,12 +1714,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 16 addi AO, AO, 16
lxvdsx vs20, o0, BO // load real part from B lxvd2x vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B lxvd2x vs21, o16, BO // load imag part from B
lxvdsx vs22, o16, BO // load real part from B lxvd2x vs22, o32, BO // load real part from B
lxvdsx vs23, o24, BO // load imag part from B lxvd2x vs23, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -1789,12 +1736,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 16 addi AO, AO, 16
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B lxvd2x vs18, o32, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
@ -1823,12 +1770,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 16 addi AO, AO, 16
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B lxvd2x vs18, o32, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -1845,12 +1792,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 16 addi AO, AO, 16
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B lxvd2x vs18, o32, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -1956,10 +1903,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD1x8_1 .macro LOAD1x8_1
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 16 addi BO, BO, 32
lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A
@ -1994,10 +1941,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64 addi AO, AO, 64
lxvdsx vs20, o0, BO // load real part from B lxvd2x vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B lxvd2x vs21, o16, BO // load imag part from B
addi BO, BO, 16 addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -2035,10 +1982,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64 addi AO, AO, 64
lxvdsx vs20, o0, BO // load real part from B lxvd2x vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B lxvd2x vs21, o16, BO // load imag part from B
addi BO, BO, 16 addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -2076,10 +2023,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64 addi AO, AO, 64
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 16 addi BO, BO, 32
xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
@ -2140,10 +2087,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64 addi AO, AO, 64
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 16 addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -2181,10 +2128,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64 addi AO, AO, 64
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 16 addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -2452,10 +2399,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD1x4_1 .macro LOAD1x4_1
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 16 addi BO, BO, 32
lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A
@ -2476,10 +2423,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64 addi AO, AO, 64
lxvdsx vs20, o0, BO // load real part from B lxvd2x vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B lxvd2x vs21, o16, BO // load imag part from B
addi BO, BO, 16 addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -2502,10 +2449,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64 addi AO, AO, 64
lxvdsx vs20, o0, BO // load real part from B lxvd2x vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B lxvd2x vs21, o16, BO // load imag part from B
addi BO, BO, 16 addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -2528,10 +2475,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64 addi AO, AO, 64
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 16 addi BO, BO, 32
xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
@ -2569,10 +2516,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64 addi AO, AO, 64
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 16 addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -2595,10 +2542,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64 addi AO, AO, 64
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 16 addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -2748,10 +2695,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD1x2_1 .macro LOAD1x2_1
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 16 addi BO, BO, 32
lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A
@ -2768,10 +2715,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 32 addi AO, AO, 32
lxvdsx vs20, o0, BO // load real part from B lxvd2x vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B lxvd2x vs21, o16, BO // load imag part from B
addi BO, BO, 16 addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -2788,10 +2735,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 32 addi AO, AO, 32
lxvdsx vs20, o0, BO // load real part from B lxvd2x vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B lxvd2x vs21, o16, BO // load imag part from B
addi BO, BO, 16 addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -2808,10 +2755,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 32 addi AO, AO, 32
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 16 addi BO, BO, 32
xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
@ -2839,10 +2786,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 32 addi AO, AO, 32
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 16 addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -2859,10 +2806,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 32 addi AO, AO, 32
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 16 addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -2954,10 +2901,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD1x1_1 .macro LOAD1x1_1
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 16 addi BO, BO, 32
lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs0, o0, AO // load real,imag from A
@ -2972,10 +2919,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 16 addi AO, AO, 16
lxvdsx vs20, o0, BO // load real part from B lxvd2x vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B lxvd2x vs21, o16, BO // load imag part from B
addi BO, BO, 16 addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -2989,10 +2936,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 16 addi AO, AO, 16
lxvdsx vs20, o0, BO // load real part from B lxvd2x vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B lxvd2x vs21, o16, BO // load imag part from B
addi BO, BO, 16 addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -3006,10 +2953,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 16 addi AO, AO, 16
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 16 addi BO, BO, 32
xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
@ -3032,10 +2979,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 16 addi AO, AO, 16
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 16 addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -3049,10 +2996,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 16 addi AO, AO, 16
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 16 addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag

176
kernel/power/zscal.c Normal file
View File

@ -0,0 +1,176 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/27 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#include "common.h"
#pragma GCC optimize "O1"
#if defined(POWER8)
#include "zscal_microk_power8.c"
#endif
#ifndef HAVE_KERNEL_8
// Scalar fallback for complex-double scal: multiplies n complex elements of
// x in place using the packed factor alpha = { ar, ar, -ai, ai } prepared by
// the caller.  Four complex (eight FLOAT) values are handled per iteration,
// so n is expected to be a multiple of 4.
static void zscal_kernel_8(BLASLONG n, FLOAT *x, FLOAT *alpha)
{
	FLOAT ar1 = alpha[0];   /* real factor applied to the real half     */
	FLOAT ar2 = alpha[1];   /* real factor applied to the imag half     */
	FLOAT ai1 = alpha[2];   /* -da_i: folds the subtraction into an add */
	FLOAT ai2 = alpha[3];   /*  da_i                                    */
	FLOAT *p = x;
	BLASLONG done;
	BLASLONG k;

	for (done = 0; done < n; done += 4)
	{
		/* (re,im) <- (re*ar1 + im*ai1, im*ar2 + re*ai2) for each of the
		   four complex values in this chunk */
		for (k = 0; k < 4; k++)
		{
			FLOAT re = p[2 * k];
			FLOAT im = p[2 * k + 1];

			p[2 * k]     = re * ar1 + im * ai1;
			p[2 * k + 1] = im * ar2 + re * ai2;
		}
		p += 8;
	}

	return;
}
#endif
// BLAS zscal entry point: x[k] <- (da_r + i*da_i) * x[k] for n complex
// elements with stride inc_x.  The y/inc_y/dummy arguments are unused
// placeholders required by the common kernel signature.
// Returns 0 always; does nothing for n <= 0 or inc_x <= 0.
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG i  = 0;   /* complex elements processed so far            */
	BLASLONG ip = 0;   /* FLOAT index of the current element           */
	BLASLONG step;     /* FLOAT distance between consecutive elements  */
	BLASLONG n1;
	FLOAT t;
	FLOAT alpha[4] __attribute__ ((aligned (16)));

	if ( n <= 0 )
		return(0);
	if ( inc_x <= 0 )
		return(0);

	if ( inc_x == 1 )
	{
		/* contiguous case: bulk goes through the vector kernel in
		   blocks of 8 complex elements */
		n1 = n & -8;
		if ( n1 > 0 )
		{
			alpha[0] = da_r;
			alpha[1] = da_r;
			alpha[2] = -da_i;  /* kernel adds, so negate here */
			alpha[3] = da_i;
			zscal_kernel_8(n1, x, alpha);
			i  = n1;
			ip = 2 * n1;
		}
		step = 2;
	}
	else
	{
		step = 2 * inc_x;
	}

	/* scalar tail / strided path:
	   (re,im) <- (re*ar - im*ai, im*ar + re*ai) */
	for ( ; i < n; i++, ip += step )
	{
		t        = da_r * x[ip]   - da_i * x[ip+1];
		x[ip+1]  = da_r * x[ip+1] + da_i * x[ip];
		x[ip]    = t;
	}

	return(0);
}

View File

@ -0,0 +1,224 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
*
* I don't use fused multipy-add ( lapack precision problems )
*
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#define HAVE_KERNEL_8 1
// POWER8 VSX kernel: scales n complex-double elements of x in place by the
// factor packed into alpha.  The caller (zscal.c) fills alpha as
// { da_r, da_r, -da_i, da_i }, so vs32 = (ar, ar) and vs33 = (-ai, ai) below;
// the caller also guarantees n is a positive multiple of 8 (n1 = n & -8).
// Marked noinline so the hand-scheduled asm is not duplicated/reordered.
static void zscal_kernel_8( BLASLONG n, FLOAT *x, FLOAT *alpha) __attribute__ ((noinline));
static void zscal_kernel_8( BLASLONG n, FLOAT *x, FLOAT *alpha)
{
BLASLONG i = n;
// byte offsets passed in as index registers for the lxvd2x/stxvd2x pairs
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x;   // load pointer (runs one 128-byte chunk ahead of the stores)
FLOAT *x2=x+1; // store pointer; the "addi %1, %1, -8" below realigns it to x
BLASLONG pre = 384; // dcbt prefetch distance in bytes
__asm__ __volatile__
(
// splat the packed scale factors into vs32/vs33
"lxvd2x 32, 0, %3 \n\t" // alpha_r , alpha_r
"lxvd2x 33, %5, %3 \n\t" // -alpha_i , alpha_i
"addi %1, %1, -8 \n\t"
"dcbt %2, %4 \n\t"
// software pipeline prologue: preload the first 8 complex values
"lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
"lxvd2x 41, %5, %2 \n\t"
"lxvd2x 42, %6, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t"
"lxvd2x 44, %8, %2 \n\t"
"lxvd2x 45, %9, %2 \n\t"
"lxvd2x 46, %10, %2 \n\t"
"lxvd2x 47, %11, %2 \n\t"
"addi %2, %2, 128 \n\t"
"addic. %0 , %0 , -8 \n\t"
"ble 2f \n\t"
".align 5 \n\t"
"1: \n\t"
// main loop: multiply the current chunk while loading the next one
"dcbt %2, %4 \n\t"
"xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
"xvmuldp 49, 41, 32 \n\t"
"xvmuldp 50, 42, 32 \n\t"
"xvmuldp 51, 43, 32 \n\t"
"xvmuldp 52, 44, 32 \n\t"
"xvmuldp 53, 45, 32 \n\t"
"xvmuldp 54, 46, 32 \n\t"
"xvmuldp 55, 47, 32 \n\t"
// swap real/imag halves so one multiply yields the cross terms
"xxswapd 56, 40 \n\t"
"xxswapd 57, 41 \n\t"
"xxswapd 58, 42 \n\t"
"xxswapd 59, 43 \n\t"
"xxswapd 60, 44 \n\t"
"xxswapd 61, 45 \n\t"
"xxswapd 62, 46 \n\t"
"xxswapd 63, 47 \n\t"
"xvmuldp 56, 56, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
"xvmuldp 57, 57, 33 \n\t"
"lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
"lxvd2x 41, %5, %2 \n\t"
"xvmuldp 58, 58, 33 \n\t"
"xvmuldp 59, 59, 33 \n\t"
"lxvd2x 42, %6, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t"
"xvmuldp 60, 60, 33 \n\t"
"xvmuldp 61, 61, 33 \n\t"
"lxvd2x 44, %8, %2 \n\t"
"lxvd2x 45, %9, %2 \n\t"
"xvmuldp 62, 62, 33 \n\t"
"xvmuldp 63, 63, 33 \n\t"
"lxvd2x 46, %10, %2 \n\t"
"lxvd2x 47, %11, %2 \n\t"
// combine: (re*ar + im*(-ai), im*ar + re*ai) and store the results
"xvadddp 48, 48 , 56 \n\t"
"xvadddp 49, 49 , 57 \n\t"
"xvadddp 50, 50 , 58 \n\t"
"xvadddp 51, 51 , 59 \n\t"
"stxvd2x 48, 0, %1 \n\t"
"stxvd2x 49, %5, %1 \n\t"
"xvadddp 52, 52 , 60 \n\t"
"xvadddp 53, 53 , 61 \n\t"
"stxvd2x 50, %6, %1 \n\t"
"stxvd2x 51, %7, %1 \n\t"
"xvadddp 54, 54 , 62 \n\t"
"xvadddp 55, 55 , 63 \n\t"
"stxvd2x 52, %8, %1 \n\t"
"stxvd2x 53, %9, %1 \n\t"
"stxvd2x 54, %10, %1 \n\t"
"stxvd2x 55, %11, %1 \n\t"
"addi %1, %1, 128 \n\t"
"addi %2, %2, 128 \n\t"
"addic. %0 , %0 , -8 \n\t"
"bgt 1b \n\t"
"2: \n\t"
// epilogue: process the final preloaded chunk (no further loads)
"xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
"xvmuldp 49, 41, 32 \n\t"
"xvmuldp 50, 42, 32 \n\t"
"xvmuldp 51, 43, 32 \n\t"
"xvmuldp 52, 44, 32 \n\t"
"xvmuldp 53, 45, 32 \n\t"
"xvmuldp 54, 46, 32 \n\t"
"xvmuldp 55, 47, 32 \n\t"
"xxswapd 56, 40 \n\t"
"xxswapd 57, 41 \n\t"
"xxswapd 58, 42 \n\t"
"xxswapd 59, 43 \n\t"
"xxswapd 60, 44 \n\t"
"xxswapd 61, 45 \n\t"
"xxswapd 62, 46 \n\t"
"xxswapd 63, 47 \n\t"
"xvmuldp 56, 56, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
"xvmuldp 57, 57, 33 \n\t"
"xvmuldp 58, 58, 33 \n\t"
"xvmuldp 59, 59, 33 \n\t"
"xvmuldp 60, 60, 33 \n\t"
"xvmuldp 61, 61, 33 \n\t"
"xvmuldp 62, 62, 33 \n\t"
"xvmuldp 63, 63, 33 \n\t"
"xvadddp 48, 48 , 56 \n\t"
"xvadddp 49, 49 , 57 \n\t"
"xvadddp 50, 50 , 58 \n\t"
"xvadddp 51, 51 , 59 \n\t"
"xvadddp 52, 52 , 60 \n\t"
"xvadddp 53, 53 , 61 \n\t"
"xvadddp 54, 54 , 62 \n\t"
"xvadddp 55, 55 , 63 \n\t"
"stxvd2x 48, 0, %1 \n\t"
"stxvd2x 49, %5, %1 \n\t"
"stxvd2x 50, %6, %1 \n\t"
"stxvd2x 51, %7, %1 \n\t"
"stxvd2x 52, %8, %1 \n\t"
"stxvd2x 53, %9, %1 \n\t"
"stxvd2x 54, %10, %1 \n\t"
"stxvd2x 55, %11, %1 \n\t"
:
:
"r" (i), // 0
"r" (x2), // 1
"r" (x1), // 2
"r" (alpha), // 3
"r" (pre), // 4
"r" (o16), // 5
"r" (o32), // 6
"r" (o48), // 7
"r" (o64), // 8
"r" (o80), // 9
"r" (o96), // 10
"r" (o112) // 11
// NOTE(review): %0/%1/%2 are input operands yet also listed as clobbers;
// GCC documents this combination as invalid — newer compilers may reject
// or miscompile it.  Consider rewriting with in/out ("+r") operands.
: "cr0", "%0", "%2" , "%1", "memory"
);
}

175
kernel/power/zswap.c Normal file
View File

@ -0,0 +1,175 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/27 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#include "common.h"
#if defined(POWER8)
#include "zswap_microk_power8.c"
#endif
#ifndef HAVE_KERNEL_16
// Scalar fallback: exchange n complex-double elements between x and y.
// Four complex (eight FLOAT) values are handled per iteration, so n is
// expected to be a multiple of 4.  The read-everything-then-write-everything
// ordering of the original is preserved.
static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
{
	FLOAT bufx[8];
	FLOAT bufy[8];
	FLOAT *xp = x;
	FLOAT *yp = y;
	BLASLONG done;
	BLASLONG k;

	for (done = 0; done < n; done += 4)
	{
		/* stage both chunks before writing either one back */
		for (k = 0; k < 8; k++) bufx[k] = xp[k];
		for (k = 0; k < 8; k++) bufy[k] = yp[k];
		for (k = 0; k < 8; k++) yp[k]   = bufx[k];
		for (k = 0; k < 8; k++) xp[k]   = bufy[k];

		xp += 8;
		yp += 8;
	}

	return;
}
#endif
// BLAS zswap entry point: exchanges n complex-double elements between x
// (stride inc_x) and y (stride inc_y).  The dummy arguments are unused
// placeholders required by the common kernel signature.  Returns 0 always.
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG i  = 0;          /* complex elements processed so far */
	BLASLONG ix = 0, iy = 0;  /* FLOAT indices into x and y        */
	BLASLONG stepx, stepy;    /* FLOAT strides per complex element */
	FLOAT tr, ti;
	BLASLONG n1;

	if ( n <= 0 ) return(0);

	if ( (inc_x == 1) && (inc_y == 1 ))
	{
		/* contiguous case: bulk goes through the vector kernel in
		   blocks of 16 complex elements */
		n1 = n & -16;
		if ( n1 > 0 )
		{
			zswap_kernel_16(n1, x, y);
			i  = n1;
			ix = 2 * n1;
			iy = 2 * n1;
		}
		stepx = 2;
		stepy = 2;
	}
	else
	{
		stepx = 2 * inc_x;
		stepy = 2 * inc_y;
	}

	/* scalar tail / strided path: swap one complex element at a time */
	for ( ; i < n; i++, ix += stepx, iy += stepy )
	{
		tr = x[ix];
		ti = x[ix+1];

		x[ix]   = y[iy];
		x[ix+1] = y[iy+1];

		y[iy]   = tr;
		y[iy+1] = ti;
	}

	return(0);
}

View File

@ -0,0 +1,180 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/27 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#define HAVE_KERNEL_16 1
// POWER8 VSX micro-kernel for ZSWAP: exchanges the contents of x[0..n-1] and
// y[0..n-1], where each element is a double-complex (16 bytes).
// The loop moves 16 complex elements (256 bytes) per direction per iteration
// and decrements the counter by 16, so the caller must pass n as a positive
// multiple of 16 (hence the _16 suffix; the wrapper handles the tail).
// noinline keeps this asm-heavy body out of the caller.
static void zswap_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
static void zswap_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y)
{
BLASLONG i = n;
// Byte offsets kept in GPRs; lxvd2x/stxvd2x take a (base, index) register
// pair, so these serve as index registers for the 8 slots of each 128-byte
// batch.
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
// x1/y1 are the load pointers; x2/y2 are separate store pointers. They are
// initialized one FLOAT (8 bytes) past the base and rewound by 8 inside the
// asm, which simply gives the asm block four independent pointer registers
// over the same two arrays.
FLOAT *x1=x;
FLOAT *y1=y;
FLOAT *x2=x+1;
FLOAT *y2=y+1;
// NOTE(review): pre and alpha are declared but never referenced by the asm
// operands below — apparently leftovers from a template kernel.
BLASLONG pre = 384;
BLASLONG alpha=0;
__asm__ __volatile__
(
// Rewind the +1-element store pointers back to the element base.
"addi %3, %3, -8 \n\t"
"addi %4, %4, -8 \n\t"
".align 5 \n\t"
"1: \n\t"
// Load 256 bytes (16 complex doubles) from x into vs32..vs47,
// in two 128-byte batches with a pointer bump in between.
"lxvd2x 32, 0, %2 \n\t"
"lxvd2x 33, %5, %2 \n\t"
"lxvd2x 34, %6, %2 \n\t"
"lxvd2x 35, %7, %2 \n\t"
"lxvd2x 36, %8, %2 \n\t"
"lxvd2x 37, %9, %2 \n\t"
"lxvd2x 38, %10, %2 \n\t"
"lxvd2x 39, %11, %2 \n\t"
"addi %2, %2, 128 \n\t"
"lxvd2x 40, 0, %2 \n\t"
"lxvd2x 41, %5, %2 \n\t"
"lxvd2x 42, %6, %2 \n\t"
"lxvd2x 43, %7, %2 \n\t"
"lxvd2x 44, %8, %2 \n\t"
"lxvd2x 45, %9, %2 \n\t"
"lxvd2x 46, %10, %2 \n\t"
"lxvd2x 47, %11, %2 \n\t"
"addi %2, %2, 128 \n\t"
// Load 256 bytes (16 complex doubles) from y into vs48..vs63.
"lxvd2x 48, 0, %1 \n\t"
"lxvd2x 49, %5, %1 \n\t"
"lxvd2x 50, %6, %1 \n\t"
"lxvd2x 51, %7, %1 \n\t"
"lxvd2x 52, %8, %1 \n\t"
"lxvd2x 53, %9, %1 \n\t"
"lxvd2x 54, %10, %1 \n\t"
"lxvd2x 55, %11, %1 \n\t"
"addi %1, %1, 128 \n\t"
"lxvd2x 56, 0, %1 \n\t"
"lxvd2x 57, %5, %1 \n\t"
"lxvd2x 58, %6, %1 \n\t"
"lxvd2x 59, %7, %1 \n\t"
"lxvd2x 60, %8, %1 \n\t"
"lxvd2x 61, %9, %1 \n\t"
"lxvd2x 62, %10, %1 \n\t"
"lxvd2x 63, %11, %1 \n\t"
"addi %1, %1, 128 \n\t"
// Store x's data (vs32..vs47) into y — first half of the swap.
"stxvd2x 32, 0, %3 \n\t"
"stxvd2x 33, %5, %3 \n\t"
"stxvd2x 34, %6, %3 \n\t"
"stxvd2x 35, %7, %3 \n\t"
"stxvd2x 36, %8, %3 \n\t"
"stxvd2x 37, %9, %3 \n\t"
"stxvd2x 38, %10, %3 \n\t"
"stxvd2x 39, %11, %3 \n\t"
"addi %3, %3, 128 \n\t"
"stxvd2x 40, 0, %3 \n\t"
"stxvd2x 41, %5, %3 \n\t"
"stxvd2x 42, %6, %3 \n\t"
"stxvd2x 43, %7, %3 \n\t"
"stxvd2x 44, %8, %3 \n\t"
"stxvd2x 45, %9, %3 \n\t"
"stxvd2x 46, %10, %3 \n\t"
"stxvd2x 47, %11, %3 \n\t"
"addi %3, %3, 128 \n\t"
// Store y's data (vs48..vs63) into x — second half of the swap.
"stxvd2x 48, 0, %4 \n\t"
"stxvd2x 49, %5, %4 \n\t"
"stxvd2x 50, %6, %4 \n\t"
"stxvd2x 51, %7, %4 \n\t"
"stxvd2x 52, %8, %4 \n\t"
"stxvd2x 53, %9, %4 \n\t"
"stxvd2x 54, %10, %4 \n\t"
"stxvd2x 55, %11, %4 \n\t"
"addi %4, %4, 128 \n\t"
"stxvd2x 56, 0, %4 \n\t"
"stxvd2x 57, %5, %4 \n\t"
"stxvd2x 58, %6, %4 \n\t"
"stxvd2x 59, %7, %4 \n\t"
"stxvd2x 60, %8, %4 \n\t"
"stxvd2x 61, %9, %4 \n\t"
"stxvd2x 62, %10, %4 \n\t"
"stxvd2x 63, %11, %4 \n\t"
"addi %4, %4, 128 \n\t"
// i -= 16; loop while the result is still > 0 (cr0 set by addic.).
"addic. %0 , %0 , -16 \n\t"
"bgt 1b \n\t"
"2: \n\t"
:
:
"r" (i), // 0
"r" (y1), // 1
"r" (x1), // 2
"r" (y2), // 3
"r" (x2), // 4
"r" (o16), // 5
"r" (o32), // 6
"r" (o48), // 7
"r" (o64), // 8
"r" (o80), // 9
"r" (o96), // 10
"r" (o112) // 11
// NOTE(review): %0..%4 are input operands yet are modified by the asm and
// listed in the clobber list; GCC's extended-asm rules say clobbers must
// not overlap operands and inputs must not be written. This should use
// "+r" (or "+&r") in/out constraints instead — TODO confirm against the
// GCC inline-assembly documentation before changing. vs32..vs63 are also
// written but not declared clobbered, relying on the ABI treating them as
// volatile — verify against the ELFv2 ABI.
: "cr0", "%0", "%2" , "%1", "%3", "%4", "memory"
);
}

View File

@ -271,7 +271,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#endif #endif
#include "zgemm_macros_8x2_power8.S" #include "ztrmm_macros_8x2_power8.S"
cmpwi cr0, M, 0 cmpwi cr0, M, 0
ble .L999 ble .L999

File diff suppressed because it is too large Load Diff

View File

@ -24,7 +24,7 @@ SGEMVTKERNEL = sgemv_t_4.c
DGEMVNKERNEL = dgemv_n_4.c DGEMVNKERNEL = dgemv_n_4.c
DGEMVTKERNEL = dgemv_t_4.c DGEMVTKERNEL = dgemv_t_4.c
ZGEMVNKERNEL = zgemv_t_4.c ZGEMVNKERNEL = zgemv_n_4.c
ZGEMVTKERNEL = zgemv_t_4.c ZGEMVTKERNEL = zgemv_t_4.c
DCOPYKERNEL = dcopy_bulldozer.S DCOPYKERNEL = dcopy_bulldozer.S

View File

@ -72,18 +72,20 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{ {
BLASLONG i=0; BLASLONG i=0;
BLASLONG ix=0,iy=0; BLASLONG ix=0,iy=0;
double dot = 0.0 ;
FLOAT dot = 0.0 ; FLOAT mydot=0.0;
BLASLONG n1;
if ( n <= 0 ) return(dot); if ( n <= 0 ) return(dot);
if ( (inc_x == 1) && (inc_y == 1) ) if ( (inc_x == 1) && (inc_y == 1) )
{ {
BLASLONG n1 = n & -32; n1 = n & (BLASLONG)(-32);
if ( n1 ) if ( n1 )
sdot_kernel_16(n1, x, y , &dot ); sdot_kernel_16(n1, x, y , &mydot );
i = n1; i = n1;
@ -94,12 +96,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
i++ ; i++ ;
} }
dot+=mydot;
return(dot); return(dot);
} }
BLASLONG n1 = n & -2; n1 = n & (BLASLONG)(-2);
while(i < n1) while(i < n1)
{ {
@ -124,4 +127,3 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
} }

31
param.h
View File

@ -1961,35 +1961,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(POWER8) #if defined(POWER8)
#define SNUMOPT 4 #define SNUMOPT 16
#define DNUMOPT 8 #define DNUMOPT 8
#define GEMM_DEFAULT_OFFSET_A 384 #define GEMM_DEFAULT_OFFSET_A 4096
#define GEMM_DEFAULT_OFFSET_B 1024 #define GEMM_DEFAULT_OFFSET_B 4096
#define GEMM_DEFAULT_ALIGN 0x03fffUL #define GEMM_DEFAULT_ALIGN 0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_UNROLL_N 8
#define DGEMM_DEFAULT_UNROLL_M 16 #define DGEMM_DEFAULT_UNROLL_M 16
#define DGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4
#define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 8
#define CGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_M 8
#define ZGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2
#define SGEMM_DEFAULT_P 992 #define SGEMM_DEFAULT_P 960
#define DGEMM_DEFAULT_P 480 #define DGEMM_DEFAULT_P 480
#define CGEMM_DEFAULT_P 488 #define CGEMM_DEFAULT_P 720
#define ZGEMM_DEFAULT_P 240 #define ZGEMM_DEFAULT_P 480
#define SGEMM_DEFAULT_Q 504 #define SGEMM_DEFAULT_Q 720
#define DGEMM_DEFAULT_Q 720 #define DGEMM_DEFAULT_Q 720
#define CGEMM_DEFAULT_Q 400 #define CGEMM_DEFAULT_Q 720
#define ZGEMM_DEFAULT_Q 360 #define ZGEMM_DEFAULT_Q 720
#define SGEMM_DEFAULT_R 28800 #define SGEMM_DEFAULT_R 21600
#define DGEMM_DEFAULT_R 14400 #define DGEMM_DEFAULT_R 14400
#define ZGEMM_DEFAULT_R 7200 #define CGEMM_DEFAULT_R 16200
#define ZGEMM_DEFAULT_R 21600
#define SYMV_P 8 #define SYMV_P 8