Merge pull request #837 from wernsaar/develop

updated zgemm- and ztrmm-kernel for POWER8
wernsaar 2016-04-08 11:13:27 +02:00
commit 0a4276bc2f
10 changed files with 3673 additions and 557 deletions

View File

@ -34,7 +34,8 @@ LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread
LIBVECLIB = -framework Accelerate
ESSL=/opt/ibm/lib
LIBESSL = -lessl $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.2/lib/libxl.a
#LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
ifeq ($(OSNAME), WINNT)
@ -259,7 +260,8 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
endif
essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \
cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl
cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \
slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl
veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \
scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \
@ -312,6 +314,9 @@ slinpack.mkl : slinpack.$(SUFFIX)
slinpack.veclib : slinpack.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
slinpack.essl : slinpack.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Dlinpack ####################################################
dlinpack.goto : dlinpack.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
@ -328,6 +333,9 @@ dlinpack.mkl : dlinpack.$(SUFFIX)
dlinpack.veclib : dlinpack.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
dlinpack.essl : dlinpack.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Clinpack ####################################################
clinpack.goto : clinpack.$(SUFFIX) ../$(LIBNAME)
@ -345,6 +353,9 @@ clinpack.mkl : clinpack.$(SUFFIX)
clinpack.veclib : clinpack.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
clinpack.essl : clinpack.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Zlinpack ####################################################
zlinpack.goto : zlinpack.$(SUFFIX) ../$(LIBNAME)
@ -362,6 +373,9 @@ zlinpack.mkl : zlinpack.$(SUFFIX)
zlinpack.veclib : zlinpack.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
zlinpack.essl : zlinpack.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Scholesky ###################################################
scholesky.goto : scholesky.$(SUFFIX) ../$(LIBNAME)

View File

@ -798,7 +798,7 @@ Lmcount$lazy_ptr:
#elif defined(PPC440FP2)
#define BUFFER_SIZE ( 16 << 20)
#elif defined(POWER8)
#define BUFFER_SIZE ( 64 << 20)
#define BUFFER_SIZE ( 32 << 20)
#else
#define BUFFER_SIZE ( 16 << 20)
#endif
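For orientation, the change above halves the POWER8 scratch buffer; a quick check of what the shift expressions evaluate to (treating BUFFER_SIZE as the per-thread GEMM scratch allocation is the usual OpenBLAS convention and is assumed here, not shown in this hunk):

```c
#include <stdio.h>

int main(void) {
    /* POWER8 values from common_power.h: old value first, new value second. */
    long before = 64L << 20;   /* 67108864 bytes = 64 MiB */
    long after  = 32L << 20;   /* 33554432 bytes = 32 MiB */
    printf("BUFFER_SIZE: %ld -> %ld bytes (%ld -> %ld MiB)\n",
           before, after, before >> 20, after >> 20);
    return 0;
}
```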

View File

@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#ifdef __64BIT__
#define STACKSIZE 400
#define STACKSIZE 32000
#define ALPHA_R_SP 296(SP)
#define ALPHA_I_SP 304(SP)
#define FZERO 312(SP)
@ -136,6 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define alpha_sr vs30
#define alpha_si vs31
#define FRAMEPOINTER r12
#define BBUFFER r14
#define L r15
@ -161,6 +162,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
PROFCODE
mr FRAMEPOINTER, SP
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
li r0, 0
@ -233,37 +238,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef linux
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif
#endif
#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#else
#ifdef DOUBLE
lwz B, FRAMESLOT(0) + STACKSIZE(SP)
lwz C, FRAMESLOT(1) + STACKSIZE(SP)
lwz LDC, FRAMESLOT(2) + STACKSIZE(SP)
lwz B, FRAMESLOT(0) + 0(FRAMEPOINTER)
lwz C, FRAMESLOT(1) + 0(FRAMEPOINTER)
lwz LDC, FRAMESLOT(2) + 0(FRAMEPOINTER)
#else
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif
#endif
#endif
#ifdef TRMMKERNEL
#if defined(linux) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#endif
#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#else
#ifdef DOUBLE
lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP)
lwz OFFSET, FRAMESLOT(3) + 0(FRAMEPOINTER)
#else
lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#endif
#endif
#endif
@ -290,9 +295,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
li o32 , 32
li o48 , 48
li T1, 256
slwi T1, T1, 9 // 131072
sub BBUFFER, A, T1 // temp buffer for B unrolled
addi BBUFFER, SP, 512+4096
li T1, -4096
and BBUFFER, BBUFFER, T1
#ifdef __64BIT__
@ -392,6 +397,9 @@ L999:
#endif
addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
blr
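The kernel no longer points BBUFFER 128 KiB below the A pointer for the unrolled copy of B (the removed `sub BBUFFER, A, T1` with T1 = 131072); BBUFFER now lives inside the enlarged stack frame, aligned down to a 4 KiB boundary. A minimal C sketch of that alignment idiom (the function name is illustrative, not part of the kernel):

```c
#include <stdint.h>

/* Mirror of "addi BBUFFER, SP, 512+4096 ; li T1, -4096 ; and BBUFFER, BBUFFER, T1":
 * pick an address more than 512 bytes above the lowered stack pointer and
 * round it down to a 4096-byte boundary. */
static inline void *bbuffer_from_sp(uintptr_t sp)
{
    uintptr_t p = sp + 512 + 4096;  /* skip 512 bytes, overshoot by one page       */
    p &= ~(uintptr_t)4095;          /* clear the low 12 bits, i.e. AND with -4096  */
    return (void *)p;
}
```

The room for this comes from the much larger STACKSIZE together with the four matching `addi SP, SP, -STACKSIZE` / `addi SP, SP, STACKSIZE` pairs added to the prologue and epilogue.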

View File

@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#ifdef __64BIT__
#define STACKSIZE 340
#define STACKSIZE 32752
#define ALPHA_SP 296(SP)
#define FZERO 304(SP)
#else
@ -132,6 +132,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define o0 0
#define FRAMEPOINTER r12
#define BBUFFER r14
#define o4 r15
#define o12 r16
@ -160,6 +162,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
PROFCODE
mr FRAMEPOINTER, SP
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
li r0, 0
@ -231,7 +237,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif
#endif
@ -239,17 +245,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(TRMMKERNEL)
#if defined(linux) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif
#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER)
#else
#ifdef DOUBLE
lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#else
lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
lwz OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif
#endif
#endif
@ -271,9 +277,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
li o32, 32
li o48, 48
li T1, 256
slwi T1, T1, 9 // 131072
sub BBUFFER, A, T1 // temp buffer for B unrolled
addi BBUFFER, SP, 512+4096
li T1, -4096
and BBUFFER, BBUFFER, T1
addi T1, SP, 300
stxsspx f1, o0 , T1
@ -355,6 +361,9 @@ L999:
#endif
addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
blr
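Why every stack-argument load switches from `FRAMESLOT(n) + STACKSIZE(SP)` to `FRAMESLOT(n) + 0(FRAMEPOINTER)`: the entry-time SP is saved in r12 before the prologue drops SP by four times STACKSIZE, so adding a single STACKSIZE back onto the new SP would no longer reach the caller's argument area. A small C illustration of the address arithmetic (the SP value and the FRAMESLOT offset are made-up numbers, used only for the asserts):

```c
#include <assert.h>

int main(void) {
    long entry_sp   = 0x10000000;        /* SP at function entry (hypothetical)      */
    long stacksize  = 32752;             /* new STACKSIZE in this kernel             */
    long frameslot0 = 112;               /* hypothetical FRAMESLOT(0) offset         */

    long framepointer = entry_sp;        /* mr FRAMEPOINTER, SP                      */
    long sp = entry_sp - 4 * stacksize;  /* four "addi SP, SP, -STACKSIZE"           */

    long old_style = sp + stacksize + frameslot0;  /* FRAMESLOT(0) + STACKSIZE(SP)   */
    long new_style = framepointer + frameslot0;    /* FRAMESLOT(0) + 0(FRAMEPOINTER) */

    assert(new_style == entry_sp + frameslot0);    /* still hits the caller's slot      */
    assert(old_style != new_style);                /* the old formula would now miss it */
    return 0;
}
```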

View File

@ -1,38 +1,3 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
@ -82,7 +47,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#ifdef __64BIT__
#define STACKSIZE 320
#define STACKSIZE 32000
#define ALPHA_R_SP 296(SP)
#define ALPHA_I_SP 304(SP)
#define FZERO 312(SP)
@ -133,11 +98,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define alpha_r vs30
#define alpha_i vs31
#define FRAMEPOINTER r12
#define BBUFFER r14
#define L r15
#define ALPHA r16
#define o24 r17
#define T2 r19
#define KK r20
#define BBO r20
#define o8 r21
#define I r22
#define J r23
@ -156,8 +126,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
PROFCODE
addi SP, SP, -STACKSIZE
li r0, 0
mr FRAMEPOINTER, SP
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
li r0, 0
stfd f14, 0(SP)
stfd f15, 8(SP)
@ -200,6 +174,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
#else
stw r31, 144(SP)
stw r30, 148(SP)
@ -226,37 +201,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef linux
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif
#endif
#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#else
#ifdef DOUBLE
lwz B, FRAMESLOT(0) + STACKSIZE(SP)
lwz C, FRAMESLOT(1) + STACKSIZE(SP)
lwz LDC, FRAMESLOT(2) + STACKSIZE(SP)
lwz B, FRAMESLOT(0) + 0(FRAMEPOINTER)
lwz C, FRAMESLOT(1) + 0(FRAMEPOINTER)
lwz LDC, FRAMESLOT(2) + 0(FRAMEPOINTER)
#else
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif
#endif
#endif
#ifdef TRMMKERNEL
#if defined(linux) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#endif
#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#else
#ifdef DOUBLE
lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP)
lwz OFFSET, FRAMESLOT(3) + 0(FRAMEPOINTER)
#else
lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#endif
#endif
#endif
@ -268,34 +243,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "zgemm_macros_8x2_power8.S"
cmpwi cr0, M, 0
ble .L999
ble L999
cmpwi cr0, N, 0
ble .L999
ble L999
cmpwi cr0, K, 0
ble .L999
ble L999
slwi LDC, LDC, ZBASE_SHIFT
li PRE, 256
li PRE, 384
li o8 , 8
li o16 , 16
li o24 , 24
li o32 , 32
li o48 , 48
addi BBUFFER, SP, 512+4096
li T1, -4096
and BBUFFER, BBUFFER, T1
#ifdef __64BIT__
addi ALPHA, SP, 296
#else
addi ALPHA, SP, 224
#endif
lxvdsx alpha_r, 0, ALPHA
lxvdsx alpha_i, o8, ALPHA
lxsdx alpha_r, 0, ALPHA
lxsdx alpha_i, o8, ALPHA
.align 5
.align 4
#include "zgemm_logic_8x2_power8.S"
.L999:
L999:
addi r3, 0, 0
lfd f14, 0(SP)
@ -339,6 +318,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
#else
lwz r31, 144(SP)
lwz r30, 148(SP)
@ -360,6 +340,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
blr
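This file also raises the software-prefetch distance for the A stream from 256 to 384 bytes (`li PRE, 256` -> `li PRE, 384`); the hint itself is issued in the logic file as `dcbt AO, PRE`. A rough C analogue with the new distance, using GCC's `__builtin_prefetch` (illustration only; the kernel itself stays in assembly):

```c
/* Touch the cache block 384 bytes past the current A pointer -- three
 * 128-byte lines on POWER8 -- roughly what "dcbt AO, PRE" does with
 * PRE = 384.  Arguments: read access, high temporal locality. */
static inline void prefetch_a(const void *ao)
{
    __builtin_prefetch((const char *)ao + 384, 0, 3);
}
```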

View File

@ -1,83 +1,111 @@
srawi. J, N, 1
ble .LZGEMM_L2_END
ble ZGEMM_L2_END
ZGEMM_L2_BEGIN:
mr BO, B
mr BBO, BBUFFER
slwi T1, K, 1
ZGEMM_L2_COPYB:
lxvdsx vs4, o0, BO // b0_r
lxvdsx vs5, o8, BO // b0_i
addi BO, BO, 16
stxvd2x vs4, o0, BBO
stxvd2x vs5, o16, BBO
addic. T1, T1, -1
addi BBO, BBO, 32
bge ZGEMM_L2_COPYB
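This is the new B pre-copy: before the compute loops run, every complex element of the current block of B is expanded into BBUFFER as a pre-splatted pair, so the inner kernels can later fetch it with plain `lxvd2x` vector loads instead of per-element `lxvdsx` splats. A C sketch of the layout the copy produces (function name illustrative; the element count is 2*K for this two-column block and K for the one-column copy further down):

```c
/* Expand n complex(double) elements of B into the pre-splatted buffer:
 *   b[i] = (re, im)  ->  bbuf[4*i .. 4*i+3] = { re, re, im, im }
 * mirroring the lxvdsx/stxvd2x pairs of ZGEMM_L2_COPYB / ZGEMM_L1_COPYB. */
static void copy_b_splat(const double *b, double *bbuf, long n)
{
    for (long i = 0; i < n; i++) {
        double re = b[2 * i];
        double im = b[2 * i + 1];
        bbuf[4 * i + 0] = re;
        bbuf[4 * i + 1] = re;
        bbuf[4 * i + 2] = im;
        bbuf[4 * i + 3] = im;
    }
}
```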
.LZGEMM_L2_BEGIN:
mr CO, C
mr AO, A
slwi T1, LDC , 1
add C, C, T1
srawi. I, M, 3
ble .LZGEMM_L2x8_END
ble ZGEMM_L2x8_END
.LZGEMM_L2x8_BEGIN:
ZGEMM_L2x8_BEGIN:
mr BO, B
mr BO, BBUFFER
srawi. L, K, 3
ble .LZGEMM_L2x8_SUB0
ble ZGEMM_L2x8_SUB0
cmpwi cr0, L, 1
ble .LZGEMM_L2x8_SUB4
ble ZGEMM_L2x8_SUB4
.LZGEMM_L2x8_LOOP_START:
ZGEMM_L2x8_LOOP_START:
dcbt AO, PRE
dcbt BO, PRE
LOAD2x8_1
dcbt AO, PRE
KERNEL2x8_I1
dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2
addic. L, L, -2
ble .LZGEMM_L2x8_LOOP_END
ble ZGEMM_L2x8_LOOP_END
.align 5
.LZGEMM_L2x8_LOOP:
ZGEMM_L2x8_LOOP:
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2
addic. L, L, -1
bgt .LZGEMM_L2x8_LOOP
bgt ZGEMM_L2x8_LOOP
.LZGEMM_L2x8_LOOP_END:
ZGEMM_L2x8_LOOP_END:
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2
dcbt AO, PRE
@ -88,9 +116,9 @@
KERNEL2x8_1
KERNEL2x8_E2
b .LZGEMM_L2x8_SUB1
b ZGEMM_L2x8_SUB1
.LZGEMM_L2x8_SUB4:
ZGEMM_L2x8_SUB4:
dcbt AO, PRE
KERNEL2x8_SUBI1
@ -106,53 +134,53 @@
KERNEL2x8_SUB1
KERNEL2x8_SUB1
b .LZGEMM_L2x8_SUB1
b ZGEMM_L2x8_SUB1
.LZGEMM_L2x8_SUB0:
ZGEMM_L2x8_SUB0:
andi. L, K, 7
KERNEL2x8_SUBI1
addic. L, L, -1
ble .LZGEMM_L2x8_SAVE
b .LZGEMM_L2x8_SUB2
ble ZGEMM_L2x8_SAVE
b ZGEMM_L2x8_SUB2
.LZGEMM_L2x8_SUB1:
ZGEMM_L2x8_SUB1:
andi. L, K, 7
ble .LZGEMM_L2x8_SAVE
ble ZGEMM_L2x8_SAVE
.LZGEMM_L2x8_SUB2:
ZGEMM_L2x8_SUB2:
KERNEL2x8_SUB1
addic. L, L, -1
bgt .LZGEMM_L2x8_SUB2
bgt ZGEMM_L2x8_SUB2
.LZGEMM_L2x8_SAVE:
ZGEMM_L2x8_SAVE:
SAVE2x8
addic. I, I, -1
bgt .LZGEMM_L2x8_BEGIN
bgt ZGEMM_L2x8_BEGIN
.LZGEMM_L2x8_END:
ZGEMM_L2x8_END:
.LZGEMM_L2x4_BEGIN:
ZGEMM_L2x4_BEGIN:
andi. T2, M, 7
ble .LZGEMM_L2x1_END
ble ZGEMM_L2x1_END
andi. T1, M, 4
ble .LZGEMM_L2x4_END
mr BO, B
ble ZGEMM_L2x4_END
mr BO, BBUFFER
srawi. L, K, 3
ble .LZGEMM_L2x4_SUB0
ble ZGEMM_L2x4_SUB0
cmpwi cr0, L, 1
ble .LZGEMM_L2x4_SUB4
ble ZGEMM_L2x4_SUB4
.LZGEMM_L2x4_LOOP_START:
ZGEMM_L2x4_LOOP_START:
LOAD2x4_1
KERNEL2x4_I1
@ -166,11 +194,11 @@
KERNEL2x4_2
addic. L, L, -2
ble .LZGEMM_L2x4_LOOP_END
ble ZGEMM_L2x4_LOOP_END
.align 5
.LZGEMM_L2x4_LOOP:
ZGEMM_L2x4_LOOP:
KERNEL2x4_1
KERNEL2x4_2
@ -183,9 +211,9 @@
KERNEL2x4_2
addic. L, L, -1
bgt .LZGEMM_L2x4_LOOP
bgt ZGEMM_L2x4_LOOP
.LZGEMM_L2x4_LOOP_END:
ZGEMM_L2x4_LOOP_END:
KERNEL2x4_1
KERNEL2x4_2
@ -197,9 +225,9 @@
KERNEL2x4_1
KERNEL2x4_E2
b .LZGEMM_L2x4_SUB1
b ZGEMM_L2x4_SUB1
.LZGEMM_L2x4_SUB4:
ZGEMM_L2x4_SUB4:
KERNEL2x4_SUBI1
KERNEL2x4_SUB1
@ -211,48 +239,48 @@
KERNEL2x4_SUB1
KERNEL2x4_SUB1
b .LZGEMM_L2x4_SUB1
b ZGEMM_L2x4_SUB1
.LZGEMM_L2x4_SUB0:
ZGEMM_L2x4_SUB0:
andi. L, K, 7
KERNEL2x4_SUBI1
addic. L, L, -1
ble .LZGEMM_L2x4_SAVE
b .LZGEMM_L2x4_SUB2
ble ZGEMM_L2x4_SAVE
b ZGEMM_L2x4_SUB2
.LZGEMM_L2x4_SUB1:
ZGEMM_L2x4_SUB1:
andi. L, K, 7
ble .LZGEMM_L2x4_SAVE
ble ZGEMM_L2x4_SAVE
.LZGEMM_L2x4_SUB2:
ZGEMM_L2x4_SUB2:
KERNEL2x4_SUB1
addic. L, L, -1
bgt .LZGEMM_L2x4_SUB2
bgt ZGEMM_L2x4_SUB2
.LZGEMM_L2x4_SAVE:
ZGEMM_L2x4_SAVE:
SAVE2x4
.LZGEMM_L2x4_END:
ZGEMM_L2x4_END:
.LZGEMM_L2x2_BEGIN:
ZGEMM_L2x2_BEGIN:
andi. T1, M, 2
ble .LZGEMM_L2x2_END
mr BO, B
ble ZGEMM_L2x2_END
mr BO, BBUFFER
srawi. L, K, 3
ble .LZGEMM_L2x2_SUB0
ble ZGEMM_L2x2_SUB0
cmpwi cr0, L, 1
ble .LZGEMM_L2x2_SUB4
ble ZGEMM_L2x2_SUB4
.LZGEMM_L2x2_LOOP_START:
ZGEMM_L2x2_LOOP_START:
LOAD2x2_1
KERNEL2x2_I1
@ -266,11 +294,11 @@
KERNEL2x2_2
addic. L, L, -2
ble .LZGEMM_L2x2_LOOP_END
ble ZGEMM_L2x2_LOOP_END
.align 5
.LZGEMM_L2x2_LOOP:
ZGEMM_L2x2_LOOP:
KERNEL2x2_1
KERNEL2x2_2
@ -283,9 +311,9 @@
KERNEL2x2_2
addic. L, L, -1
bgt .LZGEMM_L2x2_LOOP
bgt ZGEMM_L2x2_LOOP
.LZGEMM_L2x2_LOOP_END:
ZGEMM_L2x2_LOOP_END:
KERNEL2x2_1
KERNEL2x2_2
@ -297,9 +325,9 @@
KERNEL2x2_1
KERNEL2x2_E2
b .LZGEMM_L2x2_SUB1
b ZGEMM_L2x2_SUB1
.LZGEMM_L2x2_SUB4:
ZGEMM_L2x2_SUB4:
KERNEL2x2_SUBI1
KERNEL2x2_SUB1
@ -311,48 +339,48 @@
KERNEL2x2_SUB1
KERNEL2x2_SUB1
b .LZGEMM_L2x2_SUB1
b ZGEMM_L2x2_SUB1
.LZGEMM_L2x2_SUB0:
ZGEMM_L2x2_SUB0:
andi. L, K, 7
KERNEL2x2_SUBI1
addic. L, L, -1
ble .LZGEMM_L2x2_SAVE
b .LZGEMM_L2x2_SUB2
ble ZGEMM_L2x2_SAVE
b ZGEMM_L2x2_SUB2
.LZGEMM_L2x2_SUB1:
ZGEMM_L2x2_SUB1:
andi. L, K, 7
ble .LZGEMM_L2x2_SAVE
ble ZGEMM_L2x2_SAVE
.LZGEMM_L2x2_SUB2:
ZGEMM_L2x2_SUB2:
KERNEL2x2_SUB1
addic. L, L, -1
bgt .LZGEMM_L2x2_SUB2
bgt ZGEMM_L2x2_SUB2
.LZGEMM_L2x2_SAVE:
ZGEMM_L2x2_SAVE:
SAVE2x2
.LZGEMM_L2x2_END:
ZGEMM_L2x2_END:
.LZGEMM_L2x1_BEGIN:
ZGEMM_L2x1_BEGIN:
andi. T1, M, 1
ble .LZGEMM_L2x1_END
mr BO, B
ble ZGEMM_L2x1_END
mr BO, BBUFFER
srawi. L, K, 3
ble .LZGEMM_L2x1_SUB0
ble ZGEMM_L2x1_SUB0
cmpwi cr0, L, 1
ble .LZGEMM_L2x1_SUB4
ble ZGEMM_L2x1_SUB4
.LZGEMM_L2x1_LOOP_START:
ZGEMM_L2x1_LOOP_START:
LOAD2x1_1
KERNEL2x1_I1
@ -366,11 +394,11 @@
KERNEL2x1_2
addic. L, L, -2
ble .LZGEMM_L2x1_LOOP_END
ble ZGEMM_L2x1_LOOP_END
.align 5
.LZGEMM_L2x1_LOOP:
ZGEMM_L2x1_LOOP:
KERNEL2x1_1
KERNEL2x1_2
@ -383,9 +411,9 @@
KERNEL2x1_2
addic. L, L, -1
bgt .LZGEMM_L2x1_LOOP
bgt ZGEMM_L2x1_LOOP
.LZGEMM_L2x1_LOOP_END:
ZGEMM_L2x1_LOOP_END:
KERNEL2x1_1
KERNEL2x1_2
@ -397,9 +425,9 @@
KERNEL2x1_1
KERNEL2x1_E2
b .LZGEMM_L2x1_SUB1
b ZGEMM_L2x1_SUB1
.LZGEMM_L2x1_SUB4:
ZGEMM_L2x1_SUB4:
KERNEL2x1_SUBI1
KERNEL2x1_SUB1
@ -411,72 +439,89 @@
KERNEL2x1_SUB1
KERNEL2x1_SUB1
b .LZGEMM_L2x1_SUB1
b ZGEMM_L2x1_SUB1
.LZGEMM_L2x1_SUB0:
ZGEMM_L2x1_SUB0:
andi. L, K, 7
KERNEL2x1_SUBI1
addic. L, L, -1
ble .LZGEMM_L2x1_SAVE
b .LZGEMM_L2x1_SUB2
ble ZGEMM_L2x1_SAVE
b ZGEMM_L2x1_SUB2
.LZGEMM_L2x1_SUB1:
ZGEMM_L2x1_SUB1:
andi. L, K, 7
ble .LZGEMM_L2x1_SAVE
ble ZGEMM_L2x1_SAVE
.LZGEMM_L2x1_SUB2:
ZGEMM_L2x1_SUB2:
KERNEL2x1_SUB1
addic. L, L, -1
bgt .LZGEMM_L2x1_SUB2
bgt ZGEMM_L2x1_SUB2
.LZGEMM_L2x1_SAVE:
ZGEMM_L2x1_SAVE:
SAVE2x1
.LZGEMM_L2x1_END:
ZGEMM_L2x1_END:
slwi T1, K, 5
add B, B, T1
addic. J, J, -1
bgt .LZGEMM_L2_BEGIN
bgt ZGEMM_L2_BEGIN
andi. T2, N, 1
ble .L999
ble L999
.LZGEMM_L2_END:
ZGEMM_L2_END:
b .LZGEMM_L1_BEGIN
b ZGEMM_L1_BEGIN
.L999_H1:
L999_H1:
b .L999
b L999
ZGEMM_L1_BEGIN:
mr BO, B
mr BBO, BBUFFER
slwi T1, K, 0
ZGEMM_L1_COPYB:
lxvdsx vs4, o0, BO // b0_r
lxvdsx vs5, o8, BO // b0_i
addi BO, BO, 16
stxvd2x vs4, o0, BBO
stxvd2x vs5, o16, BBO
addic. T1, T1, -1
addi BBO, BBO, 32
bge ZGEMM_L1_COPYB
.LZGEMM_L1_BEGIN:
andi. T1, N, 1
ble .LZGEMM_L1_END
ble ZGEMM_L1_END
mr CO, C
mr AO, A
srawi. I, M, 3
ble .LZGEMM_L1x8_END
ble ZGEMM_L1x8_END
.LZGEMM_L1x8_BEGIN:
ZGEMM_L1x8_BEGIN:
mr BO, B
mr BO, BBUFFER
srawi. L, K, 3
ble .LZGEMM_L1x8_SUB0
ble ZGEMM_L1x8_SUB0
cmpwi cr0, L, 1
ble .LZGEMM_L1x8_SUB4
ble ZGEMM_L1x8_SUB4
.LZGEMM_L1x8_LOOP_START:
ZGEMM_L1x8_LOOP_START:
dcbt AO, PRE
LOAD1x8_1
@ -499,11 +544,11 @@
KERNEL1x8_2
addic. L, L, -2
ble .LZGEMM_L1x8_LOOP_END
ble ZGEMM_L1x8_LOOP_END
.align 5
.LZGEMM_L1x8_LOOP:
ZGEMM_L1x8_LOOP:
dcbt AO, PRE
KERNEL1x8_1
@ -524,9 +569,9 @@
KERNEL1x8_2
addic. L, L, -1
bgt .LZGEMM_L1x8_LOOP
bgt ZGEMM_L1x8_LOOP
.LZGEMM_L1x8_LOOP_END:
ZGEMM_L1x8_LOOP_END:
dcbt AO, PRE
KERNEL1x8_1
@ -545,9 +590,9 @@
KERNEL1x8_1
KERNEL1x8_E2
b .LZGEMM_L1x8_SUB1
b ZGEMM_L1x8_SUB1
.LZGEMM_L1x8_SUB4:
ZGEMM_L1x8_SUB4:
dcbt AO, PRE
KERNEL1x8_SUBI1
@ -563,53 +608,53 @@
KERNEL1x8_SUB1
KERNEL1x8_SUB1
b .LZGEMM_L1x8_SUB1
b ZGEMM_L1x8_SUB1
.LZGEMM_L1x8_SUB0:
ZGEMM_L1x8_SUB0:
andi. L, K, 7
KERNEL1x8_SUBI1
addic. L, L, -1
ble .LZGEMM_L1x8_SAVE
b .LZGEMM_L1x8_SUB2
ble ZGEMM_L1x8_SAVE
b ZGEMM_L1x8_SUB2
.LZGEMM_L1x8_SUB1:
ZGEMM_L1x8_SUB1:
andi. L, K, 7
ble .LZGEMM_L1x8_SAVE
ble ZGEMM_L1x8_SAVE
.LZGEMM_L1x8_SUB2:
ZGEMM_L1x8_SUB2:
KERNEL1x8_SUB1
addic. L, L, -1
bgt .LZGEMM_L1x8_SUB2
bgt ZGEMM_L1x8_SUB2
.LZGEMM_L1x8_SAVE:
ZGEMM_L1x8_SAVE:
SAVE1x8
addic. I, I, -1
bgt .LZGEMM_L1x8_BEGIN
bgt ZGEMM_L1x8_BEGIN
.LZGEMM_L1x8_END:
ZGEMM_L1x8_END:
.LZGEMM_L1x4_BEGIN:
ZGEMM_L1x4_BEGIN:
andi. T2, M, 7
ble .LZGEMM_L1x1_END
ble ZGEMM_L1x1_END
andi. T1, M, 4
ble .LZGEMM_L1x4_END
mr BO, B
ble ZGEMM_L1x4_END
mr BO, BBUFFER
srawi. L, K, 3
ble .LZGEMM_L1x4_SUB0
ble ZGEMM_L1x4_SUB0
cmpwi cr0, L, 1
ble .LZGEMM_L1x4_SUB4
ble ZGEMM_L1x4_SUB4
.LZGEMM_L1x4_LOOP_START:
ZGEMM_L1x4_LOOP_START:
LOAD1x4_1
KERNEL1x4_I1
@ -623,11 +668,11 @@
KERNEL1x4_2
addic. L, L, -2
ble .LZGEMM_L1x4_LOOP_END
ble ZGEMM_L1x4_LOOP_END
.align 5
.LZGEMM_L1x4_LOOP:
ZGEMM_L1x4_LOOP:
KERNEL1x4_1
KERNEL1x4_2
@ -640,9 +685,9 @@
KERNEL1x4_2
addic. L, L, -1
bgt .LZGEMM_L1x4_LOOP
bgt ZGEMM_L1x4_LOOP
.LZGEMM_L1x4_LOOP_END:
ZGEMM_L1x4_LOOP_END:
KERNEL1x4_1
KERNEL1x4_2
@ -654,9 +699,9 @@
KERNEL1x4_1
KERNEL1x4_E2
b .LZGEMM_L1x4_SUB1
b ZGEMM_L1x4_SUB1
.LZGEMM_L1x4_SUB4:
ZGEMM_L1x4_SUB4:
KERNEL1x4_SUBI1
KERNEL1x4_SUB1
@ -668,48 +713,48 @@
KERNEL1x4_SUB1
KERNEL1x4_SUB1
b .LZGEMM_L1x4_SUB1
b ZGEMM_L1x4_SUB1
.LZGEMM_L1x4_SUB0:
ZGEMM_L1x4_SUB0:
andi. L, K, 7
KERNEL1x4_SUBI1
addic. L, L, -1
ble .LZGEMM_L1x4_SAVE
b .LZGEMM_L1x4_SUB2
ble ZGEMM_L1x4_SAVE
b ZGEMM_L1x4_SUB2
.LZGEMM_L1x4_SUB1:
ZGEMM_L1x4_SUB1:
andi. L, K, 7
ble .LZGEMM_L1x4_SAVE
ble ZGEMM_L1x4_SAVE
.LZGEMM_L1x4_SUB2:
ZGEMM_L1x4_SUB2:
KERNEL1x4_SUB1
addic. L, L, -1
bgt .LZGEMM_L1x4_SUB2
bgt ZGEMM_L1x4_SUB2
.LZGEMM_L1x4_SAVE:
ZGEMM_L1x4_SAVE:
SAVE1x4
.LZGEMM_L1x4_END:
ZGEMM_L1x4_END:
.LZGEMM_L1x2_BEGIN:
ZGEMM_L1x2_BEGIN:
andi. T1, M, 2
ble .LZGEMM_L1x2_END
mr BO, B
ble ZGEMM_L1x2_END
mr BO, BBUFFER
srawi. L, K, 3
ble .LZGEMM_L1x2_SUB0
ble ZGEMM_L1x2_SUB0
cmpwi cr0, L, 1
ble .LZGEMM_L1x2_SUB4
ble ZGEMM_L1x2_SUB4
.LZGEMM_L1x2_LOOP_START:
ZGEMM_L1x2_LOOP_START:
LOAD1x2_1
KERNEL1x2_I1
@ -723,11 +768,11 @@
KERNEL1x2_2
addic. L, L, -2
ble .LZGEMM_L1x2_LOOP_END
ble ZGEMM_L1x2_LOOP_END
.align 5
.LZGEMM_L1x2_LOOP:
ZGEMM_L1x2_LOOP:
KERNEL1x2_1
KERNEL1x2_2
@ -740,9 +785,9 @@
KERNEL1x2_2
addic. L, L, -1
bgt .LZGEMM_L1x2_LOOP
bgt ZGEMM_L1x2_LOOP
.LZGEMM_L1x2_LOOP_END:
ZGEMM_L1x2_LOOP_END:
KERNEL1x2_1
KERNEL1x2_2
@ -754,9 +799,9 @@
KERNEL1x2_1
KERNEL1x2_E2
b .LZGEMM_L1x2_SUB1
b ZGEMM_L1x2_SUB1
.LZGEMM_L1x2_SUB4:
ZGEMM_L1x2_SUB4:
KERNEL1x2_SUBI1
KERNEL1x2_SUB1
@ -768,48 +813,48 @@
KERNEL1x2_SUB1
KERNEL1x2_SUB1
b .LZGEMM_L1x2_SUB1
b ZGEMM_L1x2_SUB1
.LZGEMM_L1x2_SUB0:
ZGEMM_L1x2_SUB0:
andi. L, K, 7
KERNEL1x2_SUBI1
addic. L, L, -1
ble .LZGEMM_L1x2_SAVE
b .LZGEMM_L1x2_SUB2
ble ZGEMM_L1x2_SAVE
b ZGEMM_L1x2_SUB2
.LZGEMM_L1x2_SUB1:
ZGEMM_L1x2_SUB1:
andi. L, K, 7
ble .LZGEMM_L1x2_SAVE
ble ZGEMM_L1x2_SAVE
.LZGEMM_L1x2_SUB2:
ZGEMM_L1x2_SUB2:
KERNEL1x2_SUB1
addic. L, L, -1
bgt .LZGEMM_L1x2_SUB2
bgt ZGEMM_L1x2_SUB2
.LZGEMM_L1x2_SAVE:
ZGEMM_L1x2_SAVE:
SAVE1x2
.LZGEMM_L1x2_END:
ZGEMM_L1x2_END:
.LZGEMM_L1x1_BEGIN:
ZGEMM_L1x1_BEGIN:
andi. T1, M, 1
ble .LZGEMM_L1x1_END
mr BO, B
ble ZGEMM_L1x1_END
mr BO, BBUFFER
srawi. L, K, 3
ble .LZGEMM_L1x1_SUB0
ble ZGEMM_L1x1_SUB0
cmpwi cr0, L, 1
ble .LZGEMM_L1x1_SUB4
ble ZGEMM_L1x1_SUB4
.LZGEMM_L1x1_LOOP_START:
ZGEMM_L1x1_LOOP_START:
LOAD1x1_1
KERNEL1x1_I1
@ -823,11 +868,11 @@
KERNEL1x1_2
addic. L, L, -2
ble .LZGEMM_L1x1_LOOP_END
ble ZGEMM_L1x1_LOOP_END
.align 5
.LZGEMM_L1x1_LOOP:
ZGEMM_L1x1_LOOP:
KERNEL1x1_1
KERNEL1x1_2
@ -840,9 +885,9 @@
KERNEL1x1_2
addic. L, L, -1
bgt .LZGEMM_L1x1_LOOP
bgt ZGEMM_L1x1_LOOP
.LZGEMM_L1x1_LOOP_END:
ZGEMM_L1x1_LOOP_END:
KERNEL1x1_1
KERNEL1x1_2
@ -854,9 +899,9 @@
KERNEL1x1_1
KERNEL1x1_E2
b .LZGEMM_L1x1_SUB1
b ZGEMM_L1x1_SUB1
.LZGEMM_L1x1_SUB4:
ZGEMM_L1x1_SUB4:
KERNEL1x1_SUBI1
KERNEL1x1_SUB1
@ -868,34 +913,34 @@
KERNEL1x1_SUB1
KERNEL1x1_SUB1
b .LZGEMM_L1x1_SUB1
b ZGEMM_L1x1_SUB1
.LZGEMM_L1x1_SUB0:
ZGEMM_L1x1_SUB0:
andi. L, K, 7
KERNEL1x1_SUBI1
addic. L, L, -1
ble .LZGEMM_L1x1_SAVE
b .LZGEMM_L1x1_SUB2
ble ZGEMM_L1x1_SAVE
b ZGEMM_L1x1_SUB2
.LZGEMM_L1x1_SUB1:
ZGEMM_L1x1_SUB1:
andi. L, K, 7
ble .LZGEMM_L1x1_SAVE
ble ZGEMM_L1x1_SAVE
.LZGEMM_L1x1_SUB2:
ZGEMM_L1x1_SUB2:
KERNEL1x1_SUB1
addic. L, L, -1
bgt .LZGEMM_L1x1_SUB2
bgt ZGEMM_L1x1_SUB2
.LZGEMM_L1x1_SAVE:
ZGEMM_L1x1_SAVE:
SAVE1x1
.LZGEMM_L1x1_END:
ZGEMM_L1x1_END:
.LZGEMM_L1_END:
ZGEMM_L1_END:

View File

@ -1,39 +1,3 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define XSFADD_R1 xsadddp
@ -70,12 +34,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD2x8_1
lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32
addi BO, BO, 64
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
@ -110,12 +74,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
lxvdsx vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B
lxvdsx vs22, o16, BO // load real part from B
lxvdsx vs23, o24, BO // load imag part from B
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
lxvd2x vs22, o32, BO // load real part from B
lxvd2x vs23, o48, BO // load imag part from B
addi BO, BO, 32
addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
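The comment pairs above carry the whole complex-arithmetic scheme of these macros: each A register holds one (real, imag) element, the two B registers hold the splatted real and imaginary parts of a b element, and every product goes into two running accumulators that are only combined, with the sign pattern selected by the NN/NT/.../conjugation macros such as XSFADD_R1, in the SAVE step. A scalar C sketch of that split for the plain NN case, alpha handling omitted:

```c
/* One k-step, the way the kernel accumulates it.
 * a = (a_re, a_im); b_re/b_im are the splatted parts of b.
 * acc0 collects "real*real, imag*real"  (the vs32-style register),
 * acc1 collects "real*imag, imag*imag"  (the vs33-style register). */
static void zfma_step(const double a[2], double b_re, double b_im,
                      double acc0[2], double acc1[2])
{
    acc0[0] += a[0] * b_re;   /* real*real */
    acc0[1] += a[1] * b_re;   /* imag*real */
    acc1[0] += a[0] * b_im;   /* real*imag */
    acc1[1] += a[1] * b_im;   /* imag*imag */
}

/* Combine step, NN case (no conjugation):
 *   c_re += rr - ii,   c_im += ir + ri
 * The conjugated cases flip the sign of individual terms instead. */
static void zfma_save_nn(const double acc0[2], const double acc1[2], double c[2])
{
    c[0] += acc0[0] - acc1[1];
    c[1] += acc0[1] + acc1[0];
}
```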
@ -156,36 +120,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL2x8_1
lxvd2x vs8, o0, AO // load real,imag from A
lxvd2x vs9, o16, AO // load real,imag from A
lxvd2x vs10, o32, AO // load real,imag from A
lxvd2x vs11, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs12, o0, AO // load real,imag from A
lxvd2x vs13, o16, AO // load real,imag from A
lxvd2x vs14, o32, AO // load real,imag from A
lxvd2x vs15, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
lxvd2x vs22, o32, BO // load real part from B
lxvd2x vs23, o48, BO // load imag part from B
addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
xvmaddadp vs34, vs1, vs16 // real*real, imag*real
xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
lxvdsx vs22, o16, BO // load real part from B
lxvdsx vs23, o24, BO // load imag part from B
xvmaddadp vs36, vs2, vs16 // real*real, imag*real
xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
xvmaddadp vs38, vs3, vs16 // real*real, imag*real
xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
lxvd2x vs8, o0, AO // load real,imag from A
lxvd2x vs9, o16, AO // load real,imag from A
xvmaddadp vs40, vs4, vs16 // real*real, imag*real
xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
xvmaddadp vs42, vs5, vs16 // real*real, imag*real
xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
lxvd2x vs10, o32, AO // load real,imag from A
lxvd2x vs11, o48, AO // load real,imag from A
xvmaddadp vs44, vs6, vs16 // real*real, imag*real
xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
addi AO, AO, 64
xvmaddadp vs46, vs7, vs16 // real*real, imag*real
xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
@ -193,101 +162,79 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag
xvmaddadp vs50, vs1, vs18 // real*real, imag*real
xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag
lxvd2x vs12, o0, AO // load real,imag from A
lxvd2x vs13, o16, AO // load real,imag from A
xvmaddadp vs52, vs2, vs18 // real*real, imag*real
xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag
xvmaddadp vs54, vs3, vs18 // real*real, imag*real
xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag
lxvd2x vs14, o32, AO // load real,imag from A
lxvd2x vs15, o48, AO // load real,imag from A
xvmaddadp vs56, vs4, vs18 // real*real, imag*real
xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag
xvmaddadp vs58, vs5, vs18 // real*real, imag*real
xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag
lxvdsx vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B
xvmaddadp vs60, vs6, vs18 // real*real, imag*real
xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag
xvmaddadp vs62, vs7, vs18 // real*real, imag*real
xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag
addi AO, AO, 64
addi BO, BO, 32
.endm
.macro KERNEL2x8_2
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
lxvd2x vs2, o32, AO // load real,imag from A
lxvd2x vs3, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs4, o0, AO // load real,imag from A
lxvd2x vs5, o16, AO // load real,imag from A
lxvd2x vs6, o32, AO // load real,imag from A
lxvd2x vs7, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 64
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
xvmaddadp vs34, vs9, vs20 // real*real, imag*real
xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
xvmaddadp vs36, vs10, vs20 // real*real, imag*real
xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
xvmaddadp vs38, vs11, vs20 // real*real, imag*real
xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
xvmaddadp vs40, vs12, vs20 // real*real, imag*real
xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
xvmaddadp vs42, vs13, vs20 // real*real, imag*real
xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
lxvd2x vs2, o32, AO // load real,imag from A
lxvd2x vs3, o48, AO // load real,imag from A
xvmaddadp vs44, vs14, vs20 // real*real, imag*real
xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
xvmaddadp vs46, vs15, vs20 // real*real, imag*real
xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
addi AO, AO, 64
xvmaddadp vs48, vs8, vs22 // real*real, imag*real
xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag
xvmaddadp vs50, vs9, vs22 // real*real, imag*real
xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag
lxvd2x vs4, o0, AO // load real,imag from A
lxvd2x vs5, o16, AO // load real,imag from A
xvmaddadp vs52, vs10, vs22 // real*real, imag*real
xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag
xvmaddadp vs54, vs11, vs22 // real*real, imag*real
xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag
lxvd2x vs6, o32, AO // load real,imag from A
lxvd2x vs7, o48, AO // load real,imag from A
xvmaddadp vs56, vs12, vs22 // real*real, imag*real
xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag
xvmaddadp vs58, vs13, vs22 // real*real, imag*real
xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag
lxvdsx vs18, o16, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B
xvmaddadp vs60, vs14, vs22 // real*real, imag*real
xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag
xvmaddadp vs62, vs15, vs22 // real*real, imag*real
xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag
addi AO, AO, 64
addi BO, BO, 32
.endm
@ -347,12 +294,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32
addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -407,12 +354,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32
addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -927,12 +874,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD2x4_1
lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32
addi BO, BO, 64
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
@ -953,12 +900,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
lxvdsx vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B
lxvdsx vs22, o16, BO // load real part from B
lxvdsx vs23, o24, BO // load imag part from B
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
lxvd2x vs22, o32, BO // load real part from B
lxvd2x vs23, o48, BO // load imag part from B
addi BO, BO, 32
addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -990,12 +937,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
lxvdsx vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B
lxvdsx vs22, o16, BO // load real part from B
lxvdsx vs23, o24, BO // load imag part from B
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
lxvd2x vs22, o32, BO // load real part from B
lxvd2x vs23, o48, BO // load imag part from B
addi BO, BO, 32
addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -1027,12 +974,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32
addi BO, BO, 64
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
@ -1088,12 +1035,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32
addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -1125,12 +1072,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32
addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -1410,12 +1357,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD2x2_1
lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32
addi BO, BO, 64
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
@ -1432,12 +1379,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 32
lxvdsx vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B
lxvdsx vs22, o16, BO // load real part from B
lxvdsx vs23, o24, BO // load imag part from B
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
lxvd2x vs22, o32, BO // load real part from B
lxvd2x vs23, o48, BO // load imag part from B
addi BO, BO, 32
addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -1459,12 +1406,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 32
lxvdsx vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B
lxvdsx vs22, o16, BO // load real part from B
lxvdsx vs23, o24, BO // load imag part from B
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
lxvd2x vs22, o32, BO // load real part from B
lxvd2x vs23, o48, BO // load imag part from B
addi BO, BO, 32
addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -1486,12 +1433,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 32
lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32
addi BO, BO, 64
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
@ -1529,12 +1476,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 32
lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32
addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -1556,12 +1503,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 32
lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32
addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -1725,12 +1672,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD2x1_1
lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32
addi BO, BO, 64
lxvd2x vs0, o0, AO // load real,imag from A
@ -1745,12 +1692,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 16
lxvdsx vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B
lxvdsx vs22, o16, BO // load real part from B
lxvdsx vs23, o24, BO // load imag part from B
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
lxvd2x vs22, o32, BO // load real part from B
lxvd2x vs23, o48, BO // load imag part from B
addi BO, BO, 32
addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -1767,12 +1714,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 16
lxvdsx vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B
lxvdsx vs22, o16, BO // load real part from B
lxvdsx vs23, o24, BO // load imag part from B
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
lxvd2x vs22, o32, BO // load real part from B
lxvd2x vs23, o48, BO // load imag part from B
addi BO, BO, 32
addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -1789,12 +1736,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 16
lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32
addi BO, BO, 64
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
@ -1823,12 +1770,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 16
lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32
addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -1845,12 +1792,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 16
lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32
addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -1956,10 +1903,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD1x8_1
lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 16
addi BO, BO, 32
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
@ -1994,10 +1941,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
lxvdsx vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
addi BO, BO, 16
addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -2035,10 +1982,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
lxvdsx vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
addi BO, BO, 16
addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -2076,10 +2023,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 16
addi BO, BO, 32
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
@ -2140,10 +2087,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 16
addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -2181,10 +2128,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 16
addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -2452,10 +2399,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD1x4_1
lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 16
addi BO, BO, 32
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
@ -2476,10 +2423,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
lxvdsx vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
addi BO, BO, 16
addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -2502,10 +2449,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
lxvdsx vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
addi BO, BO, 16
addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -2528,10 +2475,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 16
addi BO, BO, 32
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
@ -2569,10 +2516,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 16
addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -2595,10 +2542,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64
lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 16
addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -2748,10 +2695,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD1x2_1
lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 16
addi BO, BO, 32
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
@ -2768,10 +2715,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 32
lxvdsx vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
addi BO, BO, 16
addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -2788,10 +2735,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 32
lxvdsx vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
addi BO, BO, 16
addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -2808,10 +2755,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 32
lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 16
addi BO, BO, 32
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
@ -2839,10 +2786,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 32
lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 16
addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -2859,10 +2806,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 32
lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 16
addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -2954,10 +2901,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD1x1_1
lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 16
addi BO, BO, 32
lxvd2x vs0, o0, AO // load real,imag from A
@ -2972,10 +2919,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 16
lxvdsx vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
addi BO, BO, 16
addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -2989,10 +2936,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 16
lxvdsx vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
addi BO, BO, 16
addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -3006,10 +2953,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 16
lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 16
addi BO, BO, 32
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
@ -3032,10 +2979,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 16
lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 16
addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -3049,10 +2996,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 16
lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 16
addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag

View File

@ -271,7 +271,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#endif
#include "zgemm_macros_8x2_power8.S"
#include "ztrmm_macros_8x2_power8.S"
cmpwi cr0, M, 0
ble .L999

File diff suppressed because it is too large

param.h (14 changed lines)
View File

@ -1964,8 +1964,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SNUMOPT 16
#define DNUMOPT 8
#define GEMM_DEFAULT_OFFSET_A 131072
#define GEMM_DEFAULT_OFFSET_B 1024
#define GEMM_DEFAULT_OFFSET_A 4096
#define GEMM_DEFAULT_OFFSET_B 4096
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 16
@ -1980,17 +1980,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SGEMM_DEFAULT_P 960
#define DGEMM_DEFAULT_P 480
#define CGEMM_DEFAULT_P 720
#define ZGEMM_DEFAULT_P 240
#define ZGEMM_DEFAULT_P 480
#define SGEMM_DEFAULT_Q 720
#define DGEMM_DEFAULT_Q 720
#define CGEMM_DEFAULT_Q 720
#define ZGEMM_DEFAULT_Q 360
#define ZGEMM_DEFAULT_Q 720
#define SGEMM_DEFAULT_R 14400
#define SGEMM_DEFAULT_R 21600
#define DGEMM_DEFAULT_R 14400
#define CGEMM_DEFAULT_R 14400
#define ZGEMM_DEFAULT_R 7200
#define CGEMM_DEFAULT_R 16200
#define ZGEMM_DEFAULT_R 21600
#define SYMV_P 8
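Putting the new blocking numbers next to the buffer change earlier in this commit: assuming the usual convention that the packed block covers roughly P*Q elements of 16 bytes each (a back-of-the-envelope assumption, not something stated in this diff), the ZGEMM working set grows about fourfold and still fits easily in the 32 MiB POWER8 BUFFER_SIZE:

```c
#include <stdio.h>

int main(void) {
    /* Old and new ZGEMM blocking from param.h; 16 bytes per double-complex element. */
    long old_panel = 240L * 360 * 16;   /* P=240, Q=360  -> ~1.3 MiB */
    long new_panel = 480L * 720 * 16;   /* P=480, Q=720  -> ~5.3 MiB */
    long buffer    = 32L << 20;         /* POWER8 BUFFER_SIZE from common_power.h */

    printf("packed ZGEMM block: %ld -> %ld bytes\n", old_panel, new_panel);
    printf("BUFFER_SIZE: %ld bytes, headroom: %ld bytes\n", buffer, buffer - new_panel);
    return 0;
}
```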