Merge pull request #837 from wernsaar/develop

Updated the zgemm and ztrmm kernels for POWER8
wernsaar 2016-04-08 11:13:27 +02:00
commit 0a4276bc2f
10 changed files with 3673 additions and 557 deletions
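Judging from the diffs below, the POWER8 zgemm/ztrmm kernels now reserve a much larger stack frame (STACKSIZE raised, with four "addi SP, SP, -STACKSIZE" adjustments on entry and exit), carve a 4 KB-aligned scratch buffer BBUFFER out of that frame, and pre-broadcast the B panel into it once per column block (the new ZGEMM_L2_COPYB / ZGEMM_L1_COPYB loops), so the inner loops can read B back with plain lxvd2x loads instead of lxvdsx splats. The patch itself is POWER8 VSX assembly; what follows is a minimal C sketch of that setup only, with illustrative names (align_bbuffer and copy_b_broadcast are not part of OpenBLAS).

/*
 * Hedged C sketch of the new setup code, not part of the patch.
 *
 *  - BBUFFER is placed past a small guard area in the enlarged stack
 *    frame and rounded down to a 4 KB boundary, mirroring
 *    "addi BBUFFER, SP, 512+4096" followed by an AND with -4096.
 *  - Each real/imag part of B is broadcast into a 16-byte pair,
 *    mirroring the lxvdsx + stxvd2x pairs in ZGEMM_L2_COPYB.
 */
#include <stdint.h>
#include <stdio.h>

/* Skip a 512-byte guard, then round down to a 4 KB boundary. */
static double *align_bbuffer(void *stack_area)
{
    uintptr_t p = (uintptr_t)stack_area + 512 + 4096;
    return (double *)(p & ~(uintptr_t)4095);
}

/* Expand n complex doubles of B into broadcast pairs {re,re},{im,im}. */
static void copy_b_broadcast(const double *b, double *bbuffer, int n)
{
    for (int i = 0; i < n; i++) {
        double re = b[2 * i];      /* lxvdsx vs4 <- b0_r */
        double im = b[2 * i + 1];  /* lxvdsx vs5 <- b0_i */
        bbuffer[4 * i + 0] = re;   /* stxvd2x vs4 -> { b_r, b_r } */
        bbuffer[4 * i + 1] = re;
        bbuffer[4 * i + 2] = im;   /* stxvd2x vs5 -> { b_i, b_i } */
        bbuffer[4 * i + 3] = im;
    }
}

int main(void)
{
    static double stack_area[8192];             /* stands in for the frame */
    double *bbuffer = align_bbuffer(stack_area);
    const double b[4] = { 1.0, 2.0, 3.0, 4.0 }; /* two complex elements */

    copy_b_broadcast(b, bbuffer, 2);
    printf("%g %g %g %g\n", bbuffer[0], bbuffer[1], bbuffer[2], bbuffer[3]);
    return 0;
}

Broadcasting B once into a contiguous, page-aligned buffer trades a one-off copy per column block for simpler streaming loads inside the FMA-heavy inner loops, which is also why the macros in the last file switch from lxvdsx to lxvd2x and double their BO strides.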


@ -34,7 +34,8 @@ LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread
LIBVECLIB = -framework Accelerate LIBVECLIB = -framework Accelerate
ESSL=/opt/ibm/lib ESSL=/opt/ibm/lib
LIBESSL = -lessl $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.2/lib/libxl.a #LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
ifeq ($(OSNAME), WINNT) ifeq ($(OSNAME), WINNT)
@ -259,7 +260,8 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
endif endif
essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \ essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \
cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \
slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl
veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \
scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \ scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \
@ -312,6 +314,9 @@ slinpack.mkl : slinpack.$(SUFFIX)
slinpack.veclib : slinpack.$(SUFFIX) slinpack.veclib : slinpack.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
slinpack.essl : slinpack.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Dlinpack #################################################### ##################################### Dlinpack ####################################################
dlinpack.goto : dlinpack.$(SUFFIX) ../$(LIBNAME) dlinpack.goto : dlinpack.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
@ -328,6 +333,9 @@ dlinpack.mkl : dlinpack.$(SUFFIX)
dlinpack.veclib : dlinpack.$(SUFFIX) dlinpack.veclib : dlinpack.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
dlinpack.essl : dlinpack.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Clinpack #################################################### ##################################### Clinpack ####################################################
clinpack.goto : clinpack.$(SUFFIX) ../$(LIBNAME) clinpack.goto : clinpack.$(SUFFIX) ../$(LIBNAME)
@ -345,6 +353,9 @@ clinpack.mkl : clinpack.$(SUFFIX)
clinpack.veclib : clinpack.$(SUFFIX) clinpack.veclib : clinpack.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
clinpack.essl : clinpack.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Zlinpack #################################################### ##################################### Zlinpack ####################################################
zlinpack.goto : zlinpack.$(SUFFIX) ../$(LIBNAME) zlinpack.goto : zlinpack.$(SUFFIX) ../$(LIBNAME)
@ -362,6 +373,9 @@ zlinpack.mkl : zlinpack.$(SUFFIX)
zlinpack.veclib : zlinpack.$(SUFFIX) zlinpack.veclib : zlinpack.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
zlinpack.essl : zlinpack.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Scholesky ################################################### ##################################### Scholesky ###################################################
scholesky.goto : scholesky.$(SUFFIX) ../$(LIBNAME) scholesky.goto : scholesky.$(SUFFIX) ../$(LIBNAME)


@ -798,7 +798,7 @@ Lmcount$lazy_ptr:
#elif defined(PPC440FP2) #elif defined(PPC440FP2)
#define BUFFER_SIZE ( 16 << 20) #define BUFFER_SIZE ( 16 << 20)
#elif defined(POWER8) #elif defined(POWER8)
#define BUFFER_SIZE ( 64 << 20) #define BUFFER_SIZE ( 32 << 20)
#else #else
#define BUFFER_SIZE ( 16 << 20) #define BUFFER_SIZE ( 16 << 20)
#endif #endif


@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#ifdef __64BIT__ #ifdef __64BIT__
#define STACKSIZE 400 #define STACKSIZE 32000
#define ALPHA_R_SP 296(SP) #define ALPHA_R_SP 296(SP)
#define ALPHA_I_SP 304(SP) #define ALPHA_I_SP 304(SP)
#define FZERO 312(SP) #define FZERO 312(SP)
@ -136,6 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define alpha_sr vs30 #define alpha_sr vs30
#define alpha_si vs31 #define alpha_si vs31
#define FRAMEPOINTER r12
#define BBUFFER r14 #define BBUFFER r14
#define L r15 #define L r15
@ -161,6 +162,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE
PROFCODE PROFCODE
mr FRAMEPOINTER, SP
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE addi SP, SP, -STACKSIZE
li r0, 0 li r0, 0
@ -233,37 +238,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef linux #ifdef linux
#ifdef __64BIT__ #ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP) ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif #endif
#endif #endif
#if defined(_AIX) || defined(__APPLE__) #if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__ #ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP) ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#else #else
#ifdef DOUBLE #ifdef DOUBLE
lwz B, FRAMESLOT(0) + STACKSIZE(SP) lwz B, FRAMESLOT(0) + 0(FRAMEPOINTER)
lwz C, FRAMESLOT(1) + STACKSIZE(SP) lwz C, FRAMESLOT(1) + 0(FRAMEPOINTER)
lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) lwz LDC, FRAMESLOT(2) + 0(FRAMEPOINTER)
#else #else
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif #endif
#endif #endif
#endif #endif
#ifdef TRMMKERNEL #ifdef TRMMKERNEL
#if defined(linux) && defined(__64BIT__) #if defined(linux) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#endif #endif
#if defined(_AIX) || defined(__APPLE__) #if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__ #ifdef __64BIT__
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#else #else
#ifdef DOUBLE #ifdef DOUBLE
lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) lwz OFFSET, FRAMESLOT(3) + 0(FRAMEPOINTER)
#else #else
lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#endif #endif
#endif #endif
#endif #endif
@ -290,9 +295,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
li o32 , 32 li o32 , 32
li o48 , 48 li o48 , 48
li T1, 256 addi BBUFFER, SP, 512+4096
slwi T1, T1, 9 // 131072 li T1, -4096
sub BBUFFER, A, T1 // temp buffer for B unrolled and BBUFFER, BBUFFER, T1
#ifdef __64BIT__ #ifdef __64BIT__
@ -392,6 +397,9 @@ L999:
#endif #endif
addi SP, SP, STACKSIZE addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
blr blr


@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#ifdef __64BIT__ #ifdef __64BIT__
#define STACKSIZE 340 #define STACKSIZE 32752
#define ALPHA_SP 296(SP) #define ALPHA_SP 296(SP)
#define FZERO 304(SP) #define FZERO 304(SP)
#else #else
@ -132,6 +132,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define o0 0 #define o0 0
#define FRAMEPOINTER r12
#define BBUFFER r14 #define BBUFFER r14
#define o4 r15 #define o4 r15
#define o12 r16 #define o12 r16
@ -160,6 +162,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE
PROFCODE PROFCODE
mr FRAMEPOINTER, SP
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE addi SP, SP, -STACKSIZE
li r0, 0 li r0, 0
@ -231,7 +237,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(_AIX) || defined(__APPLE__) #if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE) #if !defined(__64BIT__) && defined(DOUBLE)
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif #endif
#endif #endif
@ -239,17 +245,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(TRMMKERNEL) #if defined(TRMMKERNEL)
#if defined(linux) && defined(__64BIT__) #if defined(linux) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif #endif
#if defined(_AIX) || defined(__APPLE__) #if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__ #ifdef __64BIT__
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER)
#else #else
#ifdef DOUBLE #ifdef DOUBLE
lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#else #else
lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) lwz OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif #endif
#endif #endif
#endif #endif
@ -271,9 +277,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
li o32, 32 li o32, 32
li o48, 48 li o48, 48
li T1, 256 addi BBUFFER, SP, 512+4096
slwi T1, T1, 9 // 131072 li T1, -4096
sub BBUFFER, A, T1 // temp buffer for B unrolled and BBUFFER, BBUFFER, T1
addi T1, SP, 300 addi T1, SP, 300
stxsspx f1, o0 , T1 stxsspx f1, o0 , T1
@ -355,6 +361,9 @@ L999:
#endif #endif
addi SP, SP, STACKSIZE addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
blr blr


@ -1,38 +1,3 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */ /* All rights reserved. */
@ -82,7 +47,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#ifdef __64BIT__ #ifdef __64BIT__
#define STACKSIZE 320 #define STACKSIZE 32000
#define ALPHA_R_SP 296(SP) #define ALPHA_R_SP 296(SP)
#define ALPHA_I_SP 304(SP) #define ALPHA_I_SP 304(SP)
#define FZERO 312(SP) #define FZERO 312(SP)
@ -133,11 +98,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define alpha_r vs30 #define alpha_r vs30
#define alpha_i vs31 #define alpha_i vs31
#define FRAMEPOINTER r12
#define BBUFFER r14
#define L r15 #define L r15
#define ALPHA r16 #define ALPHA r16
#define o24 r17 #define o24 r17
#define T2 r19 #define T2 r19
#define KK r20 #define BBO r20
#define o8 r21 #define o8 r21
#define I r22 #define I r22
#define J r23 #define J r23
@ -156,8 +126,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE
PROFCODE PROFCODE
addi SP, SP, -STACKSIZE mr FRAMEPOINTER, SP
li r0, 0 addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
li r0, 0
stfd f14, 0(SP) stfd f14, 0(SP)
stfd f15, 8(SP) stfd f15, 8(SP)
@ -200,6 +174,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
std r17, 256(SP) std r17, 256(SP)
std r16, 264(SP) std r16, 264(SP)
std r15, 272(SP) std r15, 272(SP)
std r14, 280(SP)
#else #else
stw r31, 144(SP) stw r31, 144(SP)
stw r30, 148(SP) stw r30, 148(SP)
@ -226,37 +201,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef linux #ifdef linux
#ifdef __64BIT__ #ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP) ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif #endif
#endif #endif
#if defined(_AIX) || defined(__APPLE__) #if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__ #ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP) ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#else #else
#ifdef DOUBLE #ifdef DOUBLE
lwz B, FRAMESLOT(0) + STACKSIZE(SP) lwz B, FRAMESLOT(0) + 0(FRAMEPOINTER)
lwz C, FRAMESLOT(1) + STACKSIZE(SP) lwz C, FRAMESLOT(1) + 0(FRAMEPOINTER)
lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) lwz LDC, FRAMESLOT(2) + 0(FRAMEPOINTER)
#else #else
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif #endif
#endif #endif
#endif #endif
#ifdef TRMMKERNEL #ifdef TRMMKERNEL
#if defined(linux) && defined(__64BIT__) #if defined(linux) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#endif #endif
#if defined(_AIX) || defined(__APPLE__) #if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__ #ifdef __64BIT__
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#else #else
#ifdef DOUBLE #ifdef DOUBLE
lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) lwz OFFSET, FRAMESLOT(3) + 0(FRAMEPOINTER)
#else #else
lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#endif #endif
#endif #endif
#endif #endif
@ -268,34 +243,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "zgemm_macros_8x2_power8.S" #include "zgemm_macros_8x2_power8.S"
cmpwi cr0, M, 0 cmpwi cr0, M, 0
ble .L999 ble L999
cmpwi cr0, N, 0 cmpwi cr0, N, 0
ble .L999 ble L999
cmpwi cr0, K, 0 cmpwi cr0, K, 0
ble .L999 ble L999
slwi LDC, LDC, ZBASE_SHIFT slwi LDC, LDC, ZBASE_SHIFT
li PRE, 256 li PRE, 384
li o8 , 8 li o8 , 8
li o16 , 16 li o16 , 16
li o24 , 24 li o24 , 24
li o32 , 32 li o32 , 32
li o48 , 48 li o48 , 48
addi BBUFFER, SP, 512+4096
li T1, -4096
and BBUFFER, BBUFFER, T1
#ifdef __64BIT__ #ifdef __64BIT__
addi ALPHA, SP, 296 addi ALPHA, SP, 296
#else #else
addi ALPHA, SP, 224 addi ALPHA, SP, 224
#endif #endif
lxvdsx alpha_r, 0, ALPHA lxsdx alpha_r, 0, ALPHA
lxvdsx alpha_i, o8, ALPHA lxsdx alpha_i, o8, ALPHA
.align 5 .align 4
#include "zgemm_logic_8x2_power8.S" #include "zgemm_logic_8x2_power8.S"
.L999: L999:
addi r3, 0, 0 addi r3, 0, 0
lfd f14, 0(SP) lfd f14, 0(SP)
@ -339,6 +318,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld r17, 256(SP) ld r17, 256(SP)
ld r16, 264(SP) ld r16, 264(SP)
ld r15, 272(SP) ld r15, 272(SP)
ld r14, 280(SP)
#else #else
lwz r31, 144(SP) lwz r31, 144(SP)
lwz r30, 148(SP) lwz r30, 148(SP)
@ -360,6 +340,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
addi SP, SP, STACKSIZE addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
blr blr


@ -1,83 +1,111 @@
srawi. J, N, 1 srawi. J, N, 1
ble .LZGEMM_L2_END ble ZGEMM_L2_END
ZGEMM_L2_BEGIN:
mr BO, B
mr BBO, BBUFFER
slwi T1, K, 1
ZGEMM_L2_COPYB:
lxvdsx vs4, o0, BO // b0_r
lxvdsx vs5, o8, BO // b0_i
addi BO, BO, 16
stxvd2x vs4, o0, BBO
stxvd2x vs5, o16, BBO
addic. T1, T1, -1
addi BBO, BBO, 32
bge ZGEMM_L2_COPYB
.LZGEMM_L2_BEGIN:
mr CO, C mr CO, C
mr AO, A mr AO, A
slwi T1, LDC , 1 slwi T1, LDC , 1
add C, C, T1 add C, C, T1
srawi. I, M, 3 srawi. I, M, 3
ble .LZGEMM_L2x8_END ble ZGEMM_L2x8_END
.LZGEMM_L2x8_BEGIN: ZGEMM_L2x8_BEGIN:
mr BO, B mr BO, BBUFFER
srawi. L, K, 3 srawi. L, K, 3
ble .LZGEMM_L2x8_SUB0 ble ZGEMM_L2x8_SUB0
cmpwi cr0, L, 1 cmpwi cr0, L, 1
ble .LZGEMM_L2x8_SUB4 ble ZGEMM_L2x8_SUB4
.LZGEMM_L2x8_LOOP_START: ZGEMM_L2x8_LOOP_START:
dcbt AO, PRE dcbt AO, PRE
dcbt BO, PRE
LOAD2x8_1 LOAD2x8_1
dcbt AO, PRE dcbt AO, PRE
KERNEL2x8_I1 KERNEL2x8_I1
dcbt AO, PRE dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2 KERNEL2x8_2
dcbt AO, PRE dcbt AO, PRE
KERNEL2x8_1 KERNEL2x8_1
dcbt AO, PRE dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2 KERNEL2x8_2
dcbt AO, PRE dcbt AO, PRE
KERNEL2x8_1 KERNEL2x8_1
dcbt AO, PRE dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2 KERNEL2x8_2
dcbt AO, PRE dcbt AO, PRE
KERNEL2x8_1 KERNEL2x8_1
dcbt AO, PRE dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2 KERNEL2x8_2
addic. L, L, -2 addic. L, L, -2
ble .LZGEMM_L2x8_LOOP_END ble ZGEMM_L2x8_LOOP_END
.align 5 .align 5
.LZGEMM_L2x8_LOOP: ZGEMM_L2x8_LOOP:
dcbt AO, PRE dcbt AO, PRE
KERNEL2x8_1 KERNEL2x8_1
dcbt AO, PRE dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2 KERNEL2x8_2
dcbt AO, PRE dcbt AO, PRE
KERNEL2x8_1 KERNEL2x8_1
dcbt AO, PRE dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2 KERNEL2x8_2
dcbt AO, PRE dcbt AO, PRE
KERNEL2x8_1 KERNEL2x8_1
dcbt AO, PRE dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2 KERNEL2x8_2
dcbt AO, PRE dcbt AO, PRE
KERNEL2x8_1 KERNEL2x8_1
dcbt AO, PRE dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2 KERNEL2x8_2
addic. L, L, -1 addic. L, L, -1
bgt .LZGEMM_L2x8_LOOP bgt ZGEMM_L2x8_LOOP
.LZGEMM_L2x8_LOOP_END: ZGEMM_L2x8_LOOP_END:
dcbt AO, PRE dcbt AO, PRE
KERNEL2x8_1 KERNEL2x8_1
dcbt AO, PRE dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2 KERNEL2x8_2
dcbt AO, PRE dcbt AO, PRE
KERNEL2x8_1 KERNEL2x8_1
dcbt AO, PRE dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2 KERNEL2x8_2
dcbt AO, PRE dcbt AO, PRE
@ -88,9 +116,9 @@
KERNEL2x8_1 KERNEL2x8_1
KERNEL2x8_E2 KERNEL2x8_E2
b .LZGEMM_L2x8_SUB1 b ZGEMM_L2x8_SUB1
.LZGEMM_L2x8_SUB4: ZGEMM_L2x8_SUB4:
dcbt AO, PRE dcbt AO, PRE
KERNEL2x8_SUBI1 KERNEL2x8_SUBI1
@ -106,53 +134,53 @@
KERNEL2x8_SUB1 KERNEL2x8_SUB1
KERNEL2x8_SUB1 KERNEL2x8_SUB1
b .LZGEMM_L2x8_SUB1 b ZGEMM_L2x8_SUB1
.LZGEMM_L2x8_SUB0: ZGEMM_L2x8_SUB0:
andi. L, K, 7 andi. L, K, 7
KERNEL2x8_SUBI1 KERNEL2x8_SUBI1
addic. L, L, -1 addic. L, L, -1
ble .LZGEMM_L2x8_SAVE ble ZGEMM_L2x8_SAVE
b .LZGEMM_L2x8_SUB2 b ZGEMM_L2x8_SUB2
.LZGEMM_L2x8_SUB1: ZGEMM_L2x8_SUB1:
andi. L, K, 7 andi. L, K, 7
ble .LZGEMM_L2x8_SAVE ble ZGEMM_L2x8_SAVE
.LZGEMM_L2x8_SUB2: ZGEMM_L2x8_SUB2:
KERNEL2x8_SUB1 KERNEL2x8_SUB1
addic. L, L, -1 addic. L, L, -1
bgt .LZGEMM_L2x8_SUB2 bgt ZGEMM_L2x8_SUB2
.LZGEMM_L2x8_SAVE: ZGEMM_L2x8_SAVE:
SAVE2x8 SAVE2x8
addic. I, I, -1 addic. I, I, -1
bgt .LZGEMM_L2x8_BEGIN bgt ZGEMM_L2x8_BEGIN
.LZGEMM_L2x8_END: ZGEMM_L2x8_END:
.LZGEMM_L2x4_BEGIN: ZGEMM_L2x4_BEGIN:
andi. T2, M, 7 andi. T2, M, 7
ble .LZGEMM_L2x1_END ble ZGEMM_L2x1_END
andi. T1, M, 4 andi. T1, M, 4
ble .LZGEMM_L2x4_END ble ZGEMM_L2x4_END
mr BO, B mr BO, BBUFFER
srawi. L, K, 3 srawi. L, K, 3
ble .LZGEMM_L2x4_SUB0 ble ZGEMM_L2x4_SUB0
cmpwi cr0, L, 1 cmpwi cr0, L, 1
ble .LZGEMM_L2x4_SUB4 ble ZGEMM_L2x4_SUB4
.LZGEMM_L2x4_LOOP_START: ZGEMM_L2x4_LOOP_START:
LOAD2x4_1 LOAD2x4_1
KERNEL2x4_I1 KERNEL2x4_I1
@ -166,11 +194,11 @@
KERNEL2x4_2 KERNEL2x4_2
addic. L, L, -2 addic. L, L, -2
ble .LZGEMM_L2x4_LOOP_END ble ZGEMM_L2x4_LOOP_END
.align 5 .align 5
.LZGEMM_L2x4_LOOP: ZGEMM_L2x4_LOOP:
KERNEL2x4_1 KERNEL2x4_1
KERNEL2x4_2 KERNEL2x4_2
@ -183,9 +211,9 @@
KERNEL2x4_2 KERNEL2x4_2
addic. L, L, -1 addic. L, L, -1
bgt .LZGEMM_L2x4_LOOP bgt ZGEMM_L2x4_LOOP
.LZGEMM_L2x4_LOOP_END: ZGEMM_L2x4_LOOP_END:
KERNEL2x4_1 KERNEL2x4_1
KERNEL2x4_2 KERNEL2x4_2
@ -197,9 +225,9 @@
KERNEL2x4_1 KERNEL2x4_1
KERNEL2x4_E2 KERNEL2x4_E2
b .LZGEMM_L2x4_SUB1 b ZGEMM_L2x4_SUB1
.LZGEMM_L2x4_SUB4: ZGEMM_L2x4_SUB4:
KERNEL2x4_SUBI1 KERNEL2x4_SUBI1
KERNEL2x4_SUB1 KERNEL2x4_SUB1
@ -211,48 +239,48 @@
KERNEL2x4_SUB1 KERNEL2x4_SUB1
KERNEL2x4_SUB1 KERNEL2x4_SUB1
b .LZGEMM_L2x4_SUB1 b ZGEMM_L2x4_SUB1
.LZGEMM_L2x4_SUB0: ZGEMM_L2x4_SUB0:
andi. L, K, 7 andi. L, K, 7
KERNEL2x4_SUBI1 KERNEL2x4_SUBI1
addic. L, L, -1 addic. L, L, -1
ble .LZGEMM_L2x4_SAVE ble ZGEMM_L2x4_SAVE
b .LZGEMM_L2x4_SUB2 b ZGEMM_L2x4_SUB2
.LZGEMM_L2x4_SUB1: ZGEMM_L2x4_SUB1:
andi. L, K, 7 andi. L, K, 7
ble .LZGEMM_L2x4_SAVE ble ZGEMM_L2x4_SAVE
.LZGEMM_L2x4_SUB2: ZGEMM_L2x4_SUB2:
KERNEL2x4_SUB1 KERNEL2x4_SUB1
addic. L, L, -1 addic. L, L, -1
bgt .LZGEMM_L2x4_SUB2 bgt ZGEMM_L2x4_SUB2
.LZGEMM_L2x4_SAVE: ZGEMM_L2x4_SAVE:
SAVE2x4 SAVE2x4
.LZGEMM_L2x4_END: ZGEMM_L2x4_END:
.LZGEMM_L2x2_BEGIN: ZGEMM_L2x2_BEGIN:
andi. T1, M, 2 andi. T1, M, 2
ble .LZGEMM_L2x2_END ble ZGEMM_L2x2_END
mr BO, B mr BO, BBUFFER
srawi. L, K, 3 srawi. L, K, 3
ble .LZGEMM_L2x2_SUB0 ble ZGEMM_L2x2_SUB0
cmpwi cr0, L, 1 cmpwi cr0, L, 1
ble .LZGEMM_L2x2_SUB4 ble ZGEMM_L2x2_SUB4
.LZGEMM_L2x2_LOOP_START: ZGEMM_L2x2_LOOP_START:
LOAD2x2_1 LOAD2x2_1
KERNEL2x2_I1 KERNEL2x2_I1
@ -266,11 +294,11 @@
KERNEL2x2_2 KERNEL2x2_2
addic. L, L, -2 addic. L, L, -2
ble .LZGEMM_L2x2_LOOP_END ble ZGEMM_L2x2_LOOP_END
.align 5 .align 5
.LZGEMM_L2x2_LOOP: ZGEMM_L2x2_LOOP:
KERNEL2x2_1 KERNEL2x2_1
KERNEL2x2_2 KERNEL2x2_2
@ -283,9 +311,9 @@
KERNEL2x2_2 KERNEL2x2_2
addic. L, L, -1 addic. L, L, -1
bgt .LZGEMM_L2x2_LOOP bgt ZGEMM_L2x2_LOOP
.LZGEMM_L2x2_LOOP_END: ZGEMM_L2x2_LOOP_END:
KERNEL2x2_1 KERNEL2x2_1
KERNEL2x2_2 KERNEL2x2_2
@ -297,9 +325,9 @@
KERNEL2x2_1 KERNEL2x2_1
KERNEL2x2_E2 KERNEL2x2_E2
b .LZGEMM_L2x2_SUB1 b ZGEMM_L2x2_SUB1
.LZGEMM_L2x2_SUB4: ZGEMM_L2x2_SUB4:
KERNEL2x2_SUBI1 KERNEL2x2_SUBI1
KERNEL2x2_SUB1 KERNEL2x2_SUB1
@ -311,48 +339,48 @@
KERNEL2x2_SUB1 KERNEL2x2_SUB1
KERNEL2x2_SUB1 KERNEL2x2_SUB1
b .LZGEMM_L2x2_SUB1 b ZGEMM_L2x2_SUB1
.LZGEMM_L2x2_SUB0: ZGEMM_L2x2_SUB0:
andi. L, K, 7 andi. L, K, 7
KERNEL2x2_SUBI1 KERNEL2x2_SUBI1
addic. L, L, -1 addic. L, L, -1
ble .LZGEMM_L2x2_SAVE ble ZGEMM_L2x2_SAVE
b .LZGEMM_L2x2_SUB2 b ZGEMM_L2x2_SUB2
.LZGEMM_L2x2_SUB1: ZGEMM_L2x2_SUB1:
andi. L, K, 7 andi. L, K, 7
ble .LZGEMM_L2x2_SAVE ble ZGEMM_L2x2_SAVE
.LZGEMM_L2x2_SUB2: ZGEMM_L2x2_SUB2:
KERNEL2x2_SUB1 KERNEL2x2_SUB1
addic. L, L, -1 addic. L, L, -1
bgt .LZGEMM_L2x2_SUB2 bgt ZGEMM_L2x2_SUB2
.LZGEMM_L2x2_SAVE: ZGEMM_L2x2_SAVE:
SAVE2x2 SAVE2x2
.LZGEMM_L2x2_END: ZGEMM_L2x2_END:
.LZGEMM_L2x1_BEGIN: ZGEMM_L2x1_BEGIN:
andi. T1, M, 1 andi. T1, M, 1
ble .LZGEMM_L2x1_END ble ZGEMM_L2x1_END
mr BO, B mr BO, BBUFFER
srawi. L, K, 3 srawi. L, K, 3
ble .LZGEMM_L2x1_SUB0 ble ZGEMM_L2x1_SUB0
cmpwi cr0, L, 1 cmpwi cr0, L, 1
ble .LZGEMM_L2x1_SUB4 ble ZGEMM_L2x1_SUB4
.LZGEMM_L2x1_LOOP_START: ZGEMM_L2x1_LOOP_START:
LOAD2x1_1 LOAD2x1_1
KERNEL2x1_I1 KERNEL2x1_I1
@ -366,11 +394,11 @@
KERNEL2x1_2 KERNEL2x1_2
addic. L, L, -2 addic. L, L, -2
ble .LZGEMM_L2x1_LOOP_END ble ZGEMM_L2x1_LOOP_END
.align 5 .align 5
.LZGEMM_L2x1_LOOP: ZGEMM_L2x1_LOOP:
KERNEL2x1_1 KERNEL2x1_1
KERNEL2x1_2 KERNEL2x1_2
@ -383,9 +411,9 @@
KERNEL2x1_2 KERNEL2x1_2
addic. L, L, -1 addic. L, L, -1
bgt .LZGEMM_L2x1_LOOP bgt ZGEMM_L2x1_LOOP
.LZGEMM_L2x1_LOOP_END: ZGEMM_L2x1_LOOP_END:
KERNEL2x1_1 KERNEL2x1_1
KERNEL2x1_2 KERNEL2x1_2
@ -397,9 +425,9 @@
KERNEL2x1_1 KERNEL2x1_1
KERNEL2x1_E2 KERNEL2x1_E2
b .LZGEMM_L2x1_SUB1 b ZGEMM_L2x1_SUB1
.LZGEMM_L2x1_SUB4: ZGEMM_L2x1_SUB4:
KERNEL2x1_SUBI1 KERNEL2x1_SUBI1
KERNEL2x1_SUB1 KERNEL2x1_SUB1
@ -411,72 +439,89 @@
KERNEL2x1_SUB1 KERNEL2x1_SUB1
KERNEL2x1_SUB1 KERNEL2x1_SUB1
b .LZGEMM_L2x1_SUB1 b ZGEMM_L2x1_SUB1
.LZGEMM_L2x1_SUB0: ZGEMM_L2x1_SUB0:
andi. L, K, 7 andi. L, K, 7
KERNEL2x1_SUBI1 KERNEL2x1_SUBI1
addic. L, L, -1 addic. L, L, -1
ble .LZGEMM_L2x1_SAVE ble ZGEMM_L2x1_SAVE
b .LZGEMM_L2x1_SUB2 b ZGEMM_L2x1_SUB2
.LZGEMM_L2x1_SUB1: ZGEMM_L2x1_SUB1:
andi. L, K, 7 andi. L, K, 7
ble .LZGEMM_L2x1_SAVE ble ZGEMM_L2x1_SAVE
.LZGEMM_L2x1_SUB2: ZGEMM_L2x1_SUB2:
KERNEL2x1_SUB1 KERNEL2x1_SUB1
addic. L, L, -1 addic. L, L, -1
bgt .LZGEMM_L2x1_SUB2 bgt ZGEMM_L2x1_SUB2
.LZGEMM_L2x1_SAVE: ZGEMM_L2x1_SAVE:
SAVE2x1 SAVE2x1
.LZGEMM_L2x1_END: ZGEMM_L2x1_END:
slwi T1, K, 5 slwi T1, K, 5
add B, B, T1 add B, B, T1
addic. J, J, -1 addic. J, J, -1
bgt .LZGEMM_L2_BEGIN bgt ZGEMM_L2_BEGIN
andi. T2, N, 1 andi. T2, N, 1
ble .L999 ble L999
.LZGEMM_L2_END: ZGEMM_L2_END:
b .LZGEMM_L1_BEGIN b ZGEMM_L1_BEGIN
.L999_H1: L999_H1:
b .L999 b L999
ZGEMM_L1_BEGIN:
mr BO, B
mr BBO, BBUFFER
slwi T1, K, 0
ZGEMM_L1_COPYB:
lxvdsx vs4, o0, BO // b0_r
lxvdsx vs5, o8, BO // b0_i
addi BO, BO, 16
stxvd2x vs4, o0, BBO
stxvd2x vs5, o16, BBO
addic. T1, T1, -1
addi BBO, BBO, 32
bge ZGEMM_L1_COPYB
.LZGEMM_L1_BEGIN:
andi. T1, N, 1 andi. T1, N, 1
ble .LZGEMM_L1_END ble ZGEMM_L1_END
mr CO, C mr CO, C
mr AO, A mr AO, A
srawi. I, M, 3 srawi. I, M, 3
ble .LZGEMM_L1x8_END ble ZGEMM_L1x8_END
.LZGEMM_L1x8_BEGIN: ZGEMM_L1x8_BEGIN:
mr BO, B mr BO, BBUFFER
srawi. L, K, 3 srawi. L, K, 3
ble .LZGEMM_L1x8_SUB0 ble ZGEMM_L1x8_SUB0
cmpwi cr0, L, 1 cmpwi cr0, L, 1
ble .LZGEMM_L1x8_SUB4 ble ZGEMM_L1x8_SUB4
.LZGEMM_L1x8_LOOP_START: ZGEMM_L1x8_LOOP_START:
dcbt AO, PRE dcbt AO, PRE
LOAD1x8_1 LOAD1x8_1
@ -499,11 +544,11 @@
KERNEL1x8_2 KERNEL1x8_2
addic. L, L, -2 addic. L, L, -2
ble .LZGEMM_L1x8_LOOP_END ble ZGEMM_L1x8_LOOP_END
.align 5 .align 5
.LZGEMM_L1x8_LOOP: ZGEMM_L1x8_LOOP:
dcbt AO, PRE dcbt AO, PRE
KERNEL1x8_1 KERNEL1x8_1
@ -524,9 +569,9 @@
KERNEL1x8_2 KERNEL1x8_2
addic. L, L, -1 addic. L, L, -1
bgt .LZGEMM_L1x8_LOOP bgt ZGEMM_L1x8_LOOP
.LZGEMM_L1x8_LOOP_END: ZGEMM_L1x8_LOOP_END:
dcbt AO, PRE dcbt AO, PRE
KERNEL1x8_1 KERNEL1x8_1
@ -545,9 +590,9 @@
KERNEL1x8_1 KERNEL1x8_1
KERNEL1x8_E2 KERNEL1x8_E2
b .LZGEMM_L1x8_SUB1 b ZGEMM_L1x8_SUB1
.LZGEMM_L1x8_SUB4: ZGEMM_L1x8_SUB4:
dcbt AO, PRE dcbt AO, PRE
KERNEL1x8_SUBI1 KERNEL1x8_SUBI1
@ -563,53 +608,53 @@
KERNEL1x8_SUB1 KERNEL1x8_SUB1
KERNEL1x8_SUB1 KERNEL1x8_SUB1
b .LZGEMM_L1x8_SUB1 b ZGEMM_L1x8_SUB1
.LZGEMM_L1x8_SUB0: ZGEMM_L1x8_SUB0:
andi. L, K, 7 andi. L, K, 7
KERNEL1x8_SUBI1 KERNEL1x8_SUBI1
addic. L, L, -1 addic. L, L, -1
ble .LZGEMM_L1x8_SAVE ble ZGEMM_L1x8_SAVE
b .LZGEMM_L1x8_SUB2 b ZGEMM_L1x8_SUB2
.LZGEMM_L1x8_SUB1: ZGEMM_L1x8_SUB1:
andi. L, K, 7 andi. L, K, 7
ble .LZGEMM_L1x8_SAVE ble ZGEMM_L1x8_SAVE
.LZGEMM_L1x8_SUB2: ZGEMM_L1x8_SUB2:
KERNEL1x8_SUB1 KERNEL1x8_SUB1
addic. L, L, -1 addic. L, L, -1
bgt .LZGEMM_L1x8_SUB2 bgt ZGEMM_L1x8_SUB2
.LZGEMM_L1x8_SAVE: ZGEMM_L1x8_SAVE:
SAVE1x8 SAVE1x8
addic. I, I, -1 addic. I, I, -1
bgt .LZGEMM_L1x8_BEGIN bgt ZGEMM_L1x8_BEGIN
.LZGEMM_L1x8_END: ZGEMM_L1x8_END:
.LZGEMM_L1x4_BEGIN: ZGEMM_L1x4_BEGIN:
andi. T2, M, 7 andi. T2, M, 7
ble .LZGEMM_L1x1_END ble ZGEMM_L1x1_END
andi. T1, M, 4 andi. T1, M, 4
ble .LZGEMM_L1x4_END ble ZGEMM_L1x4_END
mr BO, B mr BO, BBUFFER
srawi. L, K, 3 srawi. L, K, 3
ble .LZGEMM_L1x4_SUB0 ble ZGEMM_L1x4_SUB0
cmpwi cr0, L, 1 cmpwi cr0, L, 1
ble .LZGEMM_L1x4_SUB4 ble ZGEMM_L1x4_SUB4
.LZGEMM_L1x4_LOOP_START: ZGEMM_L1x4_LOOP_START:
LOAD1x4_1 LOAD1x4_1
KERNEL1x4_I1 KERNEL1x4_I1
@ -623,11 +668,11 @@
KERNEL1x4_2 KERNEL1x4_2
addic. L, L, -2 addic. L, L, -2
ble .LZGEMM_L1x4_LOOP_END ble ZGEMM_L1x4_LOOP_END
.align 5 .align 5
.LZGEMM_L1x4_LOOP: ZGEMM_L1x4_LOOP:
KERNEL1x4_1 KERNEL1x4_1
KERNEL1x4_2 KERNEL1x4_2
@ -640,9 +685,9 @@
KERNEL1x4_2 KERNEL1x4_2
addic. L, L, -1 addic. L, L, -1
bgt .LZGEMM_L1x4_LOOP bgt ZGEMM_L1x4_LOOP
.LZGEMM_L1x4_LOOP_END: ZGEMM_L1x4_LOOP_END:
KERNEL1x4_1 KERNEL1x4_1
KERNEL1x4_2 KERNEL1x4_2
@ -654,9 +699,9 @@
KERNEL1x4_1 KERNEL1x4_1
KERNEL1x4_E2 KERNEL1x4_E2
b .LZGEMM_L1x4_SUB1 b ZGEMM_L1x4_SUB1
.LZGEMM_L1x4_SUB4: ZGEMM_L1x4_SUB4:
KERNEL1x4_SUBI1 KERNEL1x4_SUBI1
KERNEL1x4_SUB1 KERNEL1x4_SUB1
@ -668,48 +713,48 @@
KERNEL1x4_SUB1 KERNEL1x4_SUB1
KERNEL1x4_SUB1 KERNEL1x4_SUB1
b .LZGEMM_L1x4_SUB1 b ZGEMM_L1x4_SUB1
.LZGEMM_L1x4_SUB0: ZGEMM_L1x4_SUB0:
andi. L, K, 7 andi. L, K, 7
KERNEL1x4_SUBI1 KERNEL1x4_SUBI1
addic. L, L, -1 addic. L, L, -1
ble .LZGEMM_L1x4_SAVE ble ZGEMM_L1x4_SAVE
b .LZGEMM_L1x4_SUB2 b ZGEMM_L1x4_SUB2
.LZGEMM_L1x4_SUB1: ZGEMM_L1x4_SUB1:
andi. L, K, 7 andi. L, K, 7
ble .LZGEMM_L1x4_SAVE ble ZGEMM_L1x4_SAVE
.LZGEMM_L1x4_SUB2: ZGEMM_L1x4_SUB2:
KERNEL1x4_SUB1 KERNEL1x4_SUB1
addic. L, L, -1 addic. L, L, -1
bgt .LZGEMM_L1x4_SUB2 bgt ZGEMM_L1x4_SUB2
.LZGEMM_L1x4_SAVE: ZGEMM_L1x4_SAVE:
SAVE1x4 SAVE1x4
.LZGEMM_L1x4_END: ZGEMM_L1x4_END:
.LZGEMM_L1x2_BEGIN: ZGEMM_L1x2_BEGIN:
andi. T1, M, 2 andi. T1, M, 2
ble .LZGEMM_L1x2_END ble ZGEMM_L1x2_END
mr BO, B mr BO, BBUFFER
srawi. L, K, 3 srawi. L, K, 3
ble .LZGEMM_L1x2_SUB0 ble ZGEMM_L1x2_SUB0
cmpwi cr0, L, 1 cmpwi cr0, L, 1
ble .LZGEMM_L1x2_SUB4 ble ZGEMM_L1x2_SUB4
.LZGEMM_L1x2_LOOP_START: ZGEMM_L1x2_LOOP_START:
LOAD1x2_1 LOAD1x2_1
KERNEL1x2_I1 KERNEL1x2_I1
@ -723,11 +768,11 @@
KERNEL1x2_2 KERNEL1x2_2
addic. L, L, -2 addic. L, L, -2
ble .LZGEMM_L1x2_LOOP_END ble ZGEMM_L1x2_LOOP_END
.align 5 .align 5
.LZGEMM_L1x2_LOOP: ZGEMM_L1x2_LOOP:
KERNEL1x2_1 KERNEL1x2_1
KERNEL1x2_2 KERNEL1x2_2
@ -740,9 +785,9 @@
KERNEL1x2_2 KERNEL1x2_2
addic. L, L, -1 addic. L, L, -1
bgt .LZGEMM_L1x2_LOOP bgt ZGEMM_L1x2_LOOP
.LZGEMM_L1x2_LOOP_END: ZGEMM_L1x2_LOOP_END:
KERNEL1x2_1 KERNEL1x2_1
KERNEL1x2_2 KERNEL1x2_2
@ -754,9 +799,9 @@
KERNEL1x2_1 KERNEL1x2_1
KERNEL1x2_E2 KERNEL1x2_E2
b .LZGEMM_L1x2_SUB1 b ZGEMM_L1x2_SUB1
.LZGEMM_L1x2_SUB4: ZGEMM_L1x2_SUB4:
KERNEL1x2_SUBI1 KERNEL1x2_SUBI1
KERNEL1x2_SUB1 KERNEL1x2_SUB1
@ -768,48 +813,48 @@
KERNEL1x2_SUB1 KERNEL1x2_SUB1
KERNEL1x2_SUB1 KERNEL1x2_SUB1
b .LZGEMM_L1x2_SUB1 b ZGEMM_L1x2_SUB1
.LZGEMM_L1x2_SUB0: ZGEMM_L1x2_SUB0:
andi. L, K, 7 andi. L, K, 7
KERNEL1x2_SUBI1 KERNEL1x2_SUBI1
addic. L, L, -1 addic. L, L, -1
ble .LZGEMM_L1x2_SAVE ble ZGEMM_L1x2_SAVE
b .LZGEMM_L1x2_SUB2 b ZGEMM_L1x2_SUB2
.LZGEMM_L1x2_SUB1: ZGEMM_L1x2_SUB1:
andi. L, K, 7 andi. L, K, 7
ble .LZGEMM_L1x2_SAVE ble ZGEMM_L1x2_SAVE
.LZGEMM_L1x2_SUB2: ZGEMM_L1x2_SUB2:
KERNEL1x2_SUB1 KERNEL1x2_SUB1
addic. L, L, -1 addic. L, L, -1
bgt .LZGEMM_L1x2_SUB2 bgt ZGEMM_L1x2_SUB2
.LZGEMM_L1x2_SAVE: ZGEMM_L1x2_SAVE:
SAVE1x2 SAVE1x2
.LZGEMM_L1x2_END: ZGEMM_L1x2_END:
.LZGEMM_L1x1_BEGIN: ZGEMM_L1x1_BEGIN:
andi. T1, M, 1 andi. T1, M, 1
ble .LZGEMM_L1x1_END ble ZGEMM_L1x1_END
mr BO, B mr BO, BBUFFER
srawi. L, K, 3 srawi. L, K, 3
ble .LZGEMM_L1x1_SUB0 ble ZGEMM_L1x1_SUB0
cmpwi cr0, L, 1 cmpwi cr0, L, 1
ble .LZGEMM_L1x1_SUB4 ble ZGEMM_L1x1_SUB4
.LZGEMM_L1x1_LOOP_START: ZGEMM_L1x1_LOOP_START:
LOAD1x1_1 LOAD1x1_1
KERNEL1x1_I1 KERNEL1x1_I1
@ -823,11 +868,11 @@
KERNEL1x1_2 KERNEL1x1_2
addic. L, L, -2 addic. L, L, -2
ble .LZGEMM_L1x1_LOOP_END ble ZGEMM_L1x1_LOOP_END
.align 5 .align 5
.LZGEMM_L1x1_LOOP: ZGEMM_L1x1_LOOP:
KERNEL1x1_1 KERNEL1x1_1
KERNEL1x1_2 KERNEL1x1_2
@ -840,9 +885,9 @@
KERNEL1x1_2 KERNEL1x1_2
addic. L, L, -1 addic. L, L, -1
bgt .LZGEMM_L1x1_LOOP bgt ZGEMM_L1x1_LOOP
.LZGEMM_L1x1_LOOP_END: ZGEMM_L1x1_LOOP_END:
KERNEL1x1_1 KERNEL1x1_1
KERNEL1x1_2 KERNEL1x1_2
@ -854,9 +899,9 @@
KERNEL1x1_1 KERNEL1x1_1
KERNEL1x1_E2 KERNEL1x1_E2
b .LZGEMM_L1x1_SUB1 b ZGEMM_L1x1_SUB1
.LZGEMM_L1x1_SUB4: ZGEMM_L1x1_SUB4:
KERNEL1x1_SUBI1 KERNEL1x1_SUBI1
KERNEL1x1_SUB1 KERNEL1x1_SUB1
@ -868,34 +913,34 @@
KERNEL1x1_SUB1 KERNEL1x1_SUB1
KERNEL1x1_SUB1 KERNEL1x1_SUB1
b .LZGEMM_L1x1_SUB1 b ZGEMM_L1x1_SUB1
.LZGEMM_L1x1_SUB0: ZGEMM_L1x1_SUB0:
andi. L, K, 7 andi. L, K, 7
KERNEL1x1_SUBI1 KERNEL1x1_SUBI1
addic. L, L, -1 addic. L, L, -1
ble .LZGEMM_L1x1_SAVE ble ZGEMM_L1x1_SAVE
b .LZGEMM_L1x1_SUB2 b ZGEMM_L1x1_SUB2
.LZGEMM_L1x1_SUB1: ZGEMM_L1x1_SUB1:
andi. L, K, 7 andi. L, K, 7
ble .LZGEMM_L1x1_SAVE ble ZGEMM_L1x1_SAVE
.LZGEMM_L1x1_SUB2: ZGEMM_L1x1_SUB2:
KERNEL1x1_SUB1 KERNEL1x1_SUB1
addic. L, L, -1 addic. L, L, -1
bgt .LZGEMM_L1x1_SUB2 bgt ZGEMM_L1x1_SUB2
.LZGEMM_L1x1_SAVE: ZGEMM_L1x1_SAVE:
SAVE1x1 SAVE1x1
.LZGEMM_L1x1_END: ZGEMM_L1x1_END:
.LZGEMM_L1_END: ZGEMM_L1_END:


@ -1,39 +1,3 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define XSFADD_R1 xsadddp #define XSFADD_R1 xsadddp
@ -70,12 +34,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD2x8_1 .macro LOAD2x8_1
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B lxvd2x vs18, o32, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A
@ -110,12 +74,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64 addi AO, AO, 64
lxvdsx vs20, o0, BO // load real part from B lxvd2x vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B lxvd2x vs21, o16, BO // load imag part from B
lxvdsx vs22, o16, BO // load real part from B lxvd2x vs22, o32, BO // load real part from B
lxvdsx vs23, o24, BO // load imag part from B lxvd2x vs23, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -156,36 +120,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL2x8_1 .macro KERNEL2x8_1
lxvd2x vs8, o0, AO // load real,imag from A
lxvd2x vs9, o16, AO // load real,imag from A
lxvd2x vs10, o32, AO // load real,imag from A
lxvd2x vs11, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs12, o0, AO // load real,imag from A
lxvd2x vs13, o16, AO // load real,imag from A
lxvd2x vs14, o32, AO // load real,imag from A
lxvd2x vs15, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
lxvd2x vs22, o32, BO // load real part from B
lxvd2x vs23, o48, BO // load imag part from B
addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
xvmaddadp vs34, vs1, vs16 // real*real, imag*real xvmaddadp vs34, vs1, vs16 // real*real, imag*real
xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
lxvdsx vs22, o16, BO // load real part from B
lxvdsx vs23, o24, BO // load imag part from B
xvmaddadp vs36, vs2, vs16 // real*real, imag*real xvmaddadp vs36, vs2, vs16 // real*real, imag*real
xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
xvmaddadp vs38, vs3, vs16 // real*real, imag*real xvmaddadp vs38, vs3, vs16 // real*real, imag*real
xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
lxvd2x vs8, o0, AO // load real,imag from A
lxvd2x vs9, o16, AO // load real,imag from A
xvmaddadp vs40, vs4, vs16 // real*real, imag*real xvmaddadp vs40, vs4, vs16 // real*real, imag*real
xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
xvmaddadp vs42, vs5, vs16 // real*real, imag*real xvmaddadp vs42, vs5, vs16 // real*real, imag*real
xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
lxvd2x vs10, o32, AO // load real,imag from A
lxvd2x vs11, o48, AO // load real,imag from A
xvmaddadp vs44, vs6, vs16 // real*real, imag*real xvmaddadp vs44, vs6, vs16 // real*real, imag*real
xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
addi AO, AO, 64
xvmaddadp vs46, vs7, vs16 // real*real, imag*real xvmaddadp vs46, vs7, vs16 // real*real, imag*real
xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
@ -193,101 +162,79 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag
xvmaddadp vs50, vs1, vs18 // real*real, imag*real xvmaddadp vs50, vs1, vs18 // real*real, imag*real
xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag
lxvd2x vs12, o0, AO // load real,imag from A
lxvd2x vs13, o16, AO // load real,imag from A
xvmaddadp vs52, vs2, vs18 // real*real, imag*real xvmaddadp vs52, vs2, vs18 // real*real, imag*real
xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag
xvmaddadp vs54, vs3, vs18 // real*real, imag*real xvmaddadp vs54, vs3, vs18 // real*real, imag*real
xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag
lxvd2x vs14, o32, AO // load real,imag from A
lxvd2x vs15, o48, AO // load real,imag from A
xvmaddadp vs56, vs4, vs18 // real*real, imag*real xvmaddadp vs56, vs4, vs18 // real*real, imag*real
xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag
xvmaddadp vs58, vs5, vs18 // real*real, imag*real xvmaddadp vs58, vs5, vs18 // real*real, imag*real
xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag
lxvdsx vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B
xvmaddadp vs60, vs6, vs18 // real*real, imag*real xvmaddadp vs60, vs6, vs18 // real*real, imag*real
xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag
xvmaddadp vs62, vs7, vs18 // real*real, imag*real xvmaddadp vs62, vs7, vs18 // real*real, imag*real
xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag
addi AO, AO, 64
addi BO, BO, 32
.endm .endm
.macro KERNEL2x8_2 .macro KERNEL2x8_2
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
lxvd2x vs2, o32, AO // load real,imag from A
lxvd2x vs3, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs4, o0, AO // load real,imag from A
lxvd2x vs5, o16, AO // load real,imag from A
lxvd2x vs6, o32, AO // load real,imag from A
lxvd2x vs7, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 64
xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
xvmaddadp vs34, vs9, vs20 // real*real, imag*real xvmaddadp vs34, vs9, vs20 // real*real, imag*real
xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
xvmaddadp vs36, vs10, vs20 // real*real, imag*real xvmaddadp vs36, vs10, vs20 // real*real, imag*real
xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
xvmaddadp vs38, vs11, vs20 // real*real, imag*real xvmaddadp vs38, vs11, vs20 // real*real, imag*real
xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
xvmaddadp vs40, vs12, vs20 // real*real, imag*real xvmaddadp vs40, vs12, vs20 // real*real, imag*real
xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
xvmaddadp vs42, vs13, vs20 // real*real, imag*real xvmaddadp vs42, vs13, vs20 // real*real, imag*real
xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
lxvd2x vs2, o32, AO // load real,imag from A
lxvd2x vs3, o48, AO // load real,imag from A
xvmaddadp vs44, vs14, vs20 // real*real, imag*real xvmaddadp vs44, vs14, vs20 // real*real, imag*real
xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
xvmaddadp vs46, vs15, vs20 // real*real, imag*real xvmaddadp vs46, vs15, vs20 // real*real, imag*real
xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
addi AO, AO, 64
xvmaddadp vs48, vs8, vs22 // real*real, imag*real xvmaddadp vs48, vs8, vs22 // real*real, imag*real
xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag
xvmaddadp vs50, vs9, vs22 // real*real, imag*real xvmaddadp vs50, vs9, vs22 // real*real, imag*real
xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag
lxvd2x vs4, o0, AO // load real,imag from A
lxvd2x vs5, o16, AO // load real,imag from A
xvmaddadp vs52, vs10, vs22 // real*real, imag*real xvmaddadp vs52, vs10, vs22 // real*real, imag*real
xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag
xvmaddadp vs54, vs11, vs22 // real*real, imag*real xvmaddadp vs54, vs11, vs22 // real*real, imag*real
xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag
lxvd2x vs6, o32, AO // load real,imag from A
lxvd2x vs7, o48, AO // load real,imag from A
xvmaddadp vs56, vs12, vs22 // real*real, imag*real xvmaddadp vs56, vs12, vs22 // real*real, imag*real
xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag
xvmaddadp vs58, vs13, vs22 // real*real, imag*real xvmaddadp vs58, vs13, vs22 // real*real, imag*real
xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag
lxvdsx vs18, o16, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B
xvmaddadp vs60, vs14, vs22 // real*real, imag*real xvmaddadp vs60, vs14, vs22 // real*real, imag*real
xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag
xvmaddadp vs62, vs15, vs22 // real*real, imag*real xvmaddadp vs62, vs15, vs22 // real*real, imag*real
xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag
addi AO, AO, 64
addi BO, BO, 32
.endm .endm
@ -347,12 +294,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64 addi AO, AO, 64
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B lxvd2x vs18, o32, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -407,12 +354,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64 addi AO, AO, 64
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B lxvd2x vs18, o32, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -927,12 +874,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD2x4_1 .macro LOAD2x4_1
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B lxvd2x vs18, o32, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A
@ -953,12 +900,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64 addi AO, AO, 64
lxvdsx vs20, o0, BO // load real part from B lxvd2x vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B lxvd2x vs21, o16, BO // load imag part from B
lxvdsx vs22, o16, BO // load real part from B lxvd2x vs22, o32, BO // load real part from B
lxvdsx vs23, o24, BO // load imag part from B lxvd2x vs23, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -990,12 +937,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64 addi AO, AO, 64
lxvdsx vs20, o0, BO // load real part from B lxvd2x vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B lxvd2x vs21, o16, BO // load imag part from B
lxvdsx vs22, o16, BO // load real part from B lxvd2x vs22, o32, BO // load real part from B
lxvdsx vs23, o24, BO // load imag part from B lxvd2x vs23, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -1027,12 +974,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64 addi AO, AO, 64
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B lxvd2x vs18, o32, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
@ -1088,12 +1035,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64 addi AO, AO, 64
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B lxvd2x vs18, o32, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -1125,12 +1072,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 64 addi AO, AO, 64
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B lxvd2x vs18, o32, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -1410,12 +1357,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD2x2_1 .macro LOAD2x2_1
lxvdsx vs16, o0, BO // load real part from B lxvd2x vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B lxvd2x vs17, o16, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B lxvd2x vs18, o32, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A
@ -1432,12 +1379,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 32 addi AO, AO, 32
lxvdsx vs20, o0, BO // load real part from B lxvd2x vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B lxvd2x vs21, o16, BO // load imag part from B
lxvdsx vs22, o16, BO // load real part from B lxvd2x vs22, o32, BO // load real part from B
lxvdsx vs23, o24, BO // load imag part from B lxvd2x vs23, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@ -1459,12 +1406,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 32 addi AO, AO, 32
lxvdsx vs20, o0, BO // load real part from B lxvd2x vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B lxvd2x vs21, o16, BO // load imag part from B
lxvdsx vs22, o16, BO // load real part from B lxvd2x vs22, o32, BO // load real part from B
lxvdsx vs23, o24, BO // load imag part from B lxvd2x vs23, o48, BO // load imag part from B
addi BO, BO, 32 addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -1486,12 +1433,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi AO, AO, 32 addi AO, AO, 32
-    lxvdsx vs16, o0, BO // load real part from B
+    lxvd2x vs16, o0, BO // load real part from B
-    lxvdsx vs17, o8, BO // load imag part from B
+    lxvd2x vs17, o16, BO // load imag part from B
-    lxvdsx vs18, o16, BO // load real part from B
+    lxvd2x vs18, o32, BO // load real part from B
-    lxvdsx vs19, o24, BO // load imag part from B
+    lxvd2x vs19, o48, BO // load imag part from B
-    addi BO, BO, 32
+    addi BO, BO, 64
     xvmaddadp vs32, vs8, vs20 // real*real, imag*real
     xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
@@ -1529,12 +1476,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi AO, AO, 32
-    lxvdsx vs16, o0, BO // load real part from B
+    lxvd2x vs16, o0, BO // load real part from B
-    lxvdsx vs17, o8, BO // load imag part from B
+    lxvd2x vs17, o16, BO // load imag part from B
-    lxvdsx vs18, o16, BO // load real part from B
+    lxvd2x vs18, o32, BO // load real part from B
-    lxvdsx vs19, o24, BO // load imag part from B
+    lxvd2x vs19, o48, BO // load imag part from B
-    addi BO, BO, 32
+    addi BO, BO, 64
     xvmuldp vs32, vs0, vs16 // real*real, imag*real
     xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@@ -1556,12 +1503,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi AO, AO, 32
-    lxvdsx vs16, o0, BO // load real part from B
+    lxvd2x vs16, o0, BO // load real part from B
-    lxvdsx vs17, o8, BO // load imag part from B
+    lxvd2x vs17, o16, BO // load imag part from B
-    lxvdsx vs18, o16, BO // load real part from B
+    lxvd2x vs18, o32, BO // load real part from B
-    lxvdsx vs19, o24, BO // load imag part from B
+    lxvd2x vs19, o48, BO // load imag part from B
-    addi BO, BO, 32
+    addi BO, BO, 64
     xvmaddadp vs32, vs0, vs16 // real*real, imag*real
     xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@@ -1725,12 +1672,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro LOAD2x1_1
-    lxvdsx vs16, o0, BO // load real part from B
+    lxvd2x vs16, o0, BO // load real part from B
-    lxvdsx vs17, o8, BO // load imag part from B
+    lxvd2x vs17, o16, BO // load imag part from B
-    lxvdsx vs18, o16, BO // load real part from B
+    lxvd2x vs18, o32, BO // load real part from B
-    lxvdsx vs19, o24, BO // load imag part from B
+    lxvd2x vs19, o48, BO // load imag part from B
-    addi BO, BO, 32
+    addi BO, BO, 64
     lxvd2x vs0, o0, AO // load real,imag from A
@@ -1745,12 +1692,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi AO, AO, 16
-    lxvdsx vs20, o0, BO // load real part from B
+    lxvd2x vs20, o0, BO // load real part from B
-    lxvdsx vs21, o8, BO // load imag part from B
+    lxvd2x vs21, o16, BO // load imag part from B
-    lxvdsx vs22, o16, BO // load real part from B
+    lxvd2x vs22, o32, BO // load real part from B
-    lxvdsx vs23, o24, BO // load imag part from B
+    lxvd2x vs23, o48, BO // load imag part from B
-    addi BO, BO, 32
+    addi BO, BO, 64
     xvmuldp vs32, vs0, vs16 // real*real, imag*real
     xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@@ -1767,12 +1714,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi AO, AO, 16
-    lxvdsx vs20, o0, BO // load real part from B
+    lxvd2x vs20, o0, BO // load real part from B
-    lxvdsx vs21, o8, BO // load imag part from B
+    lxvd2x vs21, o16, BO // load imag part from B
-    lxvdsx vs22, o16, BO // load real part from B
+    lxvd2x vs22, o32, BO // load real part from B
-    lxvdsx vs23, o24, BO // load imag part from B
+    lxvd2x vs23, o48, BO // load imag part from B
-    addi BO, BO, 32
+    addi BO, BO, 64
     xvmaddadp vs32, vs0, vs16 // real*real, imag*real
     xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@@ -1789,12 +1736,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi AO, AO, 16
-    lxvdsx vs16, o0, BO // load real part from B
+    lxvd2x vs16, o0, BO // load real part from B
-    lxvdsx vs17, o8, BO // load imag part from B
+    lxvd2x vs17, o16, BO // load imag part from B
-    lxvdsx vs18, o16, BO // load real part from B
+    lxvd2x vs18, o32, BO // load real part from B
-    lxvdsx vs19, o24, BO // load imag part from B
+    lxvd2x vs19, o48, BO // load imag part from B
-    addi BO, BO, 32
+    addi BO, BO, 64
     xvmaddadp vs32, vs8, vs20 // real*real, imag*real
     xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
@@ -1823,12 +1770,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi AO, AO, 16
-    lxvdsx vs16, o0, BO // load real part from B
+    lxvd2x vs16, o0, BO // load real part from B
-    lxvdsx vs17, o8, BO // load imag part from B
+    lxvd2x vs17, o16, BO // load imag part from B
-    lxvdsx vs18, o16, BO // load real part from B
+    lxvd2x vs18, o32, BO // load real part from B
-    lxvdsx vs19, o24, BO // load imag part from B
+    lxvd2x vs19, o48, BO // load imag part from B
-    addi BO, BO, 32
+    addi BO, BO, 64
     xvmuldp vs32, vs0, vs16 // real*real, imag*real
     xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@@ -1845,12 +1792,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi AO, AO, 16
-    lxvdsx vs16, o0, BO // load real part from B
+    lxvd2x vs16, o0, BO // load real part from B
-    lxvdsx vs17, o8, BO // load imag part from B
+    lxvd2x vs17, o16, BO // load imag part from B
-    lxvdsx vs18, o16, BO // load real part from B
+    lxvd2x vs18, o32, BO // load real part from B
-    lxvdsx vs19, o24, BO // load imag part from B
+    lxvd2x vs19, o48, BO // load imag part from B
-    addi BO, BO, 32
+    addi BO, BO, 64
     xvmaddadp vs32, vs0, vs16 // real*real, imag*real
     xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@@ -1956,10 +1903,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro LOAD1x8_1
-    lxvdsx vs16, o0, BO // load real part from B
+    lxvd2x vs16, o0, BO // load real part from B
-    lxvdsx vs17, o8, BO // load imag part from B
+    lxvd2x vs17, o16, BO // load imag part from B
-    addi BO, BO, 16
+    addi BO, BO, 32
     lxvd2x vs0, o0, AO // load real,imag from A
     lxvd2x vs1, o16, AO // load real,imag from A
@@ -1994,10 +1941,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi AO, AO, 64
-    lxvdsx vs20, o0, BO // load real part from B
+    lxvd2x vs20, o0, BO // load real part from B
-    lxvdsx vs21, o8, BO // load imag part from B
+    lxvd2x vs21, o16, BO // load imag part from B
-    addi BO, BO, 16
+    addi BO, BO, 32
     xvmuldp vs32, vs0, vs16 // real*real, imag*real
     xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@@ -2035,10 +1982,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi AO, AO, 64
-    lxvdsx vs20, o0, BO // load real part from B
+    lxvd2x vs20, o0, BO // load real part from B
-    lxvdsx vs21, o8, BO // load imag part from B
+    lxvd2x vs21, o16, BO // load imag part from B
-    addi BO, BO, 16
+    addi BO, BO, 32
     xvmaddadp vs32, vs0, vs16 // real*real, imag*real
     xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@@ -2076,10 +2023,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi AO, AO, 64
-    lxvdsx vs16, o0, BO // load real part from B
+    lxvd2x vs16, o0, BO // load real part from B
-    lxvdsx vs17, o8, BO // load imag part from B
+    lxvd2x vs17, o16, BO // load imag part from B
-    addi BO, BO, 16
+    addi BO, BO, 32
     xvmaddadp vs32, vs8, vs20 // real*real, imag*real
     xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
@@ -2140,10 +2087,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi AO, AO, 64
-    lxvdsx vs16, o0, BO // load real part from B
+    lxvd2x vs16, o0, BO // load real part from B
-    lxvdsx vs17, o8, BO // load imag part from B
+    lxvd2x vs17, o16, BO // load imag part from B
-    addi BO, BO, 16
+    addi BO, BO, 32
     xvmuldp vs32, vs0, vs16 // real*real, imag*real
     xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@@ -2181,10 +2128,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi AO, AO, 64
-    lxvdsx vs16, o0, BO // load real part from B
+    lxvd2x vs16, o0, BO // load real part from B
-    lxvdsx vs17, o8, BO // load imag part from B
+    lxvd2x vs17, o16, BO // load imag part from B
-    addi BO, BO, 16
+    addi BO, BO, 32
     xvmaddadp vs32, vs0, vs16 // real*real, imag*real
     xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@@ -2452,10 +2399,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro LOAD1x4_1
-    lxvdsx vs16, o0, BO // load real part from B
+    lxvd2x vs16, o0, BO // load real part from B
-    lxvdsx vs17, o8, BO // load imag part from B
+    lxvd2x vs17, o16, BO // load imag part from B
-    addi BO, BO, 16
+    addi BO, BO, 32
     lxvd2x vs0, o0, AO // load real,imag from A
     lxvd2x vs1, o16, AO // load real,imag from A
@@ -2476,10 +2423,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi AO, AO, 64
-    lxvdsx vs20, o0, BO // load real part from B
+    lxvd2x vs20, o0, BO // load real part from B
-    lxvdsx vs21, o8, BO // load imag part from B
+    lxvd2x vs21, o16, BO // load imag part from B
-    addi BO, BO, 16
+    addi BO, BO, 32
     xvmuldp vs32, vs0, vs16 // real*real, imag*real
     xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@@ -2502,10 +2449,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi AO, AO, 64
-    lxvdsx vs20, o0, BO // load real part from B
+    lxvd2x vs20, o0, BO // load real part from B
-    lxvdsx vs21, o8, BO // load imag part from B
+    lxvd2x vs21, o16, BO // load imag part from B
-    addi BO, BO, 16
+    addi BO, BO, 32
     xvmaddadp vs32, vs0, vs16 // real*real, imag*real
     xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@@ -2528,10 +2475,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi AO, AO, 64
-    lxvdsx vs16, o0, BO // load real part from B
+    lxvd2x vs16, o0, BO // load real part from B
-    lxvdsx vs17, o8, BO // load imag part from B
+    lxvd2x vs17, o16, BO // load imag part from B
-    addi BO, BO, 16
+    addi BO, BO, 32
     xvmaddadp vs32, vs8, vs20 // real*real, imag*real
     xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
@@ -2569,10 +2516,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi AO, AO, 64
-    lxvdsx vs16, o0, BO // load real part from B
+    lxvd2x vs16, o0, BO // load real part from B
-    lxvdsx vs17, o8, BO // load imag part from B
+    lxvd2x vs17, o16, BO // load imag part from B
-    addi BO, BO, 16
+    addi BO, BO, 32
     xvmuldp vs32, vs0, vs16 // real*real, imag*real
     xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@@ -2595,10 +2542,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi AO, AO, 64
-    lxvdsx vs16, o0, BO // load real part from B
+    lxvd2x vs16, o0, BO // load real part from B
-    lxvdsx vs17, o8, BO // load imag part from B
+    lxvd2x vs17, o16, BO // load imag part from B
-    addi BO, BO, 16
+    addi BO, BO, 32
     xvmaddadp vs32, vs0, vs16 // real*real, imag*real
     xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@@ -2748,10 +2695,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro LOAD1x2_1
-    lxvdsx vs16, o0, BO // load real part from B
+    lxvd2x vs16, o0, BO // load real part from B
-    lxvdsx vs17, o8, BO // load imag part from B
+    lxvd2x vs17, o16, BO // load imag part from B
-    addi BO, BO, 16
+    addi BO, BO, 32
     lxvd2x vs0, o0, AO // load real,imag from A
     lxvd2x vs1, o16, AO // load real,imag from A
@@ -2768,10 +2715,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi AO, AO, 32
-    lxvdsx vs20, o0, BO // load real part from B
+    lxvd2x vs20, o0, BO // load real part from B
-    lxvdsx vs21, o8, BO // load imag part from B
+    lxvd2x vs21, o16, BO // load imag part from B
-    addi BO, BO, 16
+    addi BO, BO, 32
     xvmuldp vs32, vs0, vs16 // real*real, imag*real
     xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@@ -2788,10 +2735,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi AO, AO, 32
-    lxvdsx vs20, o0, BO // load real part from B
+    lxvd2x vs20, o0, BO // load real part from B
-    lxvdsx vs21, o8, BO // load imag part from B
+    lxvd2x vs21, o16, BO // load imag part from B
-    addi BO, BO, 16
+    addi BO, BO, 32
     xvmaddadp vs32, vs0, vs16 // real*real, imag*real
     xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@@ -2808,10 +2755,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi AO, AO, 32
-    lxvdsx vs16, o0, BO // load real part from B
+    lxvd2x vs16, o0, BO // load real part from B
-    lxvdsx vs17, o8, BO // load imag part from B
+    lxvd2x vs17, o16, BO // load imag part from B
-    addi BO, BO, 16
+    addi BO, BO, 32
     xvmaddadp vs32, vs8, vs20 // real*real, imag*real
     xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
@@ -2839,10 +2786,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi AO, AO, 32
-    lxvdsx vs16, o0, BO // load real part from B
+    lxvd2x vs16, o0, BO // load real part from B
-    lxvdsx vs17, o8, BO // load imag part from B
+    lxvd2x vs17, o16, BO // load imag part from B
-    addi BO, BO, 16
+    addi BO, BO, 32
     xvmuldp vs32, vs0, vs16 // real*real, imag*real
     xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@@ -2859,10 +2806,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi AO, AO, 32
-    lxvdsx vs16, o0, BO // load real part from B
+    lxvd2x vs16, o0, BO // load real part from B
-    lxvdsx vs17, o8, BO // load imag part from B
+    lxvd2x vs17, o16, BO // load imag part from B
-    addi BO, BO, 16
+    addi BO, BO, 32
     xvmaddadp vs32, vs0, vs16 // real*real, imag*real
     xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@@ -2954,10 +2901,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro LOAD1x1_1
-    lxvdsx vs16, o0, BO // load real part from B
+    lxvd2x vs16, o0, BO // load real part from B
-    lxvdsx vs17, o8, BO // load imag part from B
+    lxvd2x vs17, o16, BO // load imag part from B
-    addi BO, BO, 16
+    addi BO, BO, 32
     lxvd2x vs0, o0, AO // load real,imag from A
@@ -2972,10 +2919,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi AO, AO, 16
-    lxvdsx vs20, o0, BO // load real part from B
+    lxvd2x vs20, o0, BO // load real part from B
-    lxvdsx vs21, o8, BO // load imag part from B
+    lxvd2x vs21, o16, BO // load imag part from B
-    addi BO, BO, 16
+    addi BO, BO, 32
     xvmuldp vs32, vs0, vs16 // real*real, imag*real
     xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@@ -2989,10 +2936,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi AO, AO, 16
-    lxvdsx vs20, o0, BO // load real part from B
+    lxvd2x vs20, o0, BO // load real part from B
-    lxvdsx vs21, o8, BO // load imag part from B
+    lxvd2x vs21, o16, BO // load imag part from B
-    addi BO, BO, 16
+    addi BO, BO, 32
     xvmaddadp vs32, vs0, vs16 // real*real, imag*real
     xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@@ -3006,10 +2953,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi AO, AO, 16
-    lxvdsx vs16, o0, BO // load real part from B
+    lxvd2x vs16, o0, BO // load real part from B
-    lxvdsx vs17, o8, BO // load imag part from B
+    lxvd2x vs17, o16, BO // load imag part from B
-    addi BO, BO, 16
+    addi BO, BO, 32
     xvmaddadp vs32, vs8, vs20 // real*real, imag*real
     xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
@@ -3032,10 +2979,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi AO, AO, 16
-    lxvdsx vs16, o0, BO // load real part from B
+    lxvd2x vs16, o0, BO // load real part from B
-    lxvdsx vs17, o8, BO // load imag part from B
+    lxvd2x vs17, o16, BO // load imag part from B
-    addi BO, BO, 16
+    addi BO, BO, 32
     xvmuldp vs32, vs0, vs16 // real*real, imag*real
     xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@@ -3049,10 +2996,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi AO, AO, 16
-    lxvdsx vs16, o0, BO // load real part from B
+    lxvd2x vs16, o0, BO // load real part from B
-    lxvdsx vs17, o8, BO // load imag part from B
+    lxvd2x vs17, o16, BO // load imag part from B
-    addi BO, BO, 16
+    addi BO, BO, 32
     xvmaddadp vs32, vs0, vs16 // real*real, imag*real
     xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
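Note (added for readers skimming the macros, not part of the commit): each xvmuldp/xvmaddadp pair above accumulates two of the four partial products of a double-complex multiply, exactly as the inline comments say ("real*real, imag*real" and "real*imag, imag*imag"). The sketch below shows the scalar equivalent in C. It assumes the packed B buffer supplies b_re and b_im already duplicated across both vector lanes, which is what allows the plain lxvd2x loads to replace the lxvdsx splats; the names zpair and zmul_partials are illustrative only and do not exist in the kernel.

/* Illustrative sketch only: the scalar arithmetic behind one
 * xvmuldp/xvmaddadp pair.  acc_rr_ir mirrors vs32 ("real*real, imag*real"),
 * acc_ri_ii mirrors vs33 ("real*imag, imag*imag"); the store macros later
 * combine the partials into the final complex result according to the
 * conjugation mode. */
typedef struct { double re, im; } zpair;   /* hypothetical helper type */

static void zmul_partials(zpair a, double b_re, double b_im,
                          zpair *acc_rr_ir, zpair *acc_ri_ii)
{
    acc_rr_ir->re += a.re * b_re;   /* real*real */
    acc_rr_ir->im += a.im * b_re;   /* imag*real */
    acc_ri_ii->re += a.re * b_im;   /* real*imag */
    acc_ri_ii->im += a.im * b_im;   /* imag*imag */
}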

View File

@@ -271,7 +271,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 #endif
-#include "zgemm_macros_8x2_power8.S"
+#include "ztrmm_macros_8x2_power8.S"
     cmpwi cr0, M, 0
     ble .L999

File diff suppressed because it is too large

param.h (14 changed lines)
View File

@@ -1964,8 +1964,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SNUMOPT 16
 #define DNUMOPT 8
-#define GEMM_DEFAULT_OFFSET_A 131072
+#define GEMM_DEFAULT_OFFSET_A 4096
-#define GEMM_DEFAULT_OFFSET_B 1024
+#define GEMM_DEFAULT_OFFSET_B 4096
 #define GEMM_DEFAULT_ALIGN 0x03fffUL
 #define SGEMM_DEFAULT_UNROLL_M 16
@@ -1980,17 +1980,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SGEMM_DEFAULT_P 960
 #define DGEMM_DEFAULT_P 480
 #define CGEMM_DEFAULT_P 720
-#define ZGEMM_DEFAULT_P 240
+#define ZGEMM_DEFAULT_P 480
 #define SGEMM_DEFAULT_Q 720
 #define DGEMM_DEFAULT_Q 720
 #define CGEMM_DEFAULT_Q 720
-#define ZGEMM_DEFAULT_Q 360
+#define ZGEMM_DEFAULT_Q 720
-#define SGEMM_DEFAULT_R 14400
+#define SGEMM_DEFAULT_R 21600
 #define DGEMM_DEFAULT_R 14400
-#define CGEMM_DEFAULT_R 14400
+#define CGEMM_DEFAULT_R 16200
-#define ZGEMM_DEFAULT_R 7200
+#define ZGEMM_DEFAULT_R 21600
 #define SYMV_P 8
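Note (added, not part of the commit): as a rough cross-check of the new ZGEMM blocking values, and assuming that ZGEMM_DEFAULT_P and ZGEMM_DEFAULT_Q bound the M- and K-dimension block sizes with one double-complex element occupying 16 bytes, the packed A panel grows from about 240*360*16 ≈ 1.3 MiB to 480*720*16 ≈ 5.3 MiB. A throwaway sketch that prints both figures:

/* Throwaway sanity check, not part of the build.  The reading of P/Q as
 * M/K block sizes is an assumption stated in the note above. */
#include <stdio.h>

int main(void)
{
    const long elem  = 16;                 /* bytes per double-complex    */
    const long old_p = 240, old_q = 360;   /* previous ZGEMM_DEFAULT_P/_Q */
    const long new_p = 480, new_q = 720;   /* updated  ZGEMM_DEFAULT_P/_Q */

    printf("old packed A panel: %ld KiB\n", old_p * old_q * elem / 1024);
    printf("new packed A panel: %ld KiB\n", new_p * new_q * elem / 1024);
    return 0;
}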