Merge pull request #837 from wernsaar/develop
updated zgemm- and ztrmm-kernel for POWER8
commit 0a4276bc2f
@@ -34,7 +34,8 @@ LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread
LIBVECLIB = -framework Accelerate

ESSL=/opt/ibm/lib
LIBESSL = -lessl $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.2/lib/libxl.a
#LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a

ifeq ($(OSNAME), WINNT)
@@ -259,7 +260,8 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
endif

essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \
	cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl
	cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \
	slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl

veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \
	scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \
@@ -312,6 +314,9 @@ slinpack.mkl : slinpack.$(SUFFIX)
slinpack.veclib : slinpack.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

slinpack.essl : slinpack.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Dlinpack ####################################################
dlinpack.goto : dlinpack.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
@@ -328,6 +333,9 @@ dlinpack.mkl : dlinpack.$(SUFFIX)
dlinpack.veclib : dlinpack.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dlinpack.essl : dlinpack.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Clinpack ####################################################

clinpack.goto : clinpack.$(SUFFIX) ../$(LIBNAME)
@@ -345,6 +353,9 @@ clinpack.mkl : clinpack.$(SUFFIX)
clinpack.veclib : clinpack.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

clinpack.essl : clinpack.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Zlinpack ####################################################

zlinpack.goto : zlinpack.$(SUFFIX) ../$(LIBNAME)
@@ -362,6 +373,9 @@ zlinpack.mkl : zlinpack.$(SUFFIX)
zlinpack.veclib : zlinpack.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

zlinpack.essl : zlinpack.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Scholesky ###################################################

scholesky.goto : scholesky.$(SUFFIX) ../$(LIBNAME)
@@ -798,7 +798,7 @@ Lmcount$lazy_ptr:
#elif defined(PPC440FP2)
#define BUFFER_SIZE ( 16 << 20)
#elif defined(POWER8)
#define BUFFER_SIZE ( 64 << 20)
#define BUFFER_SIZE ( 32 << 20)
#else
#define BUFFER_SIZE ( 16 << 20)
#endif
@@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif

#ifdef __64BIT__
#define STACKSIZE 400
#define STACKSIZE 32000
#define ALPHA_R_SP 296(SP)
#define ALPHA_I_SP 304(SP)
#define FZERO 312(SP)
@@ -136,6 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define alpha_sr vs30
#define alpha_si vs31

#define FRAMEPOINTER r12

#define BBUFFER r14
#define L r15
@@ -161,6 +162,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
PROFCODE

mr FRAMEPOINTER, SP
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
li r0, 0
@@ -233,37 +238,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#ifdef linux
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#else
#ifdef DOUBLE
lwz B, FRAMESLOT(0) + STACKSIZE(SP)
lwz C, FRAMESLOT(1) + STACKSIZE(SP)
lwz LDC, FRAMESLOT(2) + STACKSIZE(SP)
lwz B, FRAMESLOT(0) + 0(FRAMEPOINTER)
lwz C, FRAMESLOT(1) + 0(FRAMEPOINTER)
lwz LDC, FRAMESLOT(2) + 0(FRAMEPOINTER)
#else
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif
#endif
#endif

#ifdef TRMMKERNEL
#if defined(linux) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#endif

#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#else
#ifdef DOUBLE
lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP)
lwz OFFSET, FRAMESLOT(3) + 0(FRAMEPOINTER)
#else
lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#endif
#endif
#endif
@@ -290,9 +295,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
li o32 , 32
li o48 , 48

li T1, 256
slwi T1, T1, 9 // 131072
sub BBUFFER, A, T1 // temp buffer for B unrolled
addi BBUFFER, SP, 512+4096
li T1, -4096
and BBUFFER, BBUFFER, T1


#ifdef __64BIT__
@@ -392,6 +397,9 @@ L999:
#endif

addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE

blr
@@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif

#ifdef __64BIT__
#define STACKSIZE 340
#define STACKSIZE 32752
#define ALPHA_SP 296(SP)
#define FZERO 304(SP)
#else
@@ -132,6 +132,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#define o0 0

#define FRAMEPOINTER r12

#define BBUFFER r14
#define o4 r15
#define o12 r16
@@ -160,6 +162,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
PROFCODE

mr FRAMEPOINTER, SP
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
li r0, 0
@@ -231,7 +237,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif
#endif
@@ -239,17 +245,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#if defined(TRMMKERNEL)
#if defined(linux) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif

#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER)
#else
#ifdef DOUBLE
lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#else
lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
lwz OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif
#endif
#endif
@@ -271,9 +277,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
li o32, 32
li o48, 48

li T1, 256
slwi T1, T1, 9 // 131072
sub BBUFFER, A, T1 // temp buffer for B unrolled
addi BBUFFER, SP, 512+4096
li T1, -4096
and BBUFFER, BBUFFER, T1

addi T1, SP, 300
stxsspx f1, o0 , T1
@@ -355,6 +361,9 @@ L999:
#endif

addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE

blr
@@ -1,38 +1,3 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
@@ -82,7 +47,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif

#ifdef __64BIT__
#define STACKSIZE 320
#define STACKSIZE 32000
#define ALPHA_R_SP 296(SP)
#define ALPHA_I_SP 304(SP)
#define FZERO 312(SP)
@@ -133,11 +98,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define alpha_r vs30
#define alpha_i vs31


#define FRAMEPOINTER r12

#define BBUFFER r14

#define L r15
#define ALPHA r16
#define o24 r17
#define T2 r19
#define KK r20
#define BBO r20
#define o8 r21
#define I r22
#define J r23
@@ -156,8 +126,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
PROFCODE

addi SP, SP, -STACKSIZE
li r0, 0
mr FRAMEPOINTER, SP
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
li r0, 0

stfd f14, 0(SP)
stfd f15, 8(SP)
@@ -200,6 +174,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
#else
stw r31, 144(SP)
stw r30, 148(SP)
@@ -226,37 +201,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#ifdef linux
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#else
#ifdef DOUBLE
lwz B, FRAMESLOT(0) + STACKSIZE(SP)
lwz C, FRAMESLOT(1) + STACKSIZE(SP)
lwz LDC, FRAMESLOT(2) + STACKSIZE(SP)
lwz B, FRAMESLOT(0) + 0(FRAMEPOINTER)
lwz C, FRAMESLOT(1) + 0(FRAMEPOINTER)
lwz LDC, FRAMESLOT(2) + 0(FRAMEPOINTER)
#else
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif
#endif
#endif

#ifdef TRMMKERNEL
#if defined(linux) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#endif

#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#else
#ifdef DOUBLE
lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP)
lwz OFFSET, FRAMESLOT(3) + 0(FRAMEPOINTER)
#else
lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#endif
#endif
#endif
@@ -268,34 +243,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "zgemm_macros_8x2_power8.S"

cmpwi cr0, M, 0
ble .L999
ble L999
cmpwi cr0, N, 0
ble .L999
ble L999
cmpwi cr0, K, 0
ble .L999
ble L999

slwi LDC, LDC, ZBASE_SHIFT
li PRE, 256
li PRE, 384
li o8 , 8
li o16 , 16
li o24 , 24
li o32 , 32
li o48 , 48

addi BBUFFER, SP, 512+4096
li T1, -4096
and BBUFFER, BBUFFER, T1

#ifdef __64BIT__
addi ALPHA, SP, 296
#else
addi ALPHA, SP, 224
#endif

lxvdsx alpha_r, 0, ALPHA
lxvdsx alpha_i, o8, ALPHA
lxsdx alpha_r, 0, ALPHA
lxsdx alpha_i, o8, ALPHA

.align 5
.align 4

#include "zgemm_logic_8x2_power8.S"

.L999:
L999:
addi r3, 0, 0

lfd f14, 0(SP)
@@ -339,6 +318,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
#else
lwz r31, 144(SP)
lwz r30, 148(SP)
@@ -360,6 +340,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif

addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE

blr
@@ -1,83 +1,111 @@
srawi. J, N, 1
ble .LZGEMM_L2_END
ble ZGEMM_L2_END

ZGEMM_L2_BEGIN:

mr BO, B
mr BBO, BBUFFER
slwi T1, K, 1

ZGEMM_L2_COPYB:

lxvdsx vs4, o0, BO // b0_r
lxvdsx vs5, o8, BO // b0_i
addi BO, BO, 16
stxvd2x vs4, o0, BBO
stxvd2x vs5, o16, BBO
addic. T1, T1, -1
addi BBO, BBO, 32

bge ZGEMM_L2_COPYB

.LZGEMM_L2_BEGIN:

mr CO, C
mr AO, A
slwi T1, LDC , 1
add C, C, T1
srawi. I, M, 3
ble .LZGEMM_L2x8_END
ble ZGEMM_L2x8_END

.LZGEMM_L2x8_BEGIN:
ZGEMM_L2x8_BEGIN:


mr BO, B
mr BO, BBUFFER
srawi. L, K, 3
ble .LZGEMM_L2x8_SUB0
ble ZGEMM_L2x8_SUB0
cmpwi cr0, L, 1
ble .LZGEMM_L2x8_SUB4
ble ZGEMM_L2x8_SUB4

.LZGEMM_L2x8_LOOP_START:
ZGEMM_L2x8_LOOP_START:

dcbt AO, PRE
dcbt BO, PRE
LOAD2x8_1
dcbt AO, PRE
KERNEL2x8_I1
dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2

dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2

addic. L, L, -2
ble .LZGEMM_L2x8_LOOP_END
ble ZGEMM_L2x8_LOOP_END

.align 5

.LZGEMM_L2x8_LOOP:
ZGEMM_L2x8_LOOP:

dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2

dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2

addic. L, L, -1
bgt .LZGEMM_L2x8_LOOP
bgt ZGEMM_L2x8_LOOP

.LZGEMM_L2x8_LOOP_END:
ZGEMM_L2x8_LOOP_END:

dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_2

dcbt AO, PRE
@@ -88,9 +116,9 @@
KERNEL2x8_1
KERNEL2x8_E2

b .LZGEMM_L2x8_SUB1
b ZGEMM_L2x8_SUB1

.LZGEMM_L2x8_SUB4:
ZGEMM_L2x8_SUB4:

dcbt AO, PRE
KERNEL2x8_SUBI1
|
|||
KERNEL2x8_SUB1
|
||||
KERNEL2x8_SUB1
|
||||
|
||||
b .LZGEMM_L2x8_SUB1
|
||||
b ZGEMM_L2x8_SUB1
|
||||
|
||||
.LZGEMM_L2x8_SUB0:
|
||||
ZGEMM_L2x8_SUB0:
|
||||
|
||||
andi. L, K, 7
|
||||
|
||||
KERNEL2x8_SUBI1
|
||||
|
||||
addic. L, L, -1
|
||||
ble .LZGEMM_L2x8_SAVE
|
||||
b .LZGEMM_L2x8_SUB2
|
||||
ble ZGEMM_L2x8_SAVE
|
||||
b ZGEMM_L2x8_SUB2
|
||||
|
||||
.LZGEMM_L2x8_SUB1:
|
||||
ZGEMM_L2x8_SUB1:
|
||||
|
||||
andi. L, K, 7
|
||||
ble .LZGEMM_L2x8_SAVE
|
||||
ble ZGEMM_L2x8_SAVE
|
||||
|
||||
.LZGEMM_L2x8_SUB2:
|
||||
ZGEMM_L2x8_SUB2:
|
||||
|
||||
KERNEL2x8_SUB1
|
||||
|
||||
addic. L, L, -1
|
||||
bgt .LZGEMM_L2x8_SUB2
|
||||
bgt ZGEMM_L2x8_SUB2
|
||||
|
||||
.LZGEMM_L2x8_SAVE:
|
||||
ZGEMM_L2x8_SAVE:
|
||||
|
||||
SAVE2x8
|
||||
|
||||
addic. I, I, -1
|
||||
bgt .LZGEMM_L2x8_BEGIN
|
||||
bgt ZGEMM_L2x8_BEGIN
|
||||
|
||||
.LZGEMM_L2x8_END:
|
||||
ZGEMM_L2x8_END:
|
||||
|
||||
.LZGEMM_L2x4_BEGIN:
|
||||
ZGEMM_L2x4_BEGIN:
|
||||
|
||||
andi. T2, M, 7
|
||||
ble .LZGEMM_L2x1_END
|
||||
ble ZGEMM_L2x1_END
|
||||
|
||||
andi. T1, M, 4
|
||||
ble .LZGEMM_L2x4_END
|
||||
mr BO, B
|
||||
ble ZGEMM_L2x4_END
|
||||
mr BO, BBUFFER
|
||||
srawi. L, K, 3
|
||||
ble .LZGEMM_L2x4_SUB0
|
||||
ble ZGEMM_L2x4_SUB0
|
||||
cmpwi cr0, L, 1
|
||||
ble .LZGEMM_L2x4_SUB4
|
||||
ble ZGEMM_L2x4_SUB4
|
||||
|
||||
.LZGEMM_L2x4_LOOP_START:
|
||||
ZGEMM_L2x4_LOOP_START:
|
||||
|
||||
LOAD2x4_1
|
||||
KERNEL2x4_I1
|
||||
|
@ -166,11 +194,11 @@
|
|||
KERNEL2x4_2
|
||||
|
||||
addic. L, L, -2
|
||||
ble .LZGEMM_L2x4_LOOP_END
|
||||
ble ZGEMM_L2x4_LOOP_END
|
||||
|
||||
.align 5
|
||||
|
||||
.LZGEMM_L2x4_LOOP:
|
||||
ZGEMM_L2x4_LOOP:
|
||||
|
||||
KERNEL2x4_1
|
||||
KERNEL2x4_2
|
||||
|
@ -183,9 +211,9 @@
|
|||
KERNEL2x4_2
|
||||
|
||||
addic. L, L, -1
|
||||
bgt .LZGEMM_L2x4_LOOP
|
||||
bgt ZGEMM_L2x4_LOOP
|
||||
|
||||
.LZGEMM_L2x4_LOOP_END:
|
||||
ZGEMM_L2x4_LOOP_END:
|
||||
|
||||
KERNEL2x4_1
|
||||
KERNEL2x4_2
|
||||
|
@ -197,9 +225,9 @@
|
|||
KERNEL2x4_1
|
||||
KERNEL2x4_E2
|
||||
|
||||
b .LZGEMM_L2x4_SUB1
|
||||
b ZGEMM_L2x4_SUB1
|
||||
|
||||
.LZGEMM_L2x4_SUB4:
|
||||
ZGEMM_L2x4_SUB4:
|
||||
|
||||
KERNEL2x4_SUBI1
|
||||
KERNEL2x4_SUB1
|
||||
|
@ -211,48 +239,48 @@
|
|||
KERNEL2x4_SUB1
|
||||
KERNEL2x4_SUB1
|
||||
|
||||
b .LZGEMM_L2x4_SUB1
|
||||
b ZGEMM_L2x4_SUB1
|
||||
|
||||
.LZGEMM_L2x4_SUB0:
|
||||
ZGEMM_L2x4_SUB0:
|
||||
|
||||
andi. L, K, 7
|
||||
|
||||
KERNEL2x4_SUBI1
|
||||
|
||||
addic. L, L, -1
|
||||
ble .LZGEMM_L2x4_SAVE
|
||||
b .LZGEMM_L2x4_SUB2
|
||||
ble ZGEMM_L2x4_SAVE
|
||||
b ZGEMM_L2x4_SUB2
|
||||
|
||||
.LZGEMM_L2x4_SUB1:
|
||||
ZGEMM_L2x4_SUB1:
|
||||
|
||||
andi. L, K, 7
|
||||
ble .LZGEMM_L2x4_SAVE
|
||||
ble ZGEMM_L2x4_SAVE
|
||||
|
||||
.LZGEMM_L2x4_SUB2:
|
||||
ZGEMM_L2x4_SUB2:
|
||||
|
||||
KERNEL2x4_SUB1
|
||||
|
||||
addic. L, L, -1
|
||||
bgt .LZGEMM_L2x4_SUB2
|
||||
bgt ZGEMM_L2x4_SUB2
|
||||
|
||||
.LZGEMM_L2x4_SAVE:
|
||||
ZGEMM_L2x4_SAVE:
|
||||
|
||||
SAVE2x4
|
||||
|
||||
.LZGEMM_L2x4_END:
|
||||
ZGEMM_L2x4_END:
|
||||
|
||||
.LZGEMM_L2x2_BEGIN:
|
||||
ZGEMM_L2x2_BEGIN:
|
||||
|
||||
|
||||
andi. T1, M, 2
|
||||
ble .LZGEMM_L2x2_END
|
||||
mr BO, B
|
||||
ble ZGEMM_L2x2_END
|
||||
mr BO, BBUFFER
|
||||
srawi. L, K, 3
|
||||
ble .LZGEMM_L2x2_SUB0
|
||||
ble ZGEMM_L2x2_SUB0
|
||||
cmpwi cr0, L, 1
|
||||
ble .LZGEMM_L2x2_SUB4
|
||||
ble ZGEMM_L2x2_SUB4
|
||||
|
||||
.LZGEMM_L2x2_LOOP_START:
|
||||
ZGEMM_L2x2_LOOP_START:
|
||||
|
||||
LOAD2x2_1
|
||||
KERNEL2x2_I1
|
||||
|
@ -266,11 +294,11 @@
|
|||
KERNEL2x2_2
|
||||
|
||||
addic. L, L, -2
|
||||
ble .LZGEMM_L2x2_LOOP_END
|
||||
ble ZGEMM_L2x2_LOOP_END
|
||||
|
||||
.align 5
|
||||
|
||||
.LZGEMM_L2x2_LOOP:
|
||||
ZGEMM_L2x2_LOOP:
|
||||
|
||||
KERNEL2x2_1
|
||||
KERNEL2x2_2
|
||||
|
@ -283,9 +311,9 @@
|
|||
KERNEL2x2_2
|
||||
|
||||
addic. L, L, -1
|
||||
bgt .LZGEMM_L2x2_LOOP
|
||||
bgt ZGEMM_L2x2_LOOP
|
||||
|
||||
.LZGEMM_L2x2_LOOP_END:
|
||||
ZGEMM_L2x2_LOOP_END:
|
||||
|
||||
KERNEL2x2_1
|
||||
KERNEL2x2_2
|
||||
|
@ -297,9 +325,9 @@
|
|||
KERNEL2x2_1
|
||||
KERNEL2x2_E2
|
||||
|
||||
b .LZGEMM_L2x2_SUB1
|
||||
b ZGEMM_L2x2_SUB1
|
||||
|
||||
.LZGEMM_L2x2_SUB4:
|
||||
ZGEMM_L2x2_SUB4:
|
||||
|
||||
KERNEL2x2_SUBI1
|
||||
KERNEL2x2_SUB1
|
||||
|
@ -311,48 +339,48 @@
|
|||
KERNEL2x2_SUB1
|
||||
KERNEL2x2_SUB1
|
||||
|
||||
b .LZGEMM_L2x2_SUB1
|
||||
b ZGEMM_L2x2_SUB1
|
||||
|
||||
.LZGEMM_L2x2_SUB0:
|
||||
ZGEMM_L2x2_SUB0:
|
||||
|
||||
andi. L, K, 7
|
||||
|
||||
KERNEL2x2_SUBI1
|
||||
|
||||
addic. L, L, -1
|
||||
ble .LZGEMM_L2x2_SAVE
|
||||
b .LZGEMM_L2x2_SUB2
|
||||
ble ZGEMM_L2x2_SAVE
|
||||
b ZGEMM_L2x2_SUB2
|
||||
|
||||
.LZGEMM_L2x2_SUB1:
|
||||
ZGEMM_L2x2_SUB1:
|
||||
|
||||
andi. L, K, 7
|
||||
ble .LZGEMM_L2x2_SAVE
|
||||
ble ZGEMM_L2x2_SAVE
|
||||
|
||||
.LZGEMM_L2x2_SUB2:
|
||||
ZGEMM_L2x2_SUB2:
|
||||
|
||||
KERNEL2x2_SUB1
|
||||
|
||||
addic. L, L, -1
|
||||
bgt .LZGEMM_L2x2_SUB2
|
||||
bgt ZGEMM_L2x2_SUB2
|
||||
|
||||
.LZGEMM_L2x2_SAVE:
|
||||
ZGEMM_L2x2_SAVE:
|
||||
|
||||
SAVE2x2
|
||||
|
||||
.LZGEMM_L2x2_END:
|
||||
ZGEMM_L2x2_END:
|
||||
|
||||
.LZGEMM_L2x1_BEGIN:
|
||||
ZGEMM_L2x1_BEGIN:
|
||||
|
||||
|
||||
andi. T1, M, 1
|
||||
ble .LZGEMM_L2x1_END
|
||||
mr BO, B
|
||||
ble ZGEMM_L2x1_END
|
||||
mr BO, BBUFFER
|
||||
srawi. L, K, 3
|
||||
ble .LZGEMM_L2x1_SUB0
|
||||
ble ZGEMM_L2x1_SUB0
|
||||
cmpwi cr0, L, 1
|
||||
ble .LZGEMM_L2x1_SUB4
|
||||
ble ZGEMM_L2x1_SUB4
|
||||
|
||||
.LZGEMM_L2x1_LOOP_START:
|
||||
ZGEMM_L2x1_LOOP_START:
|
||||
|
||||
LOAD2x1_1
|
||||
KERNEL2x1_I1
|
||||
|
@ -366,11 +394,11 @@
|
|||
KERNEL2x1_2
|
||||
|
||||
addic. L, L, -2
|
||||
ble .LZGEMM_L2x1_LOOP_END
|
||||
ble ZGEMM_L2x1_LOOP_END
|
||||
|
||||
.align 5
|
||||
|
||||
.LZGEMM_L2x1_LOOP:
|
||||
ZGEMM_L2x1_LOOP:
|
||||
|
||||
KERNEL2x1_1
|
||||
KERNEL2x1_2
|
||||
|
@ -383,9 +411,9 @@
|
|||
KERNEL2x1_2
|
||||
|
||||
addic. L, L, -1
|
||||
bgt .LZGEMM_L2x1_LOOP
|
||||
bgt ZGEMM_L2x1_LOOP
|
||||
|
||||
.LZGEMM_L2x1_LOOP_END:
|
||||
ZGEMM_L2x1_LOOP_END:
|
||||
|
||||
KERNEL2x1_1
|
||||
KERNEL2x1_2
|
||||
|
@ -397,9 +425,9 @@
|
|||
KERNEL2x1_1
|
||||
KERNEL2x1_E2
|
||||
|
||||
b .LZGEMM_L2x1_SUB1
|
||||
b ZGEMM_L2x1_SUB1
|
||||
|
||||
.LZGEMM_L2x1_SUB4:
|
||||
ZGEMM_L2x1_SUB4:
|
||||
|
||||
KERNEL2x1_SUBI1
|
||||
KERNEL2x1_SUB1
|
||||
|
@ -411,72 +439,89 @@
|
|||
KERNEL2x1_SUB1
|
||||
KERNEL2x1_SUB1
|
||||
|
||||
b .LZGEMM_L2x1_SUB1
|
||||
b ZGEMM_L2x1_SUB1
|
||||
|
||||
.LZGEMM_L2x1_SUB0:
|
||||
ZGEMM_L2x1_SUB0:
|
||||
|
||||
andi. L, K, 7
|
||||
|
||||
KERNEL2x1_SUBI1
|
||||
|
||||
addic. L, L, -1
|
||||
ble .LZGEMM_L2x1_SAVE
|
||||
b .LZGEMM_L2x1_SUB2
|
||||
ble ZGEMM_L2x1_SAVE
|
||||
b ZGEMM_L2x1_SUB2
|
||||
|
||||
.LZGEMM_L2x1_SUB1:
|
||||
ZGEMM_L2x1_SUB1:
|
||||
|
||||
andi. L, K, 7
|
||||
ble .LZGEMM_L2x1_SAVE
|
||||
ble ZGEMM_L2x1_SAVE
|
||||
|
||||
.LZGEMM_L2x1_SUB2:
|
||||
ZGEMM_L2x1_SUB2:
|
||||
|
||||
KERNEL2x1_SUB1
|
||||
|
||||
addic. L, L, -1
|
||||
bgt .LZGEMM_L2x1_SUB2
|
||||
bgt ZGEMM_L2x1_SUB2
|
||||
|
||||
.LZGEMM_L2x1_SAVE:
|
||||
ZGEMM_L2x1_SAVE:
|
||||
|
||||
SAVE2x1
|
||||
|
||||
.LZGEMM_L2x1_END:
|
||||
ZGEMM_L2x1_END:
|
||||
|
||||
slwi T1, K, 5
|
||||
add B, B, T1
|
||||
|
||||
addic. J, J, -1
|
||||
bgt .LZGEMM_L2_BEGIN
|
||||
bgt ZGEMM_L2_BEGIN
|
||||
|
||||
andi. T2, N, 1
|
||||
ble .L999
|
||||
ble L999
|
||||
|
||||
.LZGEMM_L2_END:
|
||||
ZGEMM_L2_END:
|
||||
|
||||
b .LZGEMM_L1_BEGIN
|
||||
b ZGEMM_L1_BEGIN
|
||||
|
||||
.L999_H1:
|
||||
L999_H1:
|
||||
|
||||
b .L999
|
||||
b L999
|
||||
|
||||
ZGEMM_L1_BEGIN:
|
||||
|
||||
mr BO, B
|
||||
mr BBO, BBUFFER
|
||||
slwi T1, K, 0
|
||||
|
||||
ZGEMM_L1_COPYB:
|
||||
|
||||
lxvdsx vs4, o0, BO // b0_r
|
||||
lxvdsx vs5, o8, BO // b0_i
|
||||
addi BO, BO, 16
|
||||
stxvd2x vs4, o0, BBO
|
||||
stxvd2x vs5, o16, BBO
|
||||
addic. T1, T1, -1
|
||||
addi BBO, BBO, 32
|
||||
|
||||
bge ZGEMM_L1_COPYB
|
||||
|
||||
.LZGEMM_L1_BEGIN:
|
||||
|
||||
andi. T1, N, 1
|
||||
ble .LZGEMM_L1_END
|
||||
ble ZGEMM_L1_END
|
||||
mr CO, C
|
||||
mr AO, A
|
||||
srawi. I, M, 3
|
||||
ble .LZGEMM_L1x8_END
|
||||
ble ZGEMM_L1x8_END
|
||||
|
||||
.LZGEMM_L1x8_BEGIN:
|
||||
ZGEMM_L1x8_BEGIN:
|
||||
|
||||
|
||||
mr BO, B
|
||||
mr BO, BBUFFER
|
||||
srawi. L, K, 3
|
||||
ble .LZGEMM_L1x8_SUB0
|
||||
ble ZGEMM_L1x8_SUB0
|
||||
cmpwi cr0, L, 1
|
||||
ble .LZGEMM_L1x8_SUB4
|
||||
ble ZGEMM_L1x8_SUB4
|
||||
|
||||
.LZGEMM_L1x8_LOOP_START:
|
||||
ZGEMM_L1x8_LOOP_START:
|
||||
|
||||
dcbt AO, PRE
|
||||
LOAD1x8_1
|
||||
|
@ -499,11 +544,11 @@
|
|||
KERNEL1x8_2
|
||||
|
||||
addic. L, L, -2
|
||||
ble .LZGEMM_L1x8_LOOP_END
|
||||
ble ZGEMM_L1x8_LOOP_END
|
||||
|
||||
.align 5
|
||||
|
||||
.LZGEMM_L1x8_LOOP:
|
||||
ZGEMM_L1x8_LOOP:
|
||||
|
||||
dcbt AO, PRE
|
||||
KERNEL1x8_1
|
||||
|
@ -524,9 +569,9 @@
|
|||
KERNEL1x8_2
|
||||
|
||||
addic. L, L, -1
|
||||
bgt .LZGEMM_L1x8_LOOP
|
||||
bgt ZGEMM_L1x8_LOOP
|
||||
|
||||
.LZGEMM_L1x8_LOOP_END:
|
||||
ZGEMM_L1x8_LOOP_END:
|
||||
|
||||
dcbt AO, PRE
|
||||
KERNEL1x8_1
|
||||
|
@ -545,9 +590,9 @@
|
|||
KERNEL1x8_1
|
||||
KERNEL1x8_E2
|
||||
|
||||
b .LZGEMM_L1x8_SUB1
|
||||
b ZGEMM_L1x8_SUB1
|
||||
|
||||
.LZGEMM_L1x8_SUB4:
|
||||
ZGEMM_L1x8_SUB4:
|
||||
|
||||
dcbt AO, PRE
|
||||
KERNEL1x8_SUBI1
|
||||
|
@ -563,53 +608,53 @@
|
|||
KERNEL1x8_SUB1
|
||||
KERNEL1x8_SUB1
|
||||
|
||||
b .LZGEMM_L1x8_SUB1
|
||||
b ZGEMM_L1x8_SUB1
|
||||
|
||||
.LZGEMM_L1x8_SUB0:
|
||||
ZGEMM_L1x8_SUB0:
|
||||
|
||||
andi. L, K, 7
|
||||
|
||||
KERNEL1x8_SUBI1
|
||||
|
||||
addic. L, L, -1
|
||||
ble .LZGEMM_L1x8_SAVE
|
||||
b .LZGEMM_L1x8_SUB2
|
||||
ble ZGEMM_L1x8_SAVE
|
||||
b ZGEMM_L1x8_SUB2
|
||||
|
||||
.LZGEMM_L1x8_SUB1:
|
||||
ZGEMM_L1x8_SUB1:
|
||||
|
||||
andi. L, K, 7
|
||||
ble .LZGEMM_L1x8_SAVE
|
||||
ble ZGEMM_L1x8_SAVE
|
||||
|
||||
.LZGEMM_L1x8_SUB2:
|
||||
ZGEMM_L1x8_SUB2:
|
||||
|
||||
KERNEL1x8_SUB1
|
||||
|
||||
addic. L, L, -1
|
||||
bgt .LZGEMM_L1x8_SUB2
|
||||
bgt ZGEMM_L1x8_SUB2
|
||||
|
||||
.LZGEMM_L1x8_SAVE:
|
||||
ZGEMM_L1x8_SAVE:
|
||||
|
||||
SAVE1x8
|
||||
|
||||
addic. I, I, -1
|
||||
bgt .LZGEMM_L1x8_BEGIN
|
||||
bgt ZGEMM_L1x8_BEGIN
|
||||
|
||||
.LZGEMM_L1x8_END:
|
||||
ZGEMM_L1x8_END:
|
||||
|
||||
.LZGEMM_L1x4_BEGIN:
|
||||
ZGEMM_L1x4_BEGIN:
|
||||
|
||||
andi. T2, M, 7
|
||||
ble .LZGEMM_L1x1_END
|
||||
ble ZGEMM_L1x1_END
|
||||
|
||||
andi. T1, M, 4
|
||||
ble .LZGEMM_L1x4_END
|
||||
mr BO, B
|
||||
ble ZGEMM_L1x4_END
|
||||
mr BO, BBUFFER
|
||||
srawi. L, K, 3
|
||||
ble .LZGEMM_L1x4_SUB0
|
||||
ble ZGEMM_L1x4_SUB0
|
||||
cmpwi cr0, L, 1
|
||||
ble .LZGEMM_L1x4_SUB4
|
||||
ble ZGEMM_L1x4_SUB4
|
||||
|
||||
.LZGEMM_L1x4_LOOP_START:
|
||||
ZGEMM_L1x4_LOOP_START:
|
||||
|
||||
LOAD1x4_1
|
||||
KERNEL1x4_I1
|
||||
|
@ -623,11 +668,11 @@
|
|||
KERNEL1x4_2
|
||||
|
||||
addic. L, L, -2
|
||||
ble .LZGEMM_L1x4_LOOP_END
|
||||
ble ZGEMM_L1x4_LOOP_END
|
||||
|
||||
.align 5
|
||||
|
||||
.LZGEMM_L1x4_LOOP:
|
||||
ZGEMM_L1x4_LOOP:
|
||||
|
||||
KERNEL1x4_1
|
||||
KERNEL1x4_2
|
||||
|
@ -640,9 +685,9 @@
|
|||
KERNEL1x4_2
|
||||
|
||||
addic. L, L, -1
|
||||
bgt .LZGEMM_L1x4_LOOP
|
||||
bgt ZGEMM_L1x4_LOOP
|
||||
|
||||
.LZGEMM_L1x4_LOOP_END:
|
||||
ZGEMM_L1x4_LOOP_END:
|
||||
|
||||
KERNEL1x4_1
|
||||
KERNEL1x4_2
|
||||
|
@ -654,9 +699,9 @@
|
|||
KERNEL1x4_1
|
||||
KERNEL1x4_E2
|
||||
|
||||
b .LZGEMM_L1x4_SUB1
|
||||
b ZGEMM_L1x4_SUB1
|
||||
|
||||
.LZGEMM_L1x4_SUB4:
|
||||
ZGEMM_L1x4_SUB4:
|
||||
|
||||
KERNEL1x4_SUBI1
|
||||
KERNEL1x4_SUB1
|
||||
|
@ -668,48 +713,48 @@
|
|||
KERNEL1x4_SUB1
|
||||
KERNEL1x4_SUB1
|
||||
|
||||
b .LZGEMM_L1x4_SUB1
|
||||
b ZGEMM_L1x4_SUB1
|
||||
|
||||
.LZGEMM_L1x4_SUB0:
|
||||
ZGEMM_L1x4_SUB0:
|
||||
|
||||
andi. L, K, 7
|
||||
|
||||
KERNEL1x4_SUBI1
|
||||
|
||||
addic. L, L, -1
|
||||
ble .LZGEMM_L1x4_SAVE
|
||||
b .LZGEMM_L1x4_SUB2
|
||||
ble ZGEMM_L1x4_SAVE
|
||||
b ZGEMM_L1x4_SUB2
|
||||
|
||||
.LZGEMM_L1x4_SUB1:
|
||||
ZGEMM_L1x4_SUB1:
|
||||
|
||||
andi. L, K, 7
|
||||
ble .LZGEMM_L1x4_SAVE
|
||||
ble ZGEMM_L1x4_SAVE
|
||||
|
||||
.LZGEMM_L1x4_SUB2:
|
||||
ZGEMM_L1x4_SUB2:
|
||||
|
||||
KERNEL1x4_SUB1
|
||||
|
||||
addic. L, L, -1
|
||||
bgt .LZGEMM_L1x4_SUB2
|
||||
bgt ZGEMM_L1x4_SUB2
|
||||
|
||||
.LZGEMM_L1x4_SAVE:
|
||||
ZGEMM_L1x4_SAVE:
|
||||
|
||||
SAVE1x4
|
||||
|
||||
.LZGEMM_L1x4_END:
|
||||
ZGEMM_L1x4_END:
|
||||
|
||||
.LZGEMM_L1x2_BEGIN:
|
||||
ZGEMM_L1x2_BEGIN:
|
||||
|
||||
|
||||
andi. T1, M, 2
|
||||
ble .LZGEMM_L1x2_END
|
||||
mr BO, B
|
||||
ble ZGEMM_L1x2_END
|
||||
mr BO, BBUFFER
|
||||
srawi. L, K, 3
|
||||
ble .LZGEMM_L1x2_SUB0
|
||||
ble ZGEMM_L1x2_SUB0
|
||||
cmpwi cr0, L, 1
|
||||
ble .LZGEMM_L1x2_SUB4
|
||||
ble ZGEMM_L1x2_SUB4
|
||||
|
||||
.LZGEMM_L1x2_LOOP_START:
|
||||
ZGEMM_L1x2_LOOP_START:
|
||||
|
||||
LOAD1x2_1
|
||||
KERNEL1x2_I1
|
||||
|
@ -723,11 +768,11 @@
|
|||
KERNEL1x2_2
|
||||
|
||||
addic. L, L, -2
|
||||
ble .LZGEMM_L1x2_LOOP_END
|
||||
ble ZGEMM_L1x2_LOOP_END
|
||||
|
||||
.align 5
|
||||
|
||||
.LZGEMM_L1x2_LOOP:
|
||||
ZGEMM_L1x2_LOOP:
|
||||
|
||||
KERNEL1x2_1
|
||||
KERNEL1x2_2
|
||||
|
@ -740,9 +785,9 @@
|
|||
KERNEL1x2_2
|
||||
|
||||
addic. L, L, -1
|
||||
bgt .LZGEMM_L1x2_LOOP
|
||||
bgt ZGEMM_L1x2_LOOP
|
||||
|
||||
.LZGEMM_L1x2_LOOP_END:
|
||||
ZGEMM_L1x2_LOOP_END:
|
||||
|
||||
KERNEL1x2_1
|
||||
KERNEL1x2_2
|
||||
|
@ -754,9 +799,9 @@
|
|||
KERNEL1x2_1
|
||||
KERNEL1x2_E2
|
||||
|
||||
b .LZGEMM_L1x2_SUB1
|
||||
b ZGEMM_L1x2_SUB1
|
||||
|
||||
.LZGEMM_L1x2_SUB4:
|
||||
ZGEMM_L1x2_SUB4:
|
||||
|
||||
KERNEL1x2_SUBI1
|
||||
KERNEL1x2_SUB1
|
||||
|
@ -768,48 +813,48 @@
|
|||
KERNEL1x2_SUB1
|
||||
KERNEL1x2_SUB1
|
||||
|
||||
b .LZGEMM_L1x2_SUB1
|
||||
b ZGEMM_L1x2_SUB1
|
||||
|
||||
.LZGEMM_L1x2_SUB0:
|
||||
ZGEMM_L1x2_SUB0:
|
||||
|
||||
andi. L, K, 7
|
||||
|
||||
KERNEL1x2_SUBI1
|
||||
|
||||
addic. L, L, -1
|
||||
ble .LZGEMM_L1x2_SAVE
|
||||
b .LZGEMM_L1x2_SUB2
|
||||
ble ZGEMM_L1x2_SAVE
|
||||
b ZGEMM_L1x2_SUB2
|
||||
|
||||
.LZGEMM_L1x2_SUB1:
|
||||
ZGEMM_L1x2_SUB1:
|
||||
|
||||
andi. L, K, 7
|
||||
ble .LZGEMM_L1x2_SAVE
|
||||
ble ZGEMM_L1x2_SAVE
|
||||
|
||||
.LZGEMM_L1x2_SUB2:
|
||||
ZGEMM_L1x2_SUB2:
|
||||
|
||||
KERNEL1x2_SUB1
|
||||
|
||||
addic. L, L, -1
|
||||
bgt .LZGEMM_L1x2_SUB2
|
||||
bgt ZGEMM_L1x2_SUB2
|
||||
|
||||
.LZGEMM_L1x2_SAVE:
|
||||
ZGEMM_L1x2_SAVE:
|
||||
|
||||
SAVE1x2
|
||||
|
||||
.LZGEMM_L1x2_END:
|
||||
ZGEMM_L1x2_END:
|
||||
|
||||
.LZGEMM_L1x1_BEGIN:
|
||||
ZGEMM_L1x1_BEGIN:
|
||||
|
||||
|
||||
andi. T1, M, 1
|
||||
ble .LZGEMM_L1x1_END
|
||||
mr BO, B
|
||||
ble ZGEMM_L1x1_END
|
||||
mr BO, BBUFFER
|
||||
srawi. L, K, 3
|
||||
ble .LZGEMM_L1x1_SUB0
|
||||
ble ZGEMM_L1x1_SUB0
|
||||
cmpwi cr0, L, 1
|
||||
ble .LZGEMM_L1x1_SUB4
|
||||
ble ZGEMM_L1x1_SUB4
|
||||
|
||||
.LZGEMM_L1x1_LOOP_START:
|
||||
ZGEMM_L1x1_LOOP_START:
|
||||
|
||||
LOAD1x1_1
|
||||
KERNEL1x1_I1
|
||||
|
@ -823,11 +868,11 @@
|
|||
KERNEL1x1_2
|
||||
|
||||
addic. L, L, -2
|
||||
ble .LZGEMM_L1x1_LOOP_END
|
||||
ble ZGEMM_L1x1_LOOP_END
|
||||
|
||||
.align 5
|
||||
|
||||
.LZGEMM_L1x1_LOOP:
|
||||
ZGEMM_L1x1_LOOP:
|
||||
|
||||
KERNEL1x1_1
|
||||
KERNEL1x1_2
|
||||
|
@ -840,9 +885,9 @@
|
|||
KERNEL1x1_2
|
||||
|
||||
addic. L, L, -1
|
||||
bgt .LZGEMM_L1x1_LOOP
|
||||
bgt ZGEMM_L1x1_LOOP
|
||||
|
||||
.LZGEMM_L1x1_LOOP_END:
|
||||
ZGEMM_L1x1_LOOP_END:
|
||||
|
||||
KERNEL1x1_1
|
||||
KERNEL1x1_2
|
||||
|
@ -854,9 +899,9 @@
|
|||
KERNEL1x1_1
|
||||
KERNEL1x1_E2
|
||||
|
||||
b .LZGEMM_L1x1_SUB1
|
||||
b ZGEMM_L1x1_SUB1
|
||||
|
||||
.LZGEMM_L1x1_SUB4:
|
||||
ZGEMM_L1x1_SUB4:
|
||||
|
||||
KERNEL1x1_SUBI1
|
||||
KERNEL1x1_SUB1
|
||||
|
@ -868,34 +913,34 @@
|
|||
KERNEL1x1_SUB1
|
||||
KERNEL1x1_SUB1
|
||||
|
||||
b .LZGEMM_L1x1_SUB1
|
||||
b ZGEMM_L1x1_SUB1
|
||||
|
||||
.LZGEMM_L1x1_SUB0:
|
||||
ZGEMM_L1x1_SUB0:
|
||||
|
||||
andi. L, K, 7
|
||||
|
||||
KERNEL1x1_SUBI1
|
||||
|
||||
addic. L, L, -1
|
||||
ble .LZGEMM_L1x1_SAVE
|
||||
b .LZGEMM_L1x1_SUB2
|
||||
ble ZGEMM_L1x1_SAVE
|
||||
b ZGEMM_L1x1_SUB2
|
||||
|
||||
.LZGEMM_L1x1_SUB1:
|
||||
ZGEMM_L1x1_SUB1:
|
||||
|
||||
andi. L, K, 7
|
||||
ble .LZGEMM_L1x1_SAVE
|
||||
ble ZGEMM_L1x1_SAVE
|
||||
|
||||
.LZGEMM_L1x1_SUB2:
|
||||
ZGEMM_L1x1_SUB2:
|
||||
|
||||
KERNEL1x1_SUB1
|
||||
|
||||
addic. L, L, -1
|
||||
bgt .LZGEMM_L1x1_SUB2
|
||||
bgt ZGEMM_L1x1_SUB2
|
||||
|
||||
.LZGEMM_L1x1_SAVE:
|
||||
ZGEMM_L1x1_SAVE:
|
||||
|
||||
SAVE1x1
|
||||
|
||||
.LZGEMM_L1x1_END:
|
||||
ZGEMM_L1x1_END:
|
||||
|
||||
.LZGEMM_L1_END:
|
||||
ZGEMM_L1_END:
|
||||
|
|
|
@@ -1,39 +1,3 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/


#if defined(NN) || defined(NT) || defined(TN) || defined(TT)

#define XSFADD_R1 xsadddp
@@ -70,12 +34,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

.macro LOAD2x8_1

lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B

addi BO, BO, 32
addi BO, BO, 64

lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
@@ -110,12 +74,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

addi AO, AO, 64

lxvdsx vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B
lxvdsx vs22, o16, BO // load real part from B
lxvdsx vs23, o24, BO // load imag part from B
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
lxvd2x vs22, o32, BO // load real part from B
lxvd2x vs23, o48, BO // load imag part from B

addi BO, BO, 32
addi BO, BO, 64

xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@@ -156,36 +120,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

.macro KERNEL2x8_1

lxvd2x vs8, o0, AO // load real,imag from A
lxvd2x vs9, o16, AO // load real,imag from A
lxvd2x vs10, o32, AO // load real,imag from A
lxvd2x vs11, o48, AO // load real,imag from A

addi AO, AO, 64

lxvd2x vs12, o0, AO // load real,imag from A
lxvd2x vs13, o16, AO // load real,imag from A
lxvd2x vs14, o32, AO // load real,imag from A
lxvd2x vs15, o48, AO // load real,imag from A

addi AO, AO, 64

lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
lxvd2x vs22, o32, BO // load real part from B
lxvd2x vs23, o48, BO // load imag part from B

addi BO, BO, 64

xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
xvmaddadp vs34, vs1, vs16 // real*real, imag*real
xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag

lxvdsx vs22, o16, BO // load real part from B
lxvdsx vs23, o24, BO // load imag part from B

xvmaddadp vs36, vs2, vs16 // real*real, imag*real
xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
xvmaddadp vs38, vs3, vs16 // real*real, imag*real
xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag

lxvd2x vs8, o0, AO // load real,imag from A
lxvd2x vs9, o16, AO // load real,imag from A

xvmaddadp vs40, vs4, vs16 // real*real, imag*real
xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
xvmaddadp vs42, vs5, vs16 // real*real, imag*real
xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag

lxvd2x vs10, o32, AO // load real,imag from A
lxvd2x vs11, o48, AO // load real,imag from A

xvmaddadp vs44, vs6, vs16 // real*real, imag*real
xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag

addi AO, AO, 64

xvmaddadp vs46, vs7, vs16 // real*real, imag*real
xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
@@ -193,101 +162,79 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag
xvmaddadp vs50, vs1, vs18 // real*real, imag*real
xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag

lxvd2x vs12, o0, AO // load real,imag from A
lxvd2x vs13, o16, AO // load real,imag from A

xvmaddadp vs52, vs2, vs18 // real*real, imag*real
xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag
xvmaddadp vs54, vs3, vs18 // real*real, imag*real
xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag

lxvd2x vs14, o32, AO // load real,imag from A
lxvd2x vs15, o48, AO // load real,imag from A

xvmaddadp vs56, vs4, vs18 // real*real, imag*real
xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag
xvmaddadp vs58, vs5, vs18 // real*real, imag*real
xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag

lxvdsx vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B

xvmaddadp vs60, vs6, vs18 // real*real, imag*real
xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag
xvmaddadp vs62, vs7, vs18 // real*real, imag*real
xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag

addi AO, AO, 64
addi BO, BO, 32

.endm

.macro KERNEL2x8_2

lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
lxvd2x vs2, o32, AO // load real,imag from A
lxvd2x vs3, o48, AO // load real,imag from A

addi AO, AO, 64

lxvd2x vs4, o0, AO // load real,imag from A
lxvd2x vs5, o16, AO // load real,imag from A
lxvd2x vs6, o32, AO // load real,imag from A
lxvd2x vs7, o48, AO // load real,imag from A

addi AO, AO, 64

lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B

addi BO, BO, 64

xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
xvmaddadp vs34, vs9, vs20 // real*real, imag*real
xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag

lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B

xvmaddadp vs36, vs10, vs20 // real*real, imag*real
xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
xvmaddadp vs38, vs11, vs20 // real*real, imag*real
xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag

lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A

xvmaddadp vs40, vs12, vs20 // real*real, imag*real
xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
xvmaddadp vs42, vs13, vs20 // real*real, imag*real
xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag

lxvd2x vs2, o32, AO // load real,imag from A
lxvd2x vs3, o48, AO // load real,imag from A

xvmaddadp vs44, vs14, vs20 // real*real, imag*real
xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
xvmaddadp vs46, vs15, vs20 // real*real, imag*real
xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag

addi AO, AO, 64

xvmaddadp vs48, vs8, vs22 // real*real, imag*real
xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag
xvmaddadp vs50, vs9, vs22 // real*real, imag*real
xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag

lxvd2x vs4, o0, AO // load real,imag from A
lxvd2x vs5, o16, AO // load real,imag from A

xvmaddadp vs52, vs10, vs22 // real*real, imag*real
xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag
xvmaddadp vs54, vs11, vs22 // real*real, imag*real
xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag

lxvd2x vs6, o32, AO // load real,imag from A
lxvd2x vs7, o48, AO // load real,imag from A

xvmaddadp vs56, vs12, vs22 // real*real, imag*real
xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag
xvmaddadp vs58, vs13, vs22 // real*real, imag*real
xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag

lxvdsx vs18, o16, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B

xvmaddadp vs60, vs14, vs22 // real*real, imag*real
xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag
xvmaddadp vs62, vs15, vs22 // real*real, imag*real
xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag

addi AO, AO, 64
addi BO, BO, 32

.endm
@@ -347,12 +294,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

addi AO, AO, 64

lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B

addi BO, BO, 32
addi BO, BO, 64

xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@@ -407,12 +354,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

addi AO, AO, 64

lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B

addi BO, BO, 32
addi BO, BO, 64

xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@@ -927,12 +874,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

.macro LOAD2x4_1

lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B

addi BO, BO, 32
addi BO, BO, 64

lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
@@ -953,12 +900,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

addi AO, AO, 64

lxvdsx vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B
lxvdsx vs22, o16, BO // load real part from B
lxvdsx vs23, o24, BO // load imag part from B
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
lxvd2x vs22, o32, BO // load real part from B
lxvd2x vs23, o48, BO // load imag part from B

addi BO, BO, 32
addi BO, BO, 64

xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@@ -990,12 +937,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

addi AO, AO, 64

lxvdsx vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B
lxvdsx vs22, o16, BO // load real part from B
lxvdsx vs23, o24, BO // load imag part from B
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
lxvd2x vs22, o32, BO // load real part from B
lxvd2x vs23, o48, BO // load imag part from B

addi BO, BO, 32
addi BO, BO, 64

xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@@ -1027,12 +974,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

addi AO, AO, 64

lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B

addi BO, BO, 32
addi BO, BO, 64

xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
@@ -1088,12 +1035,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

addi AO, AO, 64

lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B

addi BO, BO, 32
addi BO, BO, 64

xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@@ -1125,12 +1072,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

addi AO, AO, 64

lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B

addi BO, BO, 32
addi BO, BO, 64

xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@@ -1410,12 +1357,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

.macro LOAD2x2_1

lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B

addi BO, BO, 32
addi BO, BO, 64

lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
@@ -1432,12 +1379,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

addi AO, AO, 32

lxvdsx vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B
lxvdsx vs22, o16, BO // load real part from B
lxvdsx vs23, o24, BO // load imag part from B
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
lxvd2x vs22, o32, BO // load real part from B
lxvd2x vs23, o48, BO // load imag part from B

addi BO, BO, 32
addi BO, BO, 64

xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@@ -1459,12 +1406,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

addi AO, AO, 32

lxvdsx vs20, o0, BO // load real part from B
lxvdsx vs21, o8, BO // load imag part from B
lxvdsx vs22, o16, BO // load real part from B
lxvdsx vs23, o24, BO // load imag part from B
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
lxvd2x vs22, o32, BO // load real part from B
lxvd2x vs23, o48, BO // load imag part from B

addi BO, BO, 32
addi BO, BO, 64

xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@@ -1486,12 +1433,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

addi AO, AO, 32

lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B

addi BO, BO, 32
addi BO, BO, 64

xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
@@ -1529,12 +1476,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

addi AO, AO, 32

lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B

addi BO, BO, 32
addi BO, BO, 64

xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
@@ -1556,12 +1503,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

addi AO, AO, 32

lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B

addi BO, BO, 32
addi BO, BO, 64

xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@@ -1725,12 +1672,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

.macro LOAD2x1_1

lxvdsx vs16, o0, BO // load real part from B
lxvdsx vs17, o8, BO // load imag part from B
lxvdsx vs18, o16, BO // load real part from B
lxvdsx vs19, o24, BO // load imag part from B
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B

addi BO, BO, 32
addi BO, BO, 64

lxvd2x vs0, o0, AO // load real,imag from A
|
|||
|
||||
addi AO, AO, 16
|
||||
|
||||
lxvdsx vs20, o0, BO // load real part from B
|
||||
lxvdsx vs21, o8, BO // load imag part from B
|
||||
lxvdsx vs22, o16, BO // load real part from B
|
||||
lxvdsx vs23, o24, BO // load imag part from B
|
||||
lxvd2x vs20, o0, BO // load real part from B
|
||||
lxvd2x vs21, o16, BO // load imag part from B
|
||||
lxvd2x vs22, o32, BO // load real part from B
|
||||
lxvd2x vs23, o48, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 32
|
||||
addi BO, BO, 64
|
||||
|
||||
xvmuldp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
@ -1767,12 +1714,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 16
|
||||
|
||||
lxvdsx vs20, o0, BO // load real part from B
|
||||
lxvdsx vs21, o8, BO // load imag part from B
|
||||
lxvdsx vs22, o16, BO // load real part from B
|
||||
lxvdsx vs23, o24, BO // load imag part from B
|
||||
lxvd2x vs20, o0, BO // load real part from B
|
||||
lxvd2x vs21, o16, BO // load imag part from B
|
||||
lxvd2x vs22, o32, BO // load real part from B
|
||||
lxvd2x vs23, o48, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 32
|
||||
addi BO, BO, 64
|
||||
|
||||
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
@ -1789,12 +1736,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 16
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
lxvdsx vs18, o16, BO // load real part from B
|
||||
lxvdsx vs19, o24, BO // load imag part from B
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
lxvd2x vs18, o32, BO // load real part from B
|
||||
lxvd2x vs19, o48, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 32
|
||||
addi BO, BO, 64
|
||||
|
||||
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
|
||||
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
|
||||
|
@ -1823,12 +1770,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 16
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
lxvdsx vs18, o16, BO // load real part from B
|
||||
lxvdsx vs19, o24, BO // load imag part from B
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
lxvd2x vs18, o32, BO // load real part from B
|
||||
lxvd2x vs19, o48, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 32
|
||||
addi BO, BO, 64
|
||||
|
||||
xvmuldp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
@ -1845,12 +1792,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
addi AO, AO, 16
|
||||
|
||||
lxvdsx vs16, o0, BO // load real part from B
|
||||
lxvdsx vs17, o8, BO // load imag part from B
|
||||
lxvdsx vs18, o16, BO // load real part from B
|
||||
lxvdsx vs19, o24, BO // load imag part from B
|
||||
lxvd2x vs16, o0, BO // load real part from B
|
||||
lxvd2x vs17, o16, BO // load imag part from B
|
||||
lxvd2x vs18, o32, BO // load real part from B
|
||||
lxvd2x vs19, o48, BO // load imag part from B
|
||||
|
||||
addi BO, BO, 32
|
||||
addi BO, BO, 64
|
||||
|
||||
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
|
||||
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
|
||||
|
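Every hunk in this file repeats the same substitution: the lxvdsx splat loads from B become plain lxvd2x vector loads at doubled offsets, and the BO increment doubles to match. That is consistent with the packed B panel now storing every value twice, so one 16-byte load already yields the splatted register that lxvdsx previously built from a single 8-byte load. A sketch of such a duplicating pack step (hypothetical illustration of the idea, not the actual OpenBLAS copy routine):

#include <stdio.h>

/* Pack n double-complex values of a B column with each real and
 * imaginary part stored twice, so a 16-byte vector load (lxvd2x)
 * sees [re, re] and [im, im] directly; the packed stride doubles,
 * matching the doubled "addi BO, BO, ..." increments above. */
static void pack_b_dup(const double *b, double *bp, int n)
{
    for (int i = 0; i < n; i++) {
        bp[4 * i + 0] = b[2 * i + 0]; /* real, vector lane 0 */
        bp[4 * i + 1] = b[2 * i + 0]; /* real, vector lane 1 */
        bp[4 * i + 2] = b[2 * i + 1]; /* imag, vector lane 0 */
        bp[4 * i + 3] = b[2 * i + 1]; /* imag, vector lane 1 */
    }
}

int main(void)
{
    const double b[4] = { 1.0, 2.0, 3.0, 4.0 }; /* two complex values */
    double bp[8];
    pack_b_dup(b, bp, 2);
    for (int i = 0; i < 8; i++)
        printf("%g ", bp[i]); /* prints: 1 1 2 2 3 3 4 4 */
    printf("\n");
    return 0;
}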
@ -1956,10 +1903,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro LOAD1x8_1

-lxvdsx vs16, o0, BO // load real part from B
-lxvdsx vs17, o8, BO // load imag part from B
+lxvd2x vs16, o0, BO // load real part from B
+lxvd2x vs17, o16, BO // load imag part from B

-addi BO, BO, 16
+addi BO, BO, 32

 lxvd2x vs0, o0, AO // load real,imag from A
 lxvd2x vs1, o16, AO // load real,imag from A

@ -1994,10 +1941,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 addi AO, AO, 64

-lxvdsx vs20, o0, BO // load real part from B
-lxvdsx vs21, o8, BO // load imag part from B
+lxvd2x vs20, o0, BO // load real part from B
+lxvd2x vs21, o16, BO // load imag part from B

-addi BO, BO, 16
+addi BO, BO, 32

 xvmuldp vs32, vs0, vs16 // real*real, imag*real
 xvmuldp vs33, vs0, vs17 // real*imag, imag*imag

@ -2035,10 +1982,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 addi AO, AO, 64

-lxvdsx vs20, o0, BO // load real part from B
-lxvdsx vs21, o8, BO // load imag part from B
+lxvd2x vs20, o0, BO // load real part from B
+lxvd2x vs21, o16, BO // load imag part from B

-addi BO, BO, 16
+addi BO, BO, 32

 xvmaddadp vs32, vs0, vs16 // real*real, imag*real
 xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag

@ -2076,10 +2023,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 addi AO, AO, 64

-lxvdsx vs16, o0, BO // load real part from B
-lxvdsx vs17, o8, BO // load imag part from B
+lxvd2x vs16, o0, BO // load real part from B
+lxvd2x vs17, o16, BO // load imag part from B

-addi BO, BO, 16
+addi BO, BO, 32

 xvmaddadp vs32, vs8, vs20 // real*real, imag*real
 xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag

@ -2140,10 +2087,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 addi AO, AO, 64

-lxvdsx vs16, o0, BO // load real part from B
-lxvdsx vs17, o8, BO // load imag part from B
+lxvd2x vs16, o0, BO // load real part from B
+lxvd2x vs17, o16, BO // load imag part from B

-addi BO, BO, 16
+addi BO, BO, 32

 xvmuldp vs32, vs0, vs16 // real*real, imag*real
 xvmuldp vs33, vs0, vs17 // real*imag, imag*imag

@ -2181,10 +2128,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 addi AO, AO, 64

-lxvdsx vs16, o0, BO // load real part from B
-lxvdsx vs17, o8, BO // load imag part from B
+lxvd2x vs16, o0, BO // load real part from B
+lxvd2x vs17, o16, BO // load imag part from B

-addi BO, BO, 16
+addi BO, BO, 32

 xvmaddadp vs32, vs0, vs16 // real*real, imag*real
 xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -2452,10 +2399,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro LOAD1x4_1

-lxvdsx vs16, o0, BO // load real part from B
-lxvdsx vs17, o8, BO // load imag part from B
+lxvd2x vs16, o0, BO // load real part from B
+lxvd2x vs17, o16, BO // load imag part from B

-addi BO, BO, 16
+addi BO, BO, 32

 lxvd2x vs0, o0, AO // load real,imag from A
 lxvd2x vs1, o16, AO // load real,imag from A

@ -2476,10 +2423,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 addi AO, AO, 64

-lxvdsx vs20, o0, BO // load real part from B
-lxvdsx vs21, o8, BO // load imag part from B
+lxvd2x vs20, o0, BO // load real part from B
+lxvd2x vs21, o16, BO // load imag part from B

-addi BO, BO, 16
+addi BO, BO, 32

 xvmuldp vs32, vs0, vs16 // real*real, imag*real
 xvmuldp vs33, vs0, vs17 // real*imag, imag*imag

@ -2502,10 +2449,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 addi AO, AO, 64

-lxvdsx vs20, o0, BO // load real part from B
-lxvdsx vs21, o8, BO // load imag part from B
+lxvd2x vs20, o0, BO // load real part from B
+lxvd2x vs21, o16, BO // load imag part from B

-addi BO, BO, 16
+addi BO, BO, 32

 xvmaddadp vs32, vs0, vs16 // real*real, imag*real
 xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag

@ -2528,10 +2475,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 addi AO, AO, 64

-lxvdsx vs16, o0, BO // load real part from B
-lxvdsx vs17, o8, BO // load imag part from B
+lxvd2x vs16, o0, BO // load real part from B
+lxvd2x vs17, o16, BO // load imag part from B

-addi BO, BO, 16
+addi BO, BO, 32

 xvmaddadp vs32, vs8, vs20 // real*real, imag*real
 xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag

@ -2569,10 +2516,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 addi AO, AO, 64

-lxvdsx vs16, o0, BO // load real part from B
-lxvdsx vs17, o8, BO // load imag part from B
+lxvd2x vs16, o0, BO // load real part from B
+lxvd2x vs17, o16, BO // load imag part from B

-addi BO, BO, 16
+addi BO, BO, 32

 xvmuldp vs32, vs0, vs16 // real*real, imag*real
 xvmuldp vs33, vs0, vs17 // real*imag, imag*imag

@ -2595,10 +2542,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 addi AO, AO, 64

-lxvdsx vs16, o0, BO // load real part from B
-lxvdsx vs17, o8, BO // load imag part from B
+lxvd2x vs16, o0, BO // load real part from B
+lxvd2x vs17, o16, BO // load imag part from B

-addi BO, BO, 16
+addi BO, BO, 32

 xvmaddadp vs32, vs0, vs16 // real*real, imag*real
 xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -2748,10 +2695,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro LOAD1x2_1

-lxvdsx vs16, o0, BO // load real part from B
-lxvdsx vs17, o8, BO // load imag part from B
+lxvd2x vs16, o0, BO // load real part from B
+lxvd2x vs17, o16, BO // load imag part from B

-addi BO, BO, 16
+addi BO, BO, 32

 lxvd2x vs0, o0, AO // load real,imag from A
 lxvd2x vs1, o16, AO // load real,imag from A

@ -2768,10 +2715,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 addi AO, AO, 32

-lxvdsx vs20, o0, BO // load real part from B
-lxvdsx vs21, o8, BO // load imag part from B
+lxvd2x vs20, o0, BO // load real part from B
+lxvd2x vs21, o16, BO // load imag part from B

-addi BO, BO, 16
+addi BO, BO, 32

 xvmuldp vs32, vs0, vs16 // real*real, imag*real
 xvmuldp vs33, vs0, vs17 // real*imag, imag*imag

@ -2788,10 +2735,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 addi AO, AO, 32

-lxvdsx vs20, o0, BO // load real part from B
-lxvdsx vs21, o8, BO // load imag part from B
+lxvd2x vs20, o0, BO // load real part from B
+lxvd2x vs21, o16, BO // load imag part from B

-addi BO, BO, 16
+addi BO, BO, 32

 xvmaddadp vs32, vs0, vs16 // real*real, imag*real
 xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag

@ -2808,10 +2755,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 addi AO, AO, 32

-lxvdsx vs16, o0, BO // load real part from B
-lxvdsx vs17, o8, BO // load imag part from B
+lxvd2x vs16, o0, BO // load real part from B
+lxvd2x vs17, o16, BO // load imag part from B

-addi BO, BO, 16
+addi BO, BO, 32

 xvmaddadp vs32, vs8, vs20 // real*real, imag*real
 xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag

@ -2839,10 +2786,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 addi AO, AO, 32

-lxvdsx vs16, o0, BO // load real part from B
-lxvdsx vs17, o8, BO // load imag part from B
+lxvd2x vs16, o0, BO // load real part from B
+lxvd2x vs17, o16, BO // load imag part from B

-addi BO, BO, 16
+addi BO, BO, 32

 xvmuldp vs32, vs0, vs16 // real*real, imag*real
 xvmuldp vs33, vs0, vs17 // real*imag, imag*imag

@ -2859,10 +2806,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 addi AO, AO, 32

-lxvdsx vs16, o0, BO // load real part from B
-lxvdsx vs17, o8, BO // load imag part from B
+lxvd2x vs16, o0, BO // load real part from B
+lxvd2x vs17, o16, BO // load imag part from B

-addi BO, BO, 16
+addi BO, BO, 32

 xvmaddadp vs32, vs0, vs16 // real*real, imag*real
 xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -2954,10 +2901,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro LOAD1x1_1

-lxvdsx vs16, o0, BO // load real part from B
-lxvdsx vs17, o8, BO // load imag part from B
+lxvd2x vs16, o0, BO // load real part from B
+lxvd2x vs17, o16, BO // load imag part from B

-addi BO, BO, 16
+addi BO, BO, 32

 lxvd2x vs0, o0, AO // load real,imag from A

@ -2972,10 +2919,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 addi AO, AO, 16

-lxvdsx vs20, o0, BO // load real part from B
-lxvdsx vs21, o8, BO // load imag part from B
+lxvd2x vs20, o0, BO // load real part from B
+lxvd2x vs21, o16, BO // load imag part from B

-addi BO, BO, 16
+addi BO, BO, 32

 xvmuldp vs32, vs0, vs16 // real*real, imag*real
 xvmuldp vs33, vs0, vs17 // real*imag, imag*imag

@ -2989,10 +2936,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 addi AO, AO, 16

-lxvdsx vs20, o0, BO // load real part from B
-lxvdsx vs21, o8, BO // load imag part from B
+lxvd2x vs20, o0, BO // load real part from B
+lxvd2x vs21, o16, BO // load imag part from B

-addi BO, BO, 16
+addi BO, BO, 32

 xvmaddadp vs32, vs0, vs16 // real*real, imag*real
 xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag

@ -3006,10 +2953,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 addi AO, AO, 16

-lxvdsx vs16, o0, BO // load real part from B
-lxvdsx vs17, o8, BO // load imag part from B
+lxvd2x vs16, o0, BO // load real part from B
+lxvd2x vs17, o16, BO // load imag part from B

-addi BO, BO, 16
+addi BO, BO, 32

 xvmaddadp vs32, vs8, vs20 // real*real, imag*real
 xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag

@ -3032,10 +2979,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 addi AO, AO, 16

-lxvdsx vs16, o0, BO // load real part from B
-lxvdsx vs17, o8, BO // load imag part from B
+lxvd2x vs16, o0, BO // load real part from B
+lxvd2x vs17, o16, BO // load imag part from B

-addi BO, BO, 16
+addi BO, BO, 32

 xvmuldp vs32, vs0, vs16 // real*real, imag*real
 xvmuldp vs33, vs0, vs17 // real*imag, imag*imag

@ -3049,10 +2996,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 addi AO, AO, 16

-lxvdsx vs16, o0, BO // load real part from B
-lxvdsx vs17, o8, BO // load imag part from B
+lxvd2x vs16, o0, BO // load real part from B
+lxvd2x vs17, o16, BO // load imag part from B

-addi BO, BO, 16
+addi BO, BO, 32

 xvmaddadp vs32, vs0, vs16 // real*real, imag*real
 xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
@ -271,7 +271,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 #endif

-#include "zgemm_macros_8x2_power8.S"
+#include "ztrmm_macros_8x2_power8.S"

 cmpwi cr0, M, 0
 ble .L999
File diff suppressed because it is too large
param.h (14 changed lines)
@ -1964,8 +1964,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SNUMOPT 16
 #define DNUMOPT 8

-#define GEMM_DEFAULT_OFFSET_A 131072
-#define GEMM_DEFAULT_OFFSET_B 1024
+#define GEMM_DEFAULT_OFFSET_A 4096
+#define GEMM_DEFAULT_OFFSET_B 4096
 #define GEMM_DEFAULT_ALIGN 0x03fffUL

 #define SGEMM_DEFAULT_UNROLL_M 16
@ -1980,17 +1980,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SGEMM_DEFAULT_P 960
 #define DGEMM_DEFAULT_P 480
 #define CGEMM_DEFAULT_P 720
-#define ZGEMM_DEFAULT_P 240
+#define ZGEMM_DEFAULT_P 480

 #define SGEMM_DEFAULT_Q 720
 #define DGEMM_DEFAULT_Q 720
 #define CGEMM_DEFAULT_Q 720
-#define ZGEMM_DEFAULT_Q 360
+#define ZGEMM_DEFAULT_Q 720

-#define SGEMM_DEFAULT_R 14400
+#define SGEMM_DEFAULT_R 21600
 #define DGEMM_DEFAULT_R 14400
-#define CGEMM_DEFAULT_R 14400
-#define ZGEMM_DEFAULT_R 7200
+#define CGEMM_DEFAULT_R 16200
+#define ZGEMM_DEFAULT_R 21600

 #define SYMV_P 8
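For context on the numbers above: P, Q and R are OpenBLAS's GEMM cache-blocking defaults, and on our reading P and Q bound the packed panel the inner kernel streams over, so doubling ZGEMM's P and Q quadruples that panel. A back-of-envelope footprint check (our arithmetic only; double complex = 16 bytes):

#include <stdio.h>

int main(void)
{
    /* old and new ZGEMM_DEFAULT_P/Q from the hunk above */
    long old_p = 240, old_q = 360;
    long new_p = 480, new_q = 720;
    printf("old P*Q panel: %.2f MiB\n", old_p * old_q * 16 / 1048576.0);
    printf("new P*Q panel: %.2f MiB\n", new_p * new_q * 16 / 1048576.0);
    return 0;
}

That puts the new panel at roughly 5.3 MiB against the old 1.3 MiB, plausibly sized for POWER8's large per-core L3.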