Optimized zgemm and tested zgemm again

This commit is contained in:
Werner Saar 2016-04-22 13:07:12 +02:00
parent dd2b897795
commit 879a51165f
8 changed files with 1227 additions and 12 deletions

View File

@ -42,7 +42,7 @@ ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c
ZGEMMITCOPY = ../generic/zgemm_tcopy_8.c
ZGEMMITCOPY = zgemm_tcopy_8_power8.S
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
ZGEMMINCOPYOBJ = zgemm_incopy.o

View File

@ -1,3 +1,73 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
@ -250,7 +320,7 @@
ble L999
slwi LDC, LDC, ZBASE_SHIFT
li PRE, 384
li PRE, 512
li o8 , 8
li o16 , 16
li o24 , 24

View File

@ -1,3 +1,39 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
srawi. J, N, 1
ble ZGEMM_L2_END
@ -5,20 +41,34 @@ ZGEMM_L2_BEGIN:
mr BO, B
mr BBO, BBUFFER
slwi T1, K, 1
srawi. T1, K, 2
ble ZGEMM_L2_COPYB1
ZGEMM_L2_COPYB:
ZGEMM_L2_COPYB8:
lxvdsx vs4, o0, BO // b0_r
lxvdsx vs5, o8, BO // b0_i
addi BO, BO, 16
stxvd2x vs4, o0, BBO
stxvd2x vs5, o16, BBO
addi T2, PRE, 128
dcbt BO, PRE
dcbtst BBO, PRE
dcbtst BBO, T2
ZCOPYB_8x1
addic. T1, T1, -1
addi BBO, BBO, 32
bge ZGEMM_L2_COPYB
bgt ZGEMM_L2_COPYB8
ZGEMM_L2_COPYB1:
andi. T1, K, 3
ble ZGEMM_L2_COPYB_END
ZGEMM_L2_COPYB_LOOP:
ZCOPYB_1x1
ZCOPYB_1x1
addic. T1, T1, -1
bgt ZGEMM_L2_COPYB_LOOP
ZGEMM_L2_COPYB_END:
mr CO, C
mr AO, A
@ -493,6 +543,7 @@ ZGEMM_L1_BEGIN:
slwi T1, K, 0
ZGEMM_L1_COPYB:
dcbtst BBO, PRE
lxvdsx vs4, o0, BO // b0_r
lxvdsx vs5, o8, BO // b0_i

View File

@ -1,3 +1,38 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define XSFADD_R1 xsadddp
@ -3055,3 +3090,76 @@
.endm
.macro ZCOPYB_1x1
/*
 * Expand one 16-byte element at BO into 32 bytes at BBO.
 * The doubleword at BO+0 and the doubleword at BO+8 are each loaded with a
 * splatting load and stored as a full 16-byte vector, at BBO+0 and BBO+16.
 * Side effects: BO += 16, BBO += 32, vs4 and vs5 are overwritten.
 */
lxvdsx vs4, o0, BO // b0_r
lxvdsx vs5, o8, BO // b0_i
addi BO, BO, 16
stxvd2x vs4, o0, BBO
stxvd2x vs5, o16, BBO
addi BBO, BBO, 32
.endm
.macro ZCOPYB_8x1
/*
 * Eight-element version of ZCOPYB_1x1: read 8 x 16 bytes from BO and write
 * 16 x 16 bytes to BBO, each output vector being one input doubleword
 * duplicated into both halves.
 * Side effects: BO += 128, BBO += 256, vs32..vs55 are overwritten.
 */
/* Load eight 16-byte vectors from BO (two groups of four). */
lxvd2x vs32, o0, BO
lxvd2x vs33, o16, BO
lxvd2x vs34, o32, BO
lxvd2x vs35, o48, BO
addi BO, BO, 64
lxvd2x vs36, o0, BO
lxvd2x vs37, o16, BO
lxvd2x vs38, o32, BO
lxvd2x vs39, o48, BO
addi BO, BO, 64
/*
 * For each loaded vector produce two vectors: doubleword 0 splatted, then
 * doubleword 1 splatted.
 * NOTE(review): by analogy with ZCOPYB_1x1 these are presumably the real
 * and imaginary parts respectively - confirm against the kernel that
 * consumes BBO.
 */
xxspltd vs40, vs32, 0
xxspltd vs41, vs32, 1
xxspltd vs42, vs33, 0
xxspltd vs43, vs33, 1
xxspltd vs44, vs34, 0
xxspltd vs45, vs34, 1
xxspltd vs46, vs35, 0
xxspltd vs47, vs35, 1
xxspltd vs48, vs36, 0
xxspltd vs49, vs36, 1
xxspltd vs50, vs37, 0
xxspltd vs51, vs37, 1
xxspltd vs52, vs38, 0
xxspltd vs53, vs38, 1
xxspltd vs54, vs39, 0
xxspltd vs55, vs39, 1
/* Store the sixteen splatted vectors contiguously, 64 bytes per group. */
stxvd2x vs40, o0, BBO
stxvd2x vs41, o16, BBO
stxvd2x vs42, o32, BBO
stxvd2x vs43, o48, BBO
addi BBO, BBO, 64
stxvd2x vs44, o0, BBO
stxvd2x vs45, o16, BBO
stxvd2x vs46, o32, BBO
stxvd2x vs47, o48, BBO
addi BBO, BBO, 64
stxvd2x vs48, o0, BBO
stxvd2x vs49, o16, BBO
stxvd2x vs50, o32, BBO
stxvd2x vs51, o48, BBO
addi BBO, BBO, 64
stxvd2x vs52, o0, BBO
stxvd2x vs53, o16, BBO
stxvd2x vs54, o32, BBO
stxvd2x vs55, o48, BBO
addi BBO, BBO, 64
.endm

View File

@ -0,0 +1,205 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"

/*
 * zgemm tcopy (8-wide) packing routine for POWER8.
 *
 * Incoming arguments, as bound by the defines below:
 *   r3 = M, r4 = N, r5 = A (source), r6 = LDA (in elements), r7 = B (dest).
 * Returns 0 in r3.
 *
 * The routine scales LDA to bytes, computes the start of the 4-, 2- and
 * 1-wide remainder regions inside B (B4, B2, B1), sets up prefetch
 * distances and constant offsets, and then runs the copy loops from
 * zgemm_tcopy_logic_8_power8.S.
 *
 * All address arithmetic is done with 64-bit instructions (sldi/mulld).
 * The 32-bit forms (slwi/mullw) clear or ignore the upper word, so a row
 * stride or a packed-buffer offset of 4 GiB or more would silently wrap.
 */

/* Arguments */
#define M r3
#define N r4
#define A r5
#define LDA r6
#define B r7

/* Row pointers used by the COPY_* macros */
#define A0 r8
#define A1 r9
#define A2 r10
#define A3 r11

/* Loop counter over column groups */
#define J r12

/* Prefetch distances (PREA/PREB double as mask temporaries during setup) */
#define PREA r14
#define PREB r15

/* Destination cursors: current store pointer and per-width region cursors */
#define BO r16
#define B8 r17
#define B4 r18
#define B2 r19
#define B1 r20

#define NOTUS1 r21
#define T2 r22
#define I r23

/* Constant byte offsets for indexed VSX loads and stores */
#define o16 r24
#define o32 r25
#define o48 r26

#define NOTUS2 r27

/* Byte distance between consecutive 8-wide groups in B */
#define M8 r30
#define T1 r31

/* Offset zero is encoded as RA = 0 in the indexed load/store forms */
#define o0 0

#include "zgemm_tcopy_macros_8_power8.S"

#define STACKSIZE 384

	PROLOGUE
	PROFCODE

	/* Allocate the frame and save the non-volatile GPRs r14..r31. */
	addi	SP, SP, -STACKSIZE
	li	r0, 0

	std	r31, 144(SP)
	std	r30, 152(SP)
	std	r29, 160(SP)
	std	r28, 168(SP)
	std	r27, 176(SP)
	std	r26, 184(SP)
	std	r25, 192(SP)
	std	r24, 200(SP)
	std	r23, 208(SP)
	std	r22, 216(SP)
	std	r21, 224(SP)
	std	r20, 232(SP)
	std	r19, 240(SP)
	std	r18, 248(SP)
	std	r17, 256(SP)
	std	r16, 264(SP)
	std	r15, 272(SP)
	std	r14, 280(SP)

	/* Nothing to do for an empty matrix. */
	cmpwi	cr0, M, 0
	ble-	L999
	cmpwi	cr0, N, 0
	ble-	L999

	/* LDA: elements -> bytes.  M8 = M * 8 elements, in bytes. */
	sldi	LDA, LDA, ZBASE_SHIFT
	sldi	M8, M, 3 + ZBASE_SHIFT

	/* Masks that round N down to a multiple of 8, 4 and 2. */
	li	T2,  -8
	li	PREA, -4
	li	PREB, -2

	and	B4, N, T2
	and	B2, N, PREA
	and	B1, N, PREB

	/* Element offsets of the remainder regions: (N rounded down) * M. */
	mulld	B4, B4, M
	mulld	B2, B2, M
	mulld	B1, B1, M

	/* Element offsets -> byte offsets. */
	sldi	B4, B4, ZBASE_SHIFT
	sldi	B2, B2, ZBASE_SHIFT
	sldi	B1, B1, ZBASE_SHIFT

	/* Byte offsets -> absolute addresses inside B. */
	add	B4, B4, B
	add	B2, B2, B
	add	B1, B1, B

	/* Prefetch distances: fixed for the source, one group ahead for B. */
	li	PREA, 384
	addi	PREB, M8, 128

	li	o16, 16
	li	o32, 32
	li	o48, 48

#include "zgemm_tcopy_logic_8_power8.S"

L999:

	/* Return 0 and restore the saved registers. */
	li	r3, 0

	ld	r31, 144(SP)
	ld	r30, 152(SP)
	ld	r29, 160(SP)
	ld	r28, 168(SP)
	ld	r27, 176(SP)
	ld	r26, 184(SP)
	ld	r25, 192(SP)
	ld	r24, 200(SP)
	ld	r23, 208(SP)
	ld	r22, 216(SP)
	ld	r21, 224(SP)
	ld	r20, 232(SP)
	ld	r19, 240(SP)
	ld	r18, 248(SP)
	ld	r17, 256(SP)
	ld	r16, 264(SP)
	ld	r15, 272(SP)
	ld	r14, 280(SP)

	addi	SP, SP, STACKSIZE
	blr

	EPILOGUE

View File

@ -0,0 +1,246 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/*
 * Copy loops for the 8-wide tcopy routine.  This text is meant to be
 * included into a routine that has already set up M, N, A, LDA, B, M8,
 * B4, B2, B1, PREA, PREB and the o16/o32/o48 offsets, and that provides
 * the label L999.
 *
 * Structure: rows of A are consumed 4, then 2, then 1 at a time (bits of M).
 * Within each row group the columns are consumed 8 at a time in a loop and
 * then as 4-, 2- and 1-wide remainders (bits of N).  Full 8-wide groups are
 * written starting at B8 and stepping by M8; the remainders are written at
 * the running cursors B4, B2 and B1.
 */
/* ---- Row groups of 4: I = M / 4 ---- */
srawi. I, M, 2
ble ZCOPYT_L2_BEGIN
ZCOPYT_L4_BEGIN:
/* A0..A3 point at four consecutive rows, LDA apart; A moves past them. */
mr A0, A
add A1, A0, LDA
add A2, A1, LDA
add A3, A2, LDA
add A, A3, LDA
/* B8 = slot for this row group in the first 8-wide panel; reserve it in B. */
mr B8, B
addi B, B, 64*SIZE
/* J = N / 8 full 8-wide column groups. */
sradi. J, N, 3
ble ZCOPYT_L4x4_BEGIN
mr BO, B8
.align 5
ZCOPYT_L4x8_LOOP:
/* Prefetch the four source rows and several destination lines ahead. */
addi T1, PREB, 128
addi T2, PREB, 256
dcbt A0, PREA
dcbt A1, PREA
dcbt A2, PREA
dcbt A3, PREA
dcbtst BO, M8
dcbtst BO, PREB
dcbtst BO, T1
dcbtst BO, T2
COPY_4x8
/* Next 8-wide panel starts M8 bytes further on. */
add BO, BO, M8
addic. J, J, -1
bgt ZCOPYT_L4x8_LOOP
ZCOPYT_L4x4_BEGIN:
/* Remainder: 4 columns if bit 2 of N is set. */
andi. T1, N, 4
ble ZCOPYT_L4x2_BEGIN
mr BO, B4
COPY_4x4
addi B4, B4, 32*SIZE
ZCOPYT_L4x2_BEGIN:
/* Remainder: 2 columns if bit 1 of N is set. */
andi. T1, N, 2
ble ZCOPYT_L4x1_BEGIN
mr BO, B2
COPY_4x2
addi B2, B2, 16*SIZE
ZCOPYT_L4x1_BEGIN:
/* Remainder: 1 column if bit 0 of N is set. */
andi. T1, N, 1
ble ZCOPYT_L4_END
mr BO, B1
COPY_4x1
addi B1, B1, 8*SIZE
ZCOPYT_L4_END:
addic. I, I, -1
bgt ZCOPYT_L4_BEGIN
/* ---- Row group of 2, taken once if bit 1 of M is set ---- */
ZCOPYT_L2_BEGIN:
andi. T1, M, 2
ble ZCOPYT_L1_BEGIN
mr A0, A
add A1, A0, LDA
add A, A1, LDA
mr B8, B
addi B, B, 32*SIZE
sradi. J, N, 3
ble ZCOPYT_L2x4_BEGIN
mr BO, B8
ZCOPYT_L2x8_LOOP:
COPY_2x8
add BO, BO, M8
addic. J, J, -1
bgt ZCOPYT_L2x8_LOOP
ZCOPYT_L2x4_BEGIN:
andi. T1, N, 4
ble ZCOPYT_L2x2_BEGIN
mr BO, B4
COPY_2x4
addi B4, B4, 16*SIZE
ZCOPYT_L2x2_BEGIN:
andi. T1, N, 2
ble ZCOPYT_L2x1_BEGIN
mr BO, B2
COPY_2x2
addi B2, B2, 8*SIZE
ZCOPYT_L2x1_BEGIN:
andi. T1, N, 1
ble ZCOPYT_L2_END
mr BO, B1
COPY_2x1
addi B1, B1, 4*SIZE
ZCOPYT_L2_END:
/* ---- Single remaining row, taken if bit 0 of M is set ---- */
ZCOPYT_L1_BEGIN:
andi. T1, M, 1
ble L999
mr A0, A
add A, A0, LDA
mr B8, B
addi B, B, 16*SIZE
sradi. J, N, 3
ble ZCOPYT_L1x4_BEGIN
mr BO, B8
ZCOPYT_L1x8_LOOP:
COPY_1x8
add BO, BO, M8
addic. J, J, -1
bgt ZCOPYT_L1x8_LOOP
ZCOPYT_L1x4_BEGIN:
andi. T1, N, 4
ble ZCOPYT_L1x2_BEGIN
mr BO, B4
COPY_1x4
addi B4, B4, 8*SIZE
ZCOPYT_L1x2_BEGIN:
andi. T1, N, 2
ble ZCOPYT_L1x1_BEGIN
mr BO, B2
COPY_1x2
addi B2, B2, 4*SIZE
ZCOPYT_L1x1_BEGIN:
andi. T1, N, 1
ble ZCOPYT_L1_END
mr BO, B1
COPY_1x1
addi B1, B1, 2*SIZE
/* Execution falls through to whatever follows this text in the includer. */
ZCOPYT_L1_END:

View File

@ -0,0 +1,535 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/**********************************************************************************************
* Macros for N=4 and M=8
**********************************************************************************************/
.macro COPY_4x8
/*
 * Copy 8 x 16 bytes from each of the four rows A0..A3 into 512 contiguous
 * bytes starting at BO, row after row (A0 block, A1 block, A2 block, A3
 * block).
 * Side effects: A0..A3 each advance by 128 bytes; T1 is used as the store
 * cursor and is left pointing into the output; BO is not modified;
 * vs32..vs63 are overwritten.
 */
/* Row A0 -> vs32..vs39 */
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
lxvd2x vs34, o32, A0
lxvd2x vs35, o48, A0
addi A0, A0, 64
lxvd2x vs36, o0, A0
lxvd2x vs37, o16, A0
lxvd2x vs38, o32, A0
lxvd2x vs39, o48, A0
addi A0, A0, 64
/* Row A1 -> vs40..vs47 */
lxvd2x vs40, o0, A1
lxvd2x vs41, o16, A1
lxvd2x vs42, o32, A1
lxvd2x vs43, o48, A1
addi A1, A1, 64
lxvd2x vs44, o0, A1
lxvd2x vs45, o16, A1
lxvd2x vs46, o32, A1
lxvd2x vs47, o48, A1
addi A1, A1, 64
/* Row A2 -> vs48..vs55 */
lxvd2x vs48, o0, A2
lxvd2x vs49, o16, A2
lxvd2x vs50, o32, A2
lxvd2x vs51, o48, A2
addi A2, A2, 64
lxvd2x vs52, o0, A2
lxvd2x vs53, o16, A2
lxvd2x vs54, o32, A2
lxvd2x vs55, o48, A2
addi A2, A2, 64
/* Row A3 -> vs56..vs63 */
lxvd2x vs56, o0, A3
lxvd2x vs57, o16, A3
lxvd2x vs58, o32, A3
lxvd2x vs59, o48, A3
addi A3, A3, 64
lxvd2x vs60, o0, A3
lxvd2x vs61, o16, A3
lxvd2x vs62, o32, A3
lxvd2x vs63, o48, A3
addi A3, A3, 64
/* Store all 32 vectors in register order through the scratch cursor T1. */
mr T1, BO
stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1
stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1
addi T1, T1, 64
stxvd2x vs36, o0, T1
stxvd2x vs37, o16, T1
stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1
addi T1, T1, 64
stxvd2x vs40, o0, T1
stxvd2x vs41, o16, T1
stxvd2x vs42, o32, T1
stxvd2x vs43, o48, T1
addi T1, T1, 64
stxvd2x vs44, o0, T1
stxvd2x vs45, o16, T1
stxvd2x vs46, o32, T1
stxvd2x vs47, o48, T1
addi T1, T1, 64
stxvd2x vs48, o0, T1
stxvd2x vs49, o16, T1
stxvd2x vs50, o32, T1
stxvd2x vs51, o48, T1
addi T1, T1, 64
stxvd2x vs52, o0, T1
stxvd2x vs53, o16, T1
stxvd2x vs54, o32, T1
stxvd2x vs55, o48, T1
addi T1, T1, 64
stxvd2x vs56, o0, T1
stxvd2x vs57, o16, T1
stxvd2x vs58, o32, T1
stxvd2x vs59, o48, T1
addi T1, T1, 64
stxvd2x vs60, o0, T1
stxvd2x vs61, o16, T1
stxvd2x vs62, o32, T1
stxvd2x vs63, o48, T1
.endm
/**********************************************************************************************
* Macros for N=4 and M=4
**********************************************************************************************/
.macro COPY_4x4
/*
 * Copy 4 x 16 bytes from each of the four rows A0..A3 into 256 contiguous
 * bytes starting at BO, row after row.
 * Side effects: A0..A3 each advance by 64 bytes; T1 is used as the store
 * cursor; BO is not modified; vs32..vs47 are overwritten.
 */
/* Row A0 -> vs32..vs35 */
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
lxvd2x vs34, o32, A0
lxvd2x vs35, o48, A0
addi A0, A0, 64
/* Row A1 -> vs36..vs39 */
lxvd2x vs36, o0, A1
lxvd2x vs37, o16, A1
lxvd2x vs38, o32, A1
lxvd2x vs39, o48, A1
addi A1, A1, 64
/* Row A2 -> vs40..vs43 */
lxvd2x vs40, o0, A2
lxvd2x vs41, o16, A2
lxvd2x vs42, o32, A2
lxvd2x vs43, o48, A2
addi A2, A2, 64
/* Row A3 -> vs44..vs47 */
lxvd2x vs44, o0, A3
lxvd2x vs45, o16, A3
lxvd2x vs46, o32, A3
lxvd2x vs47, o48, A3
addi A3, A3, 64
/* Store the 16 vectors in register order through the scratch cursor T1. */
mr T1, BO
stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1
stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1
addi T1, T1, 64
stxvd2x vs36, o0, T1
stxvd2x vs37, o16, T1
stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1
addi T1, T1, 64
stxvd2x vs40, o0, T1
stxvd2x vs41, o16, T1
stxvd2x vs42, o32, T1
stxvd2x vs43, o48, T1
addi T1, T1, 64
stxvd2x vs44, o0, T1
stxvd2x vs45, o16, T1
stxvd2x vs46, o32, T1
stxvd2x vs47, o48, T1
.endm
/**********************************************************************************************
* Macros for N=4 and M=2
**********************************************************************************************/
.macro COPY_4x2
/*
 * Copy 2 x 16 bytes from each of the four rows A0..A3 into 128 contiguous
 * bytes starting at BO, row after row.
 * Side effects: A0..A3 each advance by 32 bytes; T1 is used as the store
 * cursor; BO is not modified; vs32..vs39 are overwritten.
 */
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
addi A0, A0, 32
lxvd2x vs34, o0, A1
lxvd2x vs35, o16, A1
addi A1, A1, 32
lxvd2x vs36, o0, A2
lxvd2x vs37, o16, A2
addi A2, A2, 32
lxvd2x vs38, o0, A3
lxvd2x vs39, o16, A3
addi A3, A3, 32
/* Store the 8 vectors in register order through the scratch cursor T1. */
mr T1, BO
stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1
stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1
addi T1, T1, 64
stxvd2x vs36, o0, T1
stxvd2x vs37, o16, T1
stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1
.endm
/**********************************************************************************************
* Macros for N=4 and M=1
**********************************************************************************************/
.macro COPY_4x1
/*
 * Copy one 16-byte vector from each of the four rows A0..A3 into 64
 * contiguous bytes starting at BO, in row order.
 * Side effects: A0..A3 each advance by 16 bytes; T1 is set to BO;
 * BO is not modified; vs32..vs35 are overwritten.
 */
lxvd2x vs32, o0, A0
addi A0, A0, 16
lxvd2x vs33, o0, A1
addi A1, A1, 16
lxvd2x vs34, o0, A2
addi A2, A2, 16
lxvd2x vs35, o0, A3
addi A3, A3, 16
mr T1, BO
stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1
stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1
.endm
/**********************************************************************************************
* Macros for N=2 and M=8
**********************************************************************************************/
.macro COPY_2x8
/*
 * Copy 8 x 16 bytes from each of the two rows A0 and A1 into 256
 * contiguous bytes starting at BO (A0 block first, then A1 block).
 * Side effects: A0 and A1 each advance by 128 bytes; T1 is used as the
 * store cursor; BO is not modified; vs32..vs47 are overwritten.
 */
/* Row A0 -> vs32..vs39 */
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
lxvd2x vs34, o32, A0
lxvd2x vs35, o48, A0
addi A0, A0, 64
lxvd2x vs36, o0, A0
lxvd2x vs37, o16, A0
lxvd2x vs38, o32, A0
lxvd2x vs39, o48, A0
addi A0, A0, 64
/* Row A1 -> vs40..vs47 */
lxvd2x vs40, o0, A1
lxvd2x vs41, o16, A1
lxvd2x vs42, o32, A1
lxvd2x vs43, o48, A1
addi A1, A1, 64
lxvd2x vs44, o0, A1
lxvd2x vs45, o16, A1
lxvd2x vs46, o32, A1
lxvd2x vs47, o48, A1
addi A1, A1, 64
/* Store the 16 vectors in register order through the scratch cursor T1. */
mr T1, BO
stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1
stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1
addi T1, T1, 64
stxvd2x vs36, o0, T1
stxvd2x vs37, o16, T1
stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1
addi T1, T1, 64
stxvd2x vs40, o0, T1
stxvd2x vs41, o16, T1
stxvd2x vs42, o32, T1
stxvd2x vs43, o48, T1
addi T1, T1, 64
stxvd2x vs44, o0, T1
stxvd2x vs45, o16, T1
stxvd2x vs46, o32, T1
stxvd2x vs47, o48, T1
.endm
/**********************************************************************************************
* Macros for N=2 and M=4
**********************************************************************************************/
.macro COPY_2x4
/*
 * Copy 4 x 16 bytes from each of the two rows A0 and A1 into 128
 * contiguous bytes starting at BO (A0 block first, then A1 block).
 * Side effects: A0 and A1 each advance by 64 bytes; T1 is used as the
 * store cursor; BO is not modified; vs32..vs39 are overwritten.
 */
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
lxvd2x vs34, o32, A0
lxvd2x vs35, o48, A0
addi A0, A0, 64
lxvd2x vs36, o0, A1
lxvd2x vs37, o16, A1
lxvd2x vs38, o32, A1
lxvd2x vs39, o48, A1
addi A1, A1, 64
/* Store the 8 vectors in register order through the scratch cursor T1. */
mr T1, BO
stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1
stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1
addi T1, T1, 64
stxvd2x vs36, o0, T1
stxvd2x vs37, o16, T1
stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1
.endm
/**********************************************************************************************
* Macros for N=2 and M=2
**********************************************************************************************/
.macro COPY_2x2
/*
 * Copy 2 x 16 bytes from each of the two rows A0 and A1 into 64
 * contiguous bytes starting at BO (A0 pair first, then A1 pair).
 * Side effects: A0 and A1 each advance by 32 bytes; T1 is set to BO;
 * BO is not modified; vs32..vs35 are overwritten.
 */
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
addi A0, A0, 32
lxvd2x vs34, o0, A1
lxvd2x vs35, o16, A1
addi A1, A1, 32
mr T1, BO
stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1
stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1
.endm
/**********************************************************************************************
* Macros for N=2 and M=1
**********************************************************************************************/
.macro COPY_2x1
/*
 * Copy one 16-byte vector from each of the two rows A0 and A1 into 32
 * contiguous bytes starting at BO (A0 first, then A1).
 * Side effects: A0 and A1 each advance by 16 bytes; T1 is set to BO;
 * BO is not modified; vs32 and vs33 are overwritten.
 */
lxvd2x vs32, o0, A0
addi A0, A0, 16
lxvd2x vs33, o0, A1
addi A1, A1, 16
mr T1, BO
stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1
.endm
/**********************************************************************************************
* Macros for N=1 and M=8
**********************************************************************************************/
.macro COPY_1x8
/*
 * Copy 8 x 16 bytes from the single row A0 into 128 contiguous bytes
 * starting at BO.
 * Side effects: A0 advances by 128 bytes; T1 is used as the store cursor;
 * BO is not modified; vs32..vs39 are overwritten.
 */
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
lxvd2x vs34, o32, A0
lxvd2x vs35, o48, A0
addi A0, A0, 64
lxvd2x vs36, o0, A0
lxvd2x vs37, o16, A0
lxvd2x vs38, o32, A0
lxvd2x vs39, o48, A0
addi A0, A0, 64
/* Store the 8 vectors in register order through the scratch cursor T1. */
mr T1, BO
stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1
stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1
addi T1, T1, 64
stxvd2x vs36, o0, T1
stxvd2x vs37, o16, T1
stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1
.endm
/**********************************************************************************************
* Macros for N=1 and M=4
**********************************************************************************************/
.macro COPY_1x4
/*
 * Copy 4 x 16 bytes from the single row A0 into 64 contiguous bytes
 * starting at BO.
 * Side effects: A0 advances by 64 bytes; T1 is set to BO; BO is not
 * modified; vs32..vs35 are overwritten.
 */
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
lxvd2x vs34, o32, A0
lxvd2x vs35, o48, A0
addi A0, A0, 64
mr T1, BO
stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1
stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1
.endm
/**********************************************************************************************
* Macros for N=1 and M=2
**********************************************************************************************/
.macro COPY_1x2
/*
 * Copy 2 x 16 bytes from the single row A0 into 32 contiguous bytes
 * starting at BO.
 * Side effects: A0 advances by 32 bytes; T1 is set to BO; BO is not
 * modified; vs32 and vs33 are overwritten.
 */
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
addi A0, A0, 32
mr T1, BO
stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1
.endm
/**********************************************************************************************
* Macros for N=1 and M=1
**********************************************************************************************/
.macro COPY_1x1
/*
 * Copy one 16-byte vector from row A0 to BO.
 * Side effects: A0 advances by 16 bytes; T1 is set to BO; BO is not
 * modified; vs32 is overwritten.
 */
lxvd2x vs32, o0, A0
addi A0, A0, 16
mr T1, BO
stxvd2x vs32, o0, T1
.endm

View File

@ -1985,7 +1985,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SGEMM_DEFAULT_Q 720
#define DGEMM_DEFAULT_Q 720
#define CGEMM_DEFAULT_Q 720
#define ZGEMM_DEFAULT_Q 720
#define ZGEMM_DEFAULT_Q 360
#define SYMV_P 8