added cgemm_tcopy_8_power8.S

This commit is contained in:
Werner Saar 2016-04-23 07:37:18 +02:00
parent a670e8061e
commit d46f07bb4e
4 changed files with 839 additions and 1 deletions

View File

@ -30,7 +30,7 @@ DGEMMOTCOPYOBJ = dgemm_otcopy.o
# Source files used for the CGEMM compute kernel and its four packing (copy) routines.
CGEMMKERNEL = cgemm_kernel_8x4_power8.S
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
# NOTE(review): CGEMMITCOPY is assigned twice below. With make's `=` the later
# assignment is the one in effect, so the POWER8 assembly version is selected.
# This looks like a rendered diff (generic line removed, POWER8 line added) - confirm.
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
CGEMMITCOPY = cgemm_tcopy_8_power8.S
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPYOBJ = cgemm_oncopy.o

View File

@ -0,0 +1,206 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
/*
 * Preamble: register aliases shared by this file, the copy macros and the
 * included driver logic. All names below are plain cpp substitutions.
 */
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"
/* r3-r7 are read on entry without being initialised here: the incoming arguments. */
/* M and N are the two dimensions (tested for <= 0 before any work is done). */
#define M r3
#define N r4
/* A: source pointer, advanced by LDA per source row. */
#define A r5
/* LDA: source row stride; scaled by ZBASE_SHIFT into a byte stride at entry. */
#define LDA r6
/* B: destination cursor for the 8-wide part of the output. */
#define B r7
/* A0-A3: pointers to the (up to four) source rows handled together. */
#define A0 r8
#define A1 r9
#define A2 r10
#define A3 r11
/* J: inner loop counter, initialised to N >> 3 by the driver. */
#define J r12
/* PREA/PREB: briefly hold the masks -4 and -2, then the prefetch offsets */
/* 384 (used with dcbt on the source rows) and M8 + 128 (used with dcbtst on BO). */
#define PREA r14
#define PREB r15
/* BO: destination pointer handed to the COPY_* macros. */
#define BO r16
/* B8: snapshot of B taken at the start of each row group. */
#define B8 r17
/* B4/B2/B1: destination cursors for the 4-, 2- and 1-wide leftovers of N. */
#define B4 r18
#define B2 r19
#define B1 r20
/* o4/o16/o32/o48: registers loaded with the constants 4, 16, 32, 48 and */
/* used as index operands of the indexed loads and stores. */
#define o4 r21
/* T2: holds the mask -8 during setup. */
#define T2 r22
/* I: outer loop counter, initialised to M >> 2 by the driver. */
#define I r23
#define o16 r24
#define o32 r25
#define o48 r26
/* NOTUS2: not referenced anywhere in the code visible here. */
#define NOTUS2 r27
/* M8: M << (3 + ZBASE_SHIFT); added to BO after every 8-wide copy. */
#define M8 r30
/* T1: scratch - store pointer inside the macros, andi. result in the driver. */
#define T1 r31
/* o0 is the literal 0, not a register alias: in the RA slot of an indexed */
/* load/store the ISA treats 0 as "no index" rather than as r0. */
#define o0 0
#include "cgemm_tcopy_macros_8_power8.S"
/* Bytes subtracted from SP for the frame; r14-r31 are saved at 144..280(SP). */
#define STACKSIZE 384
/*
 * Entry point of the POWER8 cgemm "tcopy" (unroll 8) packing routine.
 *
 * Register inputs (aliases are defined above this block):
 *   M, N  - the two dimensions; nothing is copied when either is <= 0
 *   A     - source pointer
 *   LDA   - source row stride in elements (converted to bytes below)
 *   B     - destination buffer
 * Result  : r3 = 0 on every path.
 * Saves and restores r14-r31 in a STACKSIZE-byte frame. The vector registers
 * touched by the copy macros (vs32-vs47) are not saved.
 *
 * All size arithmetic uses 64-bit instructions (cmpdi / sldi / mulld).
 * The 32-bit forms clear the upper half of the result (slwi) or test only
 * the low word (cmpwi), which silently truncates any byte stride or panel
 * offset of 4 GiB or more. For smaller values the results are identical.
 * NOTE(review): assumes M, N and LDA arrive as full 64-bit values, as the
 * existing `sradi.` on N in the included logic already does - confirm
 * against the C prototype.
 *
 * PROLOGUE / PROFCODE / EPILOGUE come from common.h; their expansion is not
 * visible here.
 */
PROLOGUE
PROFCODE

	/* Open the frame and save the non-volatile GPRs r14-r31. */
	addi	SP, SP, -STACKSIZE
	li	r0, 0

	std	r31, 144(SP)
	std	r30, 152(SP)
	std	r29, 160(SP)
	std	r28, 168(SP)
	std	r27, 176(SP)
	std	r26, 184(SP)
	std	r25, 192(SP)
	std	r24, 200(SP)
	std	r23, 208(SP)
	std	r22, 216(SP)
	std	r21, 224(SP)
	std	r20, 232(SP)
	std	r19, 240(SP)
	std	r18, 248(SP)
	std	r17, 256(SP)
	std	r16, 264(SP)
	std	r15, 272(SP)
	std	r14, 280(SP)

	/* Empty panel: go straight to the epilogue. */
	cmpdi	cr0, M, 0
	ble-	L999
	cmpdi	cr0, N, 0
	ble-	L999

	/* Element counts -> byte counts. */
	sldi	LDA, LDA, ZBASE_SHIFT		/* source row stride in bytes        */
	sldi	M8, M, 3 + ZBASE_SHIFT		/* bytes in one M x 8 output panel   */

	/* PREA and PREB serve as mask temporaries until they are reloaded below. */
	li	T2, -8
	li	PREA, -4
	li	PREB, -2

	and	B4, N, T2			/* N rounded down to a multiple of 8 */
	and	B2, N, PREA			/* N rounded down to a multiple of 4 */
	and	B1, N, PREB			/* N rounded down to a multiple of 2 */

	/* Output offset (in elements) at which each narrower tail begins. */
	mulld	B4, B4, M
	mulld	B2, B2, M
	mulld	B1, B1, M

	sldi	B4, B4, ZBASE_SHIFT
	sldi	B2, B2, ZBASE_SHIFT
	sldi	B1, B1, ZBASE_SHIFT

	/* Turn the byte offsets into absolute destination cursors. */
	add	B4, B4, B
	add	B2, B2, B
	add	B1, B1, B

	/* Prefetch distances used by dcbt / dcbtst in the 4x8 loop. */
	li	PREA, 384
	addi	PREB, M8, 128

	/* Constant index registers for the indexed loads and stores. */
	li	o4, 4
	li	o16, 16
	li	o32, 32
	li	o48, 48

#include "cgemm_tcopy_logic_8_power8.S"

L999:
	/* Common exit: return 0, restore r14-r31, release the frame. */
	li	r3, 0

	ld	r31, 144(SP)
	ld	r30, 152(SP)
	ld	r29, 160(SP)
	ld	r28, 168(SP)
	ld	r27, 176(SP)
	ld	r26, 184(SP)
	ld	r25, 192(SP)
	ld	r24, 200(SP)
	ld	r23, 208(SP)
	ld	r22, 216(SP)
	ld	r21, 224(SP)
	ld	r20, 232(SP)
	ld	r19, 240(SP)
	ld	r18, 248(SP)
	ld	r17, 256(SP)
	ld	r16, 264(SP)
	ld	r15, 272(SP)
	ld	r14, 280(SP)

	addi	SP, SP, STACKSIZE
	blr

EPILOGUE

View File

@ -0,0 +1,247 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/*
 * Copy driver, included into the function body between the setup code and
 * the L999 exit label.
 *
 * Source rows are consumed in groups of 4, then 2, then 1 (bits of M).
 * Within a row group the N direction is consumed 8 elements at a time, and
 * the leftover is handled by testing bits 4, 2 and 1 of N.
 *
 * Destination cursors:
 *   B  (via B8/BO) - 8-wide part; BO advances by M8 between 8-wide copies
 *   B4, B2, B1     - 4-, 2- and 1-wide leftovers; each advances by the
 *                    amount just written
 *
 * Changes relative to the previous revision:
 *   - `sradi.` replaces `srawi.` for I, so M is treated as a 64-bit value
 *     exactly like N is in the J counters.
 *   - `beq` replaces `ble` after every `andi.`. The andi. result is never
 *     negative, so both branch on the same condition; beq states the
 *     intent, "bit not set".
 */

/* ------------------------- groups of 4 source rows ------------------------- */

	sradi.	I, M, 2				/* I = number of 4-row groups */
	ble	CCOPYT_L2_BEGIN

CCOPYT_L4_BEGIN:

	mr	A0, A
	add	A1, A0, LDA
	add	A2, A1, LDA
	add	A3, A2, LDA
	add	A, A3, LDA			/* A -> first row of the next group */

	mr	B8, B
	addi	B, B, 64*SIZE			/* skip what one COPY_4x8 writes */

	sradi.	J, N, 3				/* J = number of 8-wide copies */
	ble	CCOPYT_L4x4_BEGIN

	mr	BO, B8

CCOPYT_L4x8_LOOP:

	/* Two COPY_4x8 per prefetch round; leave in the middle if J runs out. */
	dcbt	A0, PREA
	dcbt	A1, PREA
	dcbt	A2, PREA
	dcbt	A3, PREA
	dcbtst	BO, M8
	dcbtst	BO, PREB

	COPY_4x8
	add	BO, BO, M8
	addic.	J, J, -1
	ble	CCOPYT_L4x4_BEGIN

	COPY_4x8
	add	BO, BO, M8
	addic.	J, J, -1
	bgt	CCOPYT_L4x8_LOOP

CCOPYT_L4x4_BEGIN:

	andi.	T1, N, 4
	beq	CCOPYT_L4x2_BEGIN

	mr	BO, B4
	COPY_4x4
	addi	B4, B4, 32*SIZE

CCOPYT_L4x2_BEGIN:

	andi.	T1, N, 2
	beq	CCOPYT_L4x1_BEGIN

	mr	BO, B2
	COPY_4x2
	addi	B2, B2, 16*SIZE

CCOPYT_L4x1_BEGIN:

	andi.	T1, N, 1
	beq	CCOPYT_L4_END

	mr	BO, B1
	COPY_4x1
	addi	B1, B1, 8*SIZE

CCOPYT_L4_END:

	addic.	I, I, -1
	bgt	CCOPYT_L4_BEGIN

/* ------------------------- one group of 2 source rows ---------------------- */

CCOPYT_L2_BEGIN:

	andi.	T1, M, 2
	beq	CCOPYT_L1_BEGIN

	mr	A0, A
	add	A1, A0, LDA
	add	A, A1, LDA

	mr	B8, B
	addi	B, B, 32*SIZE

	sradi.	J, N, 3
	ble	CCOPYT_L2x4_BEGIN

	mr	BO, B8

CCOPYT_L2x8_LOOP:

	COPY_2x8
	add	BO, BO, M8
	addic.	J, J, -1
	bgt	CCOPYT_L2x8_LOOP

CCOPYT_L2x4_BEGIN:

	andi.	T1, N, 4
	beq	CCOPYT_L2x2_BEGIN

	mr	BO, B4
	COPY_2x4
	addi	B4, B4, 16*SIZE

CCOPYT_L2x2_BEGIN:

	andi.	T1, N, 2
	beq	CCOPYT_L2x1_BEGIN

	mr	BO, B2
	COPY_2x2
	addi	B2, B2, 8*SIZE

CCOPYT_L2x1_BEGIN:

	andi.	T1, N, 1
	beq	CCOPYT_L2_END

	mr	BO, B1
	COPY_2x1
	addi	B1, B1, 4*SIZE

CCOPYT_L2_END:

/* ------------------------- final single source row ------------------------- */

CCOPYT_L1_BEGIN:

	andi.	T1, M, 1
	beq	L999				/* no odd row left: done */

	mr	A0, A
	add	A, A0, LDA

	mr	B8, B
	addi	B, B, 16*SIZE

	sradi.	J, N, 3
	ble	CCOPYT_L1x4_BEGIN

	mr	BO, B8

CCOPYT_L1x8_LOOP:

	COPY_1x8
	add	BO, BO, M8
	addic.	J, J, -1
	bgt	CCOPYT_L1x8_LOOP

CCOPYT_L1x4_BEGIN:

	andi.	T1, N, 4
	beq	CCOPYT_L1x2_BEGIN

	mr	BO, B4
	COPY_1x4
	addi	B4, B4, 8*SIZE

CCOPYT_L1x2_BEGIN:

	andi.	T1, N, 2
	beq	CCOPYT_L1x1_BEGIN

	mr	BO, B2
	COPY_1x2
	addi	B2, B2, 4*SIZE

CCOPYT_L1x1_BEGIN:

	andi.	T1, N, 1
	beq	CCOPYT_L1_END

	mr	BO, B1
	COPY_1x1
	addi	B1, B1, 2*SIZE

CCOPYT_L1_END:

View File

@ -0,0 +1,385 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/*
 * COPY_4x8 - 4 source rows x 8 elements (64 bytes per row).
 *
 * Reads 64 bytes from each of A0, A1, A2, A3 and writes the four rows back
 * to back starting at BO (row k lands at BO + 64*k, 256 bytes in total).
 *
 * On exit : A0-A3 advanced by 64; BO unchanged.
 * Clobbers: T1 (left at BO + 192), vs32-vs47.
 * Needs   : o16/o32/o48 holding 16/32/48.
 */
.macro COPY_4x8

	/* row 0 -> vs32..vs35 */
	lxvw4x		vs32,	o0,	A0
	lxvw4x		vs33,	o16,	A0
	lxvw4x		vs34,	o32,	A0
	lxvw4x		vs35,	o48,	A0
	addi		A0,	A0,	64

	/* row 1 -> vs36..vs39 */
	lxvw4x		vs36,	o0,	A1
	lxvw4x		vs37,	o16,	A1
	lxvw4x		vs38,	o32,	A1
	lxvw4x		vs39,	o48,	A1
	addi		A1,	A1,	64

	/* row 2 -> vs40..vs43 */
	lxvw4x		vs40,	o0,	A2
	lxvw4x		vs41,	o16,	A2
	lxvw4x		vs42,	o32,	A2
	lxvw4x		vs43,	o48,	A2
	addi		A2,	A2,	64

	/* row 3 -> vs44..vs47 */
	lxvw4x		vs44,	o0,	A3
	lxvw4x		vs45,	o16,	A3
	lxvw4x		vs46,	o32,	A3
	lxvw4x		vs47,	o48,	A3
	addi		A3,	A3,	64

	/* T1 walks the destination in 64-byte steps, one row per step */
	mr		T1,	BO

	stxvw4x		vs32,	o0,	T1
	stxvw4x		vs33,	o16,	T1
	stxvw4x		vs34,	o32,	T1
	stxvw4x		vs35,	o48,	T1
	addi		T1,	T1,	64

	stxvw4x		vs36,	o0,	T1
	stxvw4x		vs37,	o16,	T1
	stxvw4x		vs38,	o32,	T1
	stxvw4x		vs39,	o48,	T1
	addi		T1,	T1,	64

	stxvw4x		vs40,	o0,	T1
	stxvw4x		vs41,	o16,	T1
	stxvw4x		vs42,	o32,	T1
	stxvw4x		vs43,	o48,	T1
	addi		T1,	T1,	64

	stxvw4x		vs44,	o0,	T1
	stxvw4x		vs45,	o16,	T1
	stxvw4x		vs46,	o32,	T1
	stxvw4x		vs47,	o48,	T1

.endm
/*
 * COPY_4x4 - 4 source rows x 4 elements (32 bytes per row).
 *
 * Reads 32 bytes from each of A0-A3 and writes the rows back to back at BO
 * (row k at BO + 32*k, 128 bytes in total).
 *
 * On exit : A0-A3 advanced by 32; BO unchanged.
 * Clobbers: T1 (left at BO + 64), vs32-vs39.
 */
.macro COPY_4x4

	mr		T1,	BO

	/* row 0 -> vs32, vs33 */
	lxvw4x		vs32,	o0,	A0
	lxvw4x		vs33,	o16,	A0
	addi		A0,	A0,	32

	/* row 1 -> vs34, vs35 */
	lxvw4x		vs34,	o0,	A1
	lxvw4x		vs35,	o16,	A1
	addi		A1,	A1,	32

	/* row 2 -> vs36, vs37 */
	lxvw4x		vs36,	o0,	A2
	lxvw4x		vs37,	o16,	A2
	addi		A2,	A2,	32

	/* row 3 -> vs38, vs39 */
	lxvw4x		vs38,	o0,	A3
	lxvw4x		vs39,	o16,	A3
	addi		A3,	A3,	32

	/* rows 0 and 1 fill the first 64 bytes */
	stxvw4x		vs32,	o0,	T1
	stxvw4x		vs33,	o16,	T1
	stxvw4x		vs34,	o32,	T1
	stxvw4x		vs35,	o48,	T1
	addi		T1,	T1,	64

	/* rows 2 and 3 fill the second 64 bytes */
	stxvw4x		vs36,	o0,	T1
	stxvw4x		vs37,	o16,	T1
	stxvw4x		vs38,	o32,	T1
	stxvw4x		vs39,	o48,	T1

.endm
/*
 * COPY_4x2 - 4 source rows x 2 elements (16 bytes per row).
 *
 * One 16-byte vector is read from each of A0-A3 and the four vectors are
 * written consecutively at BO (64 bytes in total).
 *
 * On exit : A0-A3 advanced by 16; BO unchanged.
 * Clobbers: T1 (= BO), vs32-vs35.
 */
.macro COPY_4x2

	mr		T1,	BO

	lxvw4x		vs32,	o0,	A0	/* row 0 */
	lxvw4x		vs33,	o0,	A1	/* row 1 */
	lxvw4x		vs34,	o0,	A2	/* row 2 */
	lxvw4x		vs35,	o0,	A3	/* row 3 */

	addi		A0,	A0,	16
	addi		A1,	A1,	16
	addi		A2,	A2,	16
	addi		A3,	A3,	16

	stxvw4x		vs32,	o0,	T1
	stxvw4x		vs33,	o16,	T1
	stxvw4x		vs34,	o32,	T1
	stxvw4x		vs35,	o48,	T1

.endm
/*
 * COPY_4x1 - 4 source rows x 1 element.
 *
 * An element is moved as two single-precision scalars (offsets 0 and 4,
 * 8 bytes in total). The four elements are written consecutively at BO
 * (32 bytes in total).
 *
 * On exit : A0-A3 advanced by 8; BO unchanged.
 * Clobbers: T1 (left at BO + 24), vs32-vs39.
 * Needs   : o4 holding 4.
 */
.macro COPY_4x1

	mr		T1,	BO

	/* row 0 */
	lxsspx		vs32,	o0,	A0
	lxsspx		vs33,	o4,	A0
	addi		A0,	A0,	8

	/* row 1 */
	lxsspx		vs34,	o0,	A1
	lxsspx		vs35,	o4,	A1
	addi		A1,	A1,	8

	/* row 2 */
	lxsspx		vs36,	o0,	A2
	lxsspx		vs37,	o4,	A2
	addi		A2,	A2,	8

	/* row 3 */
	lxsspx		vs38,	o0,	A3
	lxsspx		vs39,	o4,	A3
	addi		A3,	A3,	8

	/* one element per 8-byte destination slot */
	stxsspx		vs32,	o0,	T1
	stxsspx		vs33,	o4,	T1
	addi		T1,	T1,	8

	stxsspx		vs34,	o0,	T1
	stxsspx		vs35,	o4,	T1
	addi		T1,	T1,	8

	stxsspx		vs36,	o0,	T1
	stxsspx		vs37,	o4,	T1
	addi		T1,	T1,	8

	stxsspx		vs38,	o0,	T1
	stxsspx		vs39,	o4,	T1

.endm
/*
 * COPY_2x8 - 2 source rows x 8 elements (64 bytes per row).
 *
 * Reads 64 bytes from A0 and from A1 and writes row 0 at BO and row 1 at
 * BO + 64 (128 bytes in total).
 *
 * On exit : A0, A1 advanced by 64; BO unchanged.
 * Clobbers: T1 (left at BO + 64), vs32-vs39.
 */
.macro COPY_2x8

	mr		T1,	BO

	/* row 0 -> vs32..vs35 */
	lxvw4x		vs32,	o0,	A0
	lxvw4x		vs33,	o16,	A0
	lxvw4x		vs34,	o32,	A0
	lxvw4x		vs35,	o48,	A0
	addi		A0,	A0,	64

	/* row 1 -> vs36..vs39 */
	lxvw4x		vs36,	o0,	A1
	lxvw4x		vs37,	o16,	A1
	lxvw4x		vs38,	o32,	A1
	lxvw4x		vs39,	o48,	A1
	addi		A1,	A1,	64

	stxvw4x		vs32,	o0,	T1
	stxvw4x		vs33,	o16,	T1
	stxvw4x		vs34,	o32,	T1
	stxvw4x		vs35,	o48,	T1
	addi		T1,	T1,	64

	stxvw4x		vs36,	o0,	T1
	stxvw4x		vs37,	o16,	T1
	stxvw4x		vs38,	o32,	T1
	stxvw4x		vs39,	o48,	T1

.endm
/*
 * COPY_2x4 - 2 source rows x 4 elements (32 bytes per row).
 *
 * Row 0 (from A0) is written at BO, row 1 (from A1) at BO + 32.
 *
 * On exit : A0, A1 advanced by 32; BO unchanged.
 * Clobbers: T1 (= BO), vs32-vs35.
 */
.macro COPY_2x4

	mr		T1,	BO

	/* row 0 */
	lxvw4x		vs32,	o0,	A0
	lxvw4x		vs33,	o16,	A0
	addi		A0,	A0,	32

	/* row 1 */
	lxvw4x		vs34,	o0,	A1
	lxvw4x		vs35,	o16,	A1
	addi		A1,	A1,	32

	stxvw4x		vs32,	o0,	T1
	stxvw4x		vs33,	o16,	T1
	stxvw4x		vs34,	o32,	T1
	stxvw4x		vs35,	o48,	T1

.endm
/*
 * COPY_2x2 - 2 source rows x 2 elements (16 bytes per row).
 *
 * One vector from A0 goes to BO, one vector from A1 goes to BO + 16.
 *
 * On exit : A0, A1 advanced by 16; BO unchanged.
 * Clobbers: T1 (= BO), vs32, vs33.
 */
.macro COPY_2x2

	mr		T1,	BO

	lxvw4x		vs32,	o0,	A0	/* row 0 */
	lxvw4x		vs33,	o0,	A1	/* row 1 */

	addi		A0,	A0,	16
	addi		A1,	A1,	16

	stxvw4x		vs32,	o0,	T1
	stxvw4x		vs33,	o16,	T1

.endm
/*
 * COPY_2x1 - 2 source rows x 1 element.
 *
 * Each element is moved as two single-precision scalars (offsets 0 and 4).
 * The element from A0 is written at BO, the one from A1 at BO + 8.
 *
 * On exit : A0, A1 advanced by 8; BO unchanged.
 * Clobbers: T1 (left at BO + 8), vs32-vs35.
 * Needs   : o4 holding 4.
 */
.macro COPY_2x1

	mr		T1,	BO

	/* row 0 */
	lxsspx		vs32,	o0,	A0
	lxsspx		vs33,	o4,	A0
	addi		A0,	A0,	8

	/* row 1 */
	lxsspx		vs34,	o0,	A1
	lxsspx		vs35,	o4,	A1
	addi		A1,	A1,	8

	stxsspx		vs32,	o0,	T1
	stxsspx		vs33,	o4,	T1
	addi		T1,	T1,	8

	stxsspx		vs34,	o0,	T1
	stxsspx		vs35,	o4,	T1

.endm
/*
 * COPY_1x8 - 1 source row x 8 elements (64 bytes).
 *
 * Straight 64-byte copy from A0 to BO using four 16-byte vectors.
 *
 * On exit : A0 advanced by 64; BO unchanged.
 * Clobbers: T1 (= BO), vs32-vs35.
 */
.macro COPY_1x8

	mr		T1,	BO

	lxvw4x		vs32,	o0,	A0
	lxvw4x		vs33,	o16,	A0
	lxvw4x		vs34,	o32,	A0
	lxvw4x		vs35,	o48,	A0
	addi		A0,	A0,	64

	stxvw4x		vs32,	o0,	T1
	stxvw4x		vs33,	o16,	T1
	stxvw4x		vs34,	o32,	T1
	stxvw4x		vs35,	o48,	T1

.endm
/*
 * COPY_1x4 - 1 source row x 4 elements (32 bytes).
 *
 * Straight 32-byte copy from A0 to BO using two 16-byte vectors.
 *
 * On exit : A0 advanced by 32; BO unchanged.
 * Clobbers: T1 (= BO), vs32, vs33.
 */
.macro COPY_1x4

	mr		T1,	BO

	lxvw4x		vs32,	o0,	A0
	lxvw4x		vs33,	o16,	A0
	addi		A0,	A0,	32

	stxvw4x		vs32,	o0,	T1
	stxvw4x		vs33,	o16,	T1

.endm
/*
 * COPY_1x2 - 1 source row x 2 elements (16 bytes).
 *
 * Copies a single 16-byte vector from A0 to BO.
 *
 * On exit : A0 advanced by 16; BO unchanged.
 * Clobbers: T1 (= BO), vs32.
 */
.macro COPY_1x2

	mr		T1,	BO

	lxvw4x		vs32,	o0,	A0
	addi		A0,	A0,	16

	stxvw4x		vs32,	o0,	T1

.endm
/*
 * COPY_1x1 - 1 source row x 1 element.
 *
 * Copies one element from A0 to BO as two single-precision scalars
 * (offsets 0 and 4, 8 bytes in total).
 *
 * On exit : A0 advanced by 8; BO unchanged.
 * Clobbers: T1 (= BO), vs32, vs33.
 * Needs   : o4 holding 4.
 */
.macro COPY_1x1

	mr		T1,	BO

	lxsspx		vs32,	o0,	A0
	lxsspx		vs33,	o4,	A0
	addi		A0,	A0,	8

	stxsspx		vs32,	o0,	T1
	stxsspx		vs33,	o4,	T1

.endm