From 879a51165f952830fc9c27df326bcad70c4c7cb6 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 22 Apr 2016 13:07:12 +0200 Subject: [PATCH] Optimized zgemm and tested zgemm again --- kernel/power/KERNEL.POWER8 | 2 +- kernel/power/zgemm_kernel_8x2_power8.S | 72 ++- kernel/power/zgemm_logic_8x2_power8.S | 69 ++- kernel/power/zgemm_macros_8x2_power8.S | 108 +++++ kernel/power/zgemm_tcopy_8_power8.S | 205 ++++++++ kernel/power/zgemm_tcopy_logic_8_power8.S | 246 ++++++++++ kernel/power/zgemm_tcopy_macros_8_power8.S | 535 +++++++++++++++++++++ param.h | 2 +- 8 files changed, 1227 insertions(+), 12 deletions(-) create mode 100644 kernel/power/zgemm_tcopy_8_power8.S create mode 100644 kernel/power/zgemm_tcopy_logic_8_power8.S create mode 100644 kernel/power/zgemm_tcopy_macros_8_power8.S diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index e1b89cc97..c7df0e039 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -42,7 +42,7 @@ ZGEMMKERNEL = zgemm_kernel_8x2_power8.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c -ZGEMMITCOPY = ../generic/zgemm_tcopy_8.c +ZGEMMITCOPY = zgemm_tcopy_8_power8.S ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o ZGEMMINCOPYOBJ = zgemm_incopy.o diff --git a/kernel/power/zgemm_kernel_8x2_power8.S b/kernel/power/zgemm_kernel_8x2_power8.S index 336b13b1f..02c94a88a 100644 --- a/kernel/power/zgemm_kernel_8x2_power8.S +++ b/kernel/power/zgemm_kernel_8x2_power8.S @@ -1,3 +1,73 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/22 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/22 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ @@ -250,7 +320,7 @@ ble L999 slwi LDC, LDC, ZBASE_SHIFT - li PRE, 384 + li PRE, 512 li o8 , 8 li o16 , 16 li o24 , 24 diff --git a/kernel/power/zgemm_logic_8x2_power8.S b/kernel/power/zgemm_logic_8x2_power8.S index 96612da82..0cd784cc0 100644 --- a/kernel/power/zgemm_logic_8x2_power8.S +++ b/kernel/power/zgemm_logic_8x2_power8.S @@ -1,3 +1,39 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
 srawi. J, N, 1
 ble ZGEMM_L2_END

@@ -5,20 +41,34 @@ ZGEMM_L2_BEGIN:

 mr BO, B
 mr BBO, BBUFFER
- slwi T1, K, 1
+ srawi. T1, K, 2	// T1 = K / 4: number of ZCOPYB_8x1 passes (each pass consumes 8 complex values from BO)
+ ble ZGEMM_L2_COPYB1	// fewer than 4 steps of K: only the remainder loop runs

-ZGEMM_L2_COPYB:
+ZGEMM_L2_COPYB8:	// bulk loop: expand BO into BBO, one ZCOPYB_8x1 per iteration

- lxvdsx vs4, o0, BO // b0_r
- lxvdsx vs5, o8, BO // b0_i
- addi BO, BO, 16
- stxvd2x vs4, o0, BBO
- stxvd2x vs5, o16, BBO
+ addi T2, PRE, 128	// second prefetch offset, 128 bytes beyond PRE
+ dcbt BO, PRE	// prefetch-for-load hint on the source at BO + PRE
+ dcbtst BBO, PRE	// prefetch-for-store hints on the destination at BBO + PRE ...
+ dcbtst BBO, T2	// ... and at BBO + PRE + 128
+ ZCOPYB_8x1	// advances BO by 128 bytes and BBO by 256 bytes (see the macro definition)
 addic. T1, T1, -1
- addi BBO, BBO, 32
- bge ZGEMM_L2_COPYB
+ bgt ZGEMM_L2_COPYB8	// loop while the decremented counter is still positive

+ZGEMM_L2_COPYB1:	// remainder: the K mod 4 steps not covered by the bulk loop
+
+ andi. T1, K, 3	// T1 = K mod 4
+ ble ZGEMM_L2_COPYB_END	// andi. result is never negative, so this branches only when it is zero
+
+ZGEMM_L2_COPYB_LOOP:
+
+ ZCOPYB_1x1	// two single-value expansions per iteration ...
+ ZCOPYB_1x1	// ... matching the 2 values per K step implied by the removed "slwi T1, K, 1"
+ addic. T1, T1, -1
+
+ bgt ZGEMM_L2_COPYB_LOOP
+
+ZGEMM_L2_COPYB_END:	// BBUFFER is filled; the existing kernel code continues below

 mr CO, C
 mr AO, A

@@ -493,6 +543,7 @@ ZGEMM_L1_BEGIN:
 slwi T1, K, 0

 ZGEMM_L1_COPYB:
+ dcbtst BBO, PRE	// prefetch-for-store hint on the destination ahead of the stores in this loop

 lxvdsx vs4, o0, BO // b0_r
 lxvdsx vs5, o8, BO // b0_i
diff --git a/kernel/power/zgemm_macros_8x2_power8.S b/kernel/power/zgemm_macros_8x2_power8.S
index a0fbb2e11..c43a115b2 100644
--- a/kernel/power/zgemm_macros_8x2_power8.S
+++ b/kernel/power/zgemm_macros_8x2_power8.S
@@ -1,3 +1,38 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2.
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)

 #define XSFADD_R1 xsadddp
@@ -3055,3 +3090,76 @@

 .endm

+
+
+.macro ZCOPYB_1x1	// expand one (real, imag) pair at BO into two splatted 16-byte vectors at BBO
+
+ lxvdsx vs4, o0, BO // b0_r
+ lxvdsx vs5, o8, BO // b0_i
+ addi BO, BO, 16	// one 16-byte source pair consumed
+ stxvd2x vs4, o0, BBO
+ stxvd2x vs5, o16, BBO
+ addi BBO, BBO, 32	// two 16-byte vectors produced
+
+.endm
+
+
+.macro ZCOPYB_8x1	// same expansion for eight pairs per call: 128 bytes read from BO, 256 bytes written to BBO
+
+ lxvd2x vs32, o0, BO	// pairs 0-3, one 16-byte vector (real, imag) each
+ lxvd2x vs33, o16, BO
+ lxvd2x vs34, o32, BO
+ lxvd2x vs35, o48, BO
+ addi BO, BO, 64
+
+ lxvd2x vs36, o0, BO	// pairs 4-7
+ lxvd2x vs37, o16, BO
+ lxvd2x vs38, o32, BO
+ lxvd2x vs39, o48, BO
+ addi BO, BO, 64
+
+ xxspltd vs40, vs32, 0	// replicate doubleword 0 of each loaded pair across a full vector ...
+ xxspltd vs41, vs32, 1	// ... then doubleword 1, giving two output vectors per input pair
+ xxspltd vs42, vs33, 0
+ xxspltd vs43, vs33, 1
+ xxspltd vs44, vs34, 0
+ xxspltd vs45, vs34, 1
+ xxspltd vs46, vs35, 0
+ xxspltd vs47, vs35, 1
+
+ xxspltd vs48, vs36, 0
+ xxspltd vs49, vs36, 1
+ xxspltd vs50, vs37, 0
+ xxspltd vs51, vs37, 1
+ xxspltd vs52, vs38, 0
+ xxspltd vs53, vs38, 1
+ xxspltd vs54, vs39, 0
+ xxspltd vs55, vs39, 1
+
+ stxvd2x vs40, o0, BBO	// store the 16 splatted vectors in order, 64 bytes per group
+ stxvd2x vs41, o16, BBO
+ stxvd2x vs42, o32, BBO
+ stxvd2x vs43, o48, BBO
+ addi BBO, BBO, 64
+
+ stxvd2x vs44, o0, BBO
+ stxvd2x vs45, o16, BBO
+ stxvd2x vs46, o32, BBO
+ stxvd2x vs47, o48, BBO
+ addi BBO, BBO, 64
+
+ stxvd2x vs48, o0, BBO
+ stxvd2x vs49, o16, BBO
+ stxvd2x vs50, o32, BBO
+ stxvd2x vs51, o48, BBO
+ addi BBO, BBO, 64
+
+ stxvd2x vs52, o0, BBO
+ stxvd2x vs53, o16, BBO
+ stxvd2x vs54, o32, BBO
+ stxvd2x vs55, o48, BBO
+ addi BBO, BBO, 64	// BBO ends 256 bytes past its value on entry
+
+.endm
+
+
diff --git a/kernel/power/zgemm_tcopy_8_power8.S b/kernel/power/zgemm_tcopy_8_power8.S
new file mode 100644
index 000000000..1f3f35419
--- /dev/null
+++ b/kernel/power/zgemm_tcopy_8_power8.S
@@ -0,0
+1,205 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/04/22 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#define M r3 /* NOTE(review): r3-r7 presumably carry the five call arguments in this order - confirm against the C prototype of the copy routine */
+#define N r4
+#define A r5 /* source cursor; the logic file advances it by LDA per source line */
+#define LDA r6 /* distance between consecutive source lines, scaled by ZBASE_SHIFT below */
+#define B r7 /* destination cursor for the 8-wide panels */
+
+#define A0 r8 /* A0-A3: per-line source pointers read by the COPY_* macros */
+#define A1 r9
+#define A2 r10
+#define A3 r11
+
+#define J r12 /* inner loop counter (N / 8) in the logic file */
+
+#define PREA r14 /* temporary mask first, then the dcbt offset applied to A0-A3 */
+#define PREB r15 /* temporary mask first, then a dcbtst offset applied to BO */
+#define BO r16 /* destination pointer handed to the COPY_* macros */
+#define B8 r17 /* destination base of the 8-wide loop for the current pass */
+#define B4 r18 /* destination cursor for the 4-wide remainder */
+#define B2 r19 /* destination cursor for the 2-wide remainder */
+#define B1 r20 /* destination cursor for the 1-wide remainder */
+#define NOTUS1 r21 /* not referenced in the tcopy files shown in this patch */
+#define T2 r22 /* scratch */
+#define I r23 /* outer loop counter (M / 4) in the logic file */
+#define o16 r24 /* index registers loaded with 16, 32 and 48 below */
+#define o32 r25
+#define o48 r26
+#define NOTUS2 r27 /* not referenced in the tcopy files shown in this patch */
+#define M8 r30 /* byte distance between successive 8-wide panels (added to BO per panel) */
+#define T1 r31 /* scratch; clobbered by every COPY_* macro */
+
+#define o0 0 /* literal 0 in the index slot: the address is just the base register */
+
+#include "zgemm_tcopy_macros_8_power8.S"
+
+#define STACKSIZE 384 /* frame size; the saves below use offsets 144 to 280 */
+
+
+ PROLOGUE
+ PROFCODE
+
+ addi SP, SP, -STACKSIZE	// reserve the frame that holds the register saves below
+ li r0, 0
+
+ std r31, 144(SP)	// save r14-r31; r28 and r29 are saved although no #define above names them
+ std r30, 152(SP)
+ std r29, 160(SP)
+ std r28, 168(SP)
+ std r27, 176(SP)
+ std r26, 184(SP)
+ std r25, 192(SP)
+ std r24, 200(SP)
+ std r23, 208(SP)
+ std r22, 216(SP)
+ std r21, 224(SP)
+ std r20, 232(SP)
+ std r19, 240(SP)
+ std r18, 248(SP)
+ std r17, 256(SP)
+ std r16, 264(SP)
+ std r15, 272(SP)
+ std r14, 280(SP)
+
+ cmpwi cr0, M, 0	// NOTE(review): 32-bit compare, while the logic file shifts N with the 64-bit sradi - confirm M and N always fit in 32 bits
+ ble- L999	// nothing to copy
+ cmpwi cr0, N, 0
+ ble- L999
+
+ slwi LDA, LDA, ZBASE_SHIFT	// presumably converts LDA from elements to bytes (ZBASE_SHIFT comes from the included headers); 32-bit shift
+ slwi M8, M, 3 + ZBASE_SHIFT	// M8 = M * 8, scaled the same way
+
+ li T2, -8	// masks that clear the low 3, 2 and 1 bits of N
+ li PREA, -4
+ li PREB, -2
+
+ and B4, N, T2	// N rounded down to a multiple of 8
+ and B2, N, PREA	// N rounded down to a multiple of 4
+ and B1, N, PREB	// N rounded down to a multiple of 2
+
+ mullw B4, B4, M	// times M; NOTE(review): 32-bit multiply, see the note on cmpwi above
+ mullw B2, B2, M
+ mullw B1, B1, M
+
+ slwi B4, B4, ZBASE_SHIFT	// scale the three offsets by ZBASE_SHIFT like LDA
+ slwi B2, B2, ZBASE_SHIFT
+ slwi B1, B1, ZBASE_SHIFT
+
+ add B4, B4, B	// B4/B2/B1 = B + offset: where the 4-, 2- and 1-wide remainders are written
+ add B2, B2, B
+ add B1, B1, B
+
+ li PREA, 384	// the masks are no longer needed; PREA and PREB now hold prefetch offsets
+ addi PREB, M8, 128
+
+ li o16, 16
+ li o32, 32
+ li o48, 48
+
+#include "zgemm_tcopy_logic_8_power8.S"
+
+L999:
+
+ li r3, 0	// r3 = 0 on every exit path
+
+ ld r31, 144(SP)	// restore r14-r31 from the same slots as the saves
+ ld r30, 152(SP)
+ ld r29, 160(SP)
+ ld r28, 168(SP)
+ ld r27, 176(SP)
+ ld r26, 184(SP)
+ ld r25, 192(SP)
+ ld r24, 200(SP)
+ ld r23, 208(SP)
+ ld r22, 216(SP)
+ ld r21, 224(SP)
+ ld r20, 232(SP)
+ ld r19, 240(SP)
+ ld r18, 248(SP)
+ ld r17, 256(SP)
+ ld r16, 264(SP)
+ ld r15, 272(SP)
+ ld r14, 280(SP)
+
+ addi SP, SP, STACKSIZE	// release the frame
+
+ blr
+ EPILOGUE
+
+
diff --git a/kernel/power/zgemm_tcopy_logic_8_power8.S b/kernel/power/zgemm_tcopy_logic_8_power8.S
new file mode 100644
index 000000000..34fd307bd
--- /dev/null
+++ b/kernel/power/zgemm_tcopy_logic_8_power8.S
@@ -0,0 +1,246 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+ srawi. I, M, 2	// I = M / 4: number of four-line passes; NOTE(review): 32-bit shift here, 64-bit sradi is used on N below - confirm M fits in 32 bits
+ ble ZCOPYT_L2_BEGIN	// fewer than four lines: go to the two-line / one-line tails
+
+
+ZCOPYT_L4_BEGIN:	// one pass over four source lines spaced LDA apart
+
+ mr A0, A
+ add A1, A0, LDA
+ add A2, A1, LDA
+ add A3, A2, LDA
+ add A, A3, LDA	// A now points at the first line of the next pass
+ mr B8, B	// this pass writes its 8-wide groups starting at the current B
+ addi B, B, 64*SIZE	// next pass starts 64*SIZE further on; COPY_4x8 writes 512 bytes, which matches if SIZE is 8 - confirm in the headers
+
+ sradi. J, N, 3	// J = N / 8: number of 8-wide groups per line
+ ble ZCOPYT_L4x4_BEGIN
+
+ mr BO, B8
+
+ .align 5
+
+ZCOPYT_L4x8_LOOP:
+
+ addi T1, PREB, 128	// two further store-prefetch offsets beyond PREB
+ addi T2, PREB, 256
+ dcbt A0, PREA	// prefetch-for-load hints on the four source lines
+ dcbt A1, PREA
+ dcbt A2, PREA
+ dcbt A3, PREA
+ dcbtst BO, M8	// prefetch-for-store hints around the slot in the next panel
+ dcbtst BO, PREB
+ dcbtst BO, T1
+ dcbtst BO, T2
+
+ COPY_4x8	// advances A0-A3 by 128 bytes each, writes 512 bytes at BO, clobbers T1
+
+ add BO, BO, M8	// same slot in the next 8-wide panel
+
+ addic. J, J, -1
+ bgt ZCOPYT_L4x8_LOOP
+
+ZCOPYT_L4x4_BEGIN:
+
+ andi. T1, N, 4	// is there a 4-wide remainder?
+ ble ZCOPYT_L4x2_BEGIN	// andi. result is never negative, so this branches only when the bit is clear
+
+ mr BO, B4
+
+ COPY_4x4
+
+
+ addi B4, B4, 32*SIZE	// step the 4-wide cursor past what this pass wrote
+
+ZCOPYT_L4x2_BEGIN:
+
+ andi. T1, N, 2	// is there a 2-wide remainder?
+ ble ZCOPYT_L4x1_BEGIN
+
+ mr BO, B2
+
+ COPY_4x2
+
+
+ addi B2, B2, 16*SIZE
+
+ZCOPYT_L4x1_BEGIN:
+
+ andi. T1, N, 1	// is there a final single element per line?
+ ble ZCOPYT_L4_END
+
+ mr BO, B1
+
+ COPY_4x1
+
+
+ addi B1, B1, 8*SIZE
+
+ZCOPYT_L4_END:
+
+ addic. I, I, -1
+ bgt ZCOPYT_L4_BEGIN	// next group of four lines
+
+
+
+ZCOPYT_L2_BEGIN:	// tail: two remaining lines, same structure with the COPY_2x* macros
+
+ andi. T1, M, 2
+ ble ZCOPYT_L1_BEGIN
+
+ mr A0, A
+ add A1, A0, LDA
+ add A, A1, LDA
+ mr B8, B
+ addi B, B, 32*SIZE	// half the four-line step
+
+ sradi. J, N, 3
+ ble ZCOPYT_L2x4_BEGIN
+
+ mr BO, B8
+
+ZCOPYT_L2x8_LOOP:
+
+ COPY_2x8
+
+ add BO, BO, M8
+
+ addic. J, J, -1
+ bgt ZCOPYT_L2x8_LOOP
+
+ZCOPYT_L2x4_BEGIN:
+
+ andi. T1, N, 4
+ ble ZCOPYT_L2x2_BEGIN
+
+ mr BO, B4
+
+ COPY_2x4
+
+
+ addi B4, B4, 16*SIZE
+
+ZCOPYT_L2x2_BEGIN:
+
+ andi. T1, N, 2
+ ble ZCOPYT_L2x1_BEGIN
+
+ mr BO, B2
+
+ COPY_2x2
+
+
+ addi B2, B2, 8*SIZE
+
+ZCOPYT_L2x1_BEGIN:
+
+ andi. T1, N, 1
+ ble ZCOPYT_L2_END
+
+ mr BO, B1
+
+ COPY_2x1
+
+
+ addi B1, B1, 4*SIZE
+
+ZCOPYT_L2_END:
+
+
+ZCOPYT_L1_BEGIN:	// tail: one remaining line, using the COPY_1x* macros
+
+ andi. T1, M, 1
+ ble L999	// L999 is defined in the file that includes this one
+
+ mr A0, A
+ add A, A0, LDA
+ mr B8, B
+ addi B, B, 16*SIZE
+
+ sradi. J, N, 3
+ ble ZCOPYT_L1x4_BEGIN
+
+ mr BO, B8
+
+ZCOPYT_L1x8_LOOP:
+
+ COPY_1x8
+
+ add BO, BO, M8
+
+ addic. J, J, -1
+ bgt ZCOPYT_L1x8_LOOP
+
+ZCOPYT_L1x4_BEGIN:
+
+ andi. T1, N, 4
+ ble ZCOPYT_L1x2_BEGIN
+
+ mr BO, B4
+
+ COPY_1x4
+
+
+ addi B4, B4, 8*SIZE
+
+ZCOPYT_L1x2_BEGIN:
+
+ andi. T1, N, 2
+ ble ZCOPYT_L1x1_BEGIN
+
+ mr BO, B2
+
+ COPY_1x2
+
+
+ addi B2, B2, 4*SIZE
+
+ZCOPYT_L1x1_BEGIN:
+
+ andi. T1, N, 1
+ ble ZCOPYT_L1_END
+
+ mr BO, B1
+
+ COPY_1x1
+
+
+ addi B1, B1, 2*SIZE
+
+ZCOPYT_L1_END:	// falls through to L999 in the including file
+
diff --git a/kernel/power/zgemm_tcopy_macros_8_power8.S b/kernel/power/zgemm_tcopy_macros_8_power8.S
new file mode 100644
index 000000000..e8c2f0baa
--- /dev/null
+++ b/kernel/power/zgemm_tcopy_macros_8_power8.S
@@ -0,0 +1,535 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2.
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=8
+**********************************************************************************************/
+
+.macro COPY_4x8	// read 8 16-byte vectors from each of A0-A3 (advancing each by 128 bytes), write all 32 back-to-back at BO
+
+ lxvd2x vs32, o0, A0	// A0, vectors 0-3
+ lxvd2x vs33, o16, A0
+ lxvd2x vs34, o32, A0
+ lxvd2x vs35, o48, A0
+ addi A0, A0, 64
+
+ lxvd2x vs36, o0, A0	// A0, vectors 4-7
+ lxvd2x vs37, o16, A0
+ lxvd2x vs38, o32, A0
+ lxvd2x vs39, o48, A0
+ addi A0, A0, 64
+
+
+ lxvd2x vs40, o0, A1	// same pattern for A1 ...
+ lxvd2x vs41, o16, A1
+ lxvd2x vs42, o32, A1
+ lxvd2x vs43, o48, A1
+ addi A1, A1, 64
+
+ lxvd2x vs44, o0, A1
+ lxvd2x vs45, o16, A1
+ lxvd2x vs46, o32, A1
+ lxvd2x vs47, o48, A1
+ addi A1, A1, 64
+
+
+ lxvd2x vs48, o0, A2	// ... A2 ...
+ lxvd2x vs49, o16, A2
+ lxvd2x vs50, o32, A2
+ lxvd2x vs51, o48, A2
+ addi A2, A2, 64
+
+ lxvd2x vs52, o0, A2
+ lxvd2x vs53, o16, A2
+ lxvd2x vs54, o32, A2
+ lxvd2x vs55, o48, A2
+ addi A2, A2, 64
+
+
+ lxvd2x vs56, o0, A3	// ... and A3
+ lxvd2x vs57, o16, A3
+ lxvd2x vs58, o32, A3
+ lxvd2x vs59, o48, A3
+ addi A3, A3, 64
+
+ lxvd2x vs60, o0, A3
+ lxvd2x vs61, o16, A3
+ lxvd2x vs62, o32, A3
+ lxvd2x vs63, o48, A3
+ addi A3, A3, 64
+
+
+ mr T1, BO	// T1 is the running store pointer; BO itself is left unchanged for the caller
+
+ stxvd2x vs32, o0, T1	// stores follow load order: all of A0, then A1, A2, A3
+ stxvd2x vs33, o16, T1
+ stxvd2x vs34, o32, T1
+ stxvd2x vs35, o48, T1
+ addi T1, T1, 64
+
+ stxvd2x vs36, o0, T1
+ stxvd2x vs37, o16, T1
+ stxvd2x vs38, o32, T1
+ stxvd2x vs39, o48, T1
+
+ addi T1, T1, 64
+
+ stxvd2x vs40, o0, T1
+ stxvd2x vs41, o16, T1
+ stxvd2x vs42, o32, T1
+ stxvd2x vs43, o48, T1
+ addi T1, T1, 64
+
+ stxvd2x vs44, o0, T1
+ stxvd2x vs45, o16, T1
+ stxvd2x vs46, o32, T1
+ stxvd2x vs47, o48, T1
+
+ addi T1, T1, 64
+
+ stxvd2x vs48, o0, T1
+ stxvd2x vs49, o16, T1
+ stxvd2x vs50, o32, T1
+ stxvd2x vs51, o48, T1
+ addi T1, T1, 64
+
+ stxvd2x vs52, o0, T1
+ stxvd2x vs53, o16, T1
+ stxvd2x vs54, o32, T1
+ stxvd2x vs55, o48, T1
+
+ addi T1, T1, 64
+
+ stxvd2x vs56, o0, T1
+ stxvd2x vs57, o16, T1
+ stxvd2x vs58, o32, T1
+ stxvd2x vs59, o48, T1
+ addi T1, T1, 64
+
+ stxvd2x vs60, o0, T1
+ stxvd2x vs61, o16, T1
+ stxvd2x vs62, o32, T1
+ stxvd2x vs63, o48, T1	// last of 512 bytes written
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=4
+**********************************************************************************************/
+
+.macro COPY_4x4	// read 4 vectors from each of A0-A3 (advancing each by 64 bytes), write 256 contiguous bytes at BO
+
+ lxvd2x vs32, o0, A0
+ lxvd2x vs33, o16, A0
+ lxvd2x vs34, o32, A0
+ lxvd2x vs35, o48, A0
+ addi A0, A0, 64
+
+
+ lxvd2x vs36, o0, A1
+ lxvd2x vs37, o16, A1
+ lxvd2x vs38, o32, A1
+ lxvd2x vs39, o48, A1
+ addi A1, A1, 64
+
+
+ lxvd2x vs40, o0, A2
+ lxvd2x vs41, o16, A2
+ lxvd2x vs42, o32, A2
+ lxvd2x vs43, o48, A2
+ addi A2, A2, 64
+
+
+ lxvd2x vs44, o0, A3
+ lxvd2x vs45, o16, A3
+ lxvd2x vs46, o32, A3
+ lxvd2x vs47, o48, A3
+ addi A3, A3, 64
+
+
+ mr T1, BO	// store through T1 so BO is preserved
+
+ stxvd2x vs32, o0, T1
+ stxvd2x vs33, o16, T1
+ stxvd2x vs34, o32, T1
+ stxvd2x vs35, o48, T1
+
+ addi T1, T1, 64
+
+ stxvd2x vs36, o0, T1
+ stxvd2x vs37, o16, T1
+ stxvd2x vs38, o32, T1
+ stxvd2x vs39, o48, T1
+
+ addi T1, T1, 64
+
+ stxvd2x vs40, o0, T1
+ stxvd2x vs41, o16, T1
+ stxvd2x vs42, o32, T1
+ stxvd2x vs43, o48, T1
+
+ addi T1, T1, 64
+
+ stxvd2x vs44, o0, T1
+ stxvd2x vs45, o16, T1
+ stxvd2x vs46, o32, T1
+ stxvd2x vs47, o48, T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=2
+**********************************************************************************************/
+
+.macro COPY_4x2	// read 2 vectors from each of A0-A3 (advancing each by 32 bytes), write 128 contiguous bytes at BO
+
+ lxvd2x vs32, o0, A0
+ lxvd2x vs33, o16, A0
+ addi A0, A0, 32
+
+
+ lxvd2x vs34, o0, A1
+ lxvd2x vs35, o16, A1
+ addi A1, A1, 32
+
+
+ lxvd2x vs36, o0, A2
+ lxvd2x vs37, o16, A2
+ addi A2, A2, 32
+
+
+ lxvd2x vs38, o0, A3
+ lxvd2x vs39, o16, A3
+ addi A3, A3, 32
+
+
+ mr T1, BO	// store through T1 so BO is preserved
+
+ stxvd2x vs32, o0, T1
+ stxvd2x vs33, o16, T1
+
+ stxvd2x vs34, o32, T1
+ stxvd2x vs35, o48, T1
+
+ addi T1, T1, 64
+
+ stxvd2x vs36, o0, T1
+ stxvd2x vs37, o16, T1
+
+ stxvd2x vs38, o32, T1
+ stxvd2x vs39, o48, T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=1
+**********************************************************************************************/
+
+.macro COPY_4x1	// read 1 vector from each of A0-A3 (advancing each by 16 bytes), write 64 contiguous bytes at BO
+
+ lxvd2x vs32, o0, A0
+ addi A0, A0, 16
+
+
+ lxvd2x vs33, o0, A1
+ addi A1, A1, 16
+
+
+ lxvd2x vs34, o0, A2
+ addi A2, A2, 16
+
+
+ lxvd2x vs35, o0, A3
+ addi A3, A3, 16
+
+
+ mr T1, BO	// store through T1 so BO is preserved
+
+ stxvd2x vs32, o0, T1
+
+ stxvd2x vs33, o16, T1
+
+ stxvd2x vs34, o32, T1
+
+ stxvd2x vs35, o48, T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
+
+.macro COPY_2x8	// read 8 vectors from each of A0 and A1 (advancing each by 128 bytes), write 256 contiguous bytes at BO
+
+ lxvd2x vs32, o0, A0
+ lxvd2x vs33, o16, A0
+ lxvd2x vs34, o32, A0
+ lxvd2x vs35, o48, A0
+ addi A0, A0, 64
+
+ lxvd2x vs36, o0, A0
+ lxvd2x vs37, o16, A0
+ lxvd2x vs38, o32, A0
+ lxvd2x vs39, o48, A0
+ addi A0, A0, 64
+
+
+ lxvd2x vs40, o0, A1
+ lxvd2x vs41, o16, A1
+ lxvd2x vs42, o32, A1
+ lxvd2x vs43, o48, A1
+ addi A1, A1, 64
+
+ lxvd2x vs44, o0, A1
+ lxvd2x vs45, o16, A1
+ lxvd2x vs46, o32, A1
+ lxvd2x vs47, o48, A1
+ addi A1, A1, 64
+
+
+ mr T1, BO	// store through T1 so BO is preserved
+
+ stxvd2x vs32, o0, T1
+ stxvd2x vs33, o16, T1
+ stxvd2x vs34, o32, T1
+ stxvd2x vs35, o48, T1
+ addi T1, T1, 64
+
+ stxvd2x vs36, o0, T1
+ stxvd2x vs37, o16, T1
+ stxvd2x vs38, o32, T1
+ stxvd2x vs39, o48, T1
+
+ addi T1, T1, 64
+
+ stxvd2x vs40, o0, T1
+ stxvd2x vs41, o16, T1
+ stxvd2x vs42, o32, T1
+ stxvd2x vs43, o48, T1
+ addi T1, T1, 64
+
+ stxvd2x vs44, o0, T1
+ stxvd2x vs45, o16, T1
+ stxvd2x vs46, o32, T1
+ stxvd2x vs47, o48, T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=4
+**********************************************************************************************/
+
+.macro COPY_2x4	// read 4 vectors from each of A0 and A1 (advancing each by 64 bytes), write 128 contiguous bytes at BO
+
+ lxvd2x vs32, o0, A0
+ lxvd2x vs33, o16, A0
+ lxvd2x vs34, o32, A0
+ lxvd2x vs35, o48, A0
+ addi A0, A0, 64
+
+
+ lxvd2x vs36, o0, A1
+ lxvd2x vs37, o16, A1
+ lxvd2x vs38, o32, A1
+ lxvd2x vs39, o48, A1
+ addi A1, A1, 64
+
+
+ mr T1, BO	// store through T1 so BO is preserved
+
+ stxvd2x vs32, o0, T1
+ stxvd2x vs33, o16, T1
+ stxvd2x vs34, o32, T1
+ stxvd2x vs35, o48, T1
+
+ addi T1, T1, 64
+
+ stxvd2x vs36, o0, T1
+ stxvd2x vs37, o16, T1
+ stxvd2x vs38, o32, T1
+ stxvd2x vs39, o48, T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=2
+**********************************************************************************************/
+
+.macro COPY_2x2	// read 2 vectors from each of A0 and A1 (advancing each by 32 bytes), write 64 contiguous bytes at BO
+
+ lxvd2x vs32, o0, A0
+ lxvd2x vs33, o16, A0
+ addi A0, A0, 32
+
+
+ lxvd2x vs34, o0, A1
+ lxvd2x vs35, o16, A1
+ addi A1, A1, 32
+
+
+ mr T1, BO	// store through T1 so BO is preserved
+
+ stxvd2x vs32, o0, T1
+ stxvd2x vs33, o16, T1
+
+ stxvd2x vs34, o32, T1
+ stxvd2x vs35, o48, T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=1
+**********************************************************************************************/
+
+.macro COPY_2x1	// read 1 vector from each of A0 and A1 (advancing each by 16 bytes), write 32 contiguous bytes at BO
+
+ lxvd2x vs32, o0, A0
+ addi A0, A0, 16
+
+
+ lxvd2x vs33, o0, A1
+ addi A1, A1, 16
+
+
+ mr T1, BO	// store through T1 so BO is preserved
+
+ stxvd2x vs32, o0, T1
+
+ stxvd2x vs33, o16, T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=8
+**********************************************************************************************/
+
+.macro COPY_1x8	// read 8 vectors from A0 (advancing it by 128 bytes), write 128 contiguous bytes at BO
+
+ lxvd2x vs32, o0, A0
+ lxvd2x vs33, o16, A0
+ lxvd2x vs34, o32, A0
+ lxvd2x vs35, o48, A0
+ addi A0, A0, 64
+
+ lxvd2x vs36, o0, A0
+ lxvd2x vs37, o16, A0
+ lxvd2x vs38, o32, A0
+ lxvd2x vs39, o48, A0
+ addi A0, A0, 64
+
+
+ mr T1, BO	// store through T1 so BO is preserved
+
+ stxvd2x vs32, o0, T1
+ stxvd2x vs33, o16, T1
+ stxvd2x vs34, o32, T1
+ stxvd2x vs35, o48, T1
+ addi T1, T1, 64
+
+ stxvd2x vs36, o0, T1
+ stxvd2x vs37, o16, T1
+ stxvd2x vs38, o32, T1
+ stxvd2x vs39, o48, T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=4
+**********************************************************************************************/
+
+.macro COPY_1x4	// read 4 vectors from A0 (advancing it by 64 bytes), write 64 contiguous bytes at BO
+
+ lxvd2x vs32, o0, A0
+ lxvd2x vs33, o16, A0
+ lxvd2x vs34, o32, A0
+ lxvd2x vs35, o48, A0
+ addi A0, A0, 64
+
+
+ mr T1, BO	// store through T1 so BO is preserved
+
+ stxvd2x vs32, o0, T1
+ stxvd2x vs33, o16, T1
+ stxvd2x vs34, o32, T1
+ stxvd2x vs35, o48, T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=2
+**********************************************************************************************/
+
+.macro COPY_1x2	// read 2 vectors from A0 (advancing it by 32 bytes), write 32 contiguous bytes at BO
+
+ lxvd2x vs32, o0, A0
+ lxvd2x vs33, o16, A0
+ addi A0, A0, 32
+
+
+ mr T1, BO	// store through T1 so BO is preserved
+
+ stxvd2x vs32, o0, T1
+ stxvd2x vs33, o16, T1
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=1
+**********************************************************************************************/
+
+.macro COPY_1x1	// read 1 vector from A0 (advancing it by 16 bytes), write 16 bytes at BO
+
+ lxvd2x vs32, o0, A0
+ addi A0, A0, 16
+
+
+ mr T1, BO	// store through T1 so BO is preserved
+
+ stxvd2x vs32, o0, T1
+
+.endm
+
diff --git a/param.h b/param.h
index 8ecc812dc..0a9f02fde 100644
--- a/param.h
+++ b/param.h
@@ -1985,7 +1985,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SGEMM_DEFAULT_Q 720
 #define DGEMM_DEFAULT_Q 720
 #define CGEMM_DEFAULT_Q 720
-#define ZGEMM_DEFAULT_Q 720
+#define ZGEMM_DEFAULT_Q 360

 #define SYMV_P 8
