diff --git a/kernel/power/dgemm_kernel_16x4_power8.S b/kernel/power/dgemm_kernel_16x4_power8.S index 53205ade8..c67f31160 100644 --- a/kernel/power/dgemm_kernel_16x4_power8.S +++ b/kernel/power/dgemm_kernel_16x4_power8.S @@ -1,3 +1,38 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/05 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ @@ -218,11 +253,11 @@ cmpwi cr0, M, 0 - ble L999_H1 + ble .L999_H1 /* M <= 0 (signed compare): branch to .L999_H1, a label provided by the included dgemm_logic file */ cmpwi cr0, N, 0 - ble L999_H1 + ble .L999_H1 /* N <= 0: same early branch */ cmpwi cr0, K, 0 - ble L999_H1 + ble .L999_H1 /* K <= 0: same early branch */ #ifdef __64BIT__ addi ALPHA, SP, 296 @@ -241,7 +276,7 @@ #include "dgemm_logic_16x4_power8.S" -L999: +.L999: /* .L prefix marks a non-exported local label. Exit path: r3 = 0 (addi with rA = 0 loads the immediate), then f14 is reloaded from 0(SP); the rest of the epilogue is outside this hunk */ addi r3, 0, 0 lfd f14, 0(SP) diff --git a/kernel/power/dgemm_logic_16x4_power8.S b/kernel/power/dgemm_logic_16x4_power8.S index e19f78b8d..49c438f61 100644 --- a/kernel/power/dgemm_logic_16x4_power8.S +++ b/kernel/power/dgemm_logic_16x4_power8.S @@ -1,25 +1,61 @@ - srawi. J, N, 2 - ble DGEMM_L4_END +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ -DGEMM_L4_BEGIN: +/************************************************************************************** +* 2016/03/05 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + srawi. J, N, 2 + ble .LDGEMM_L4_END + +.LDGEMM_L4_BEGIN: mr CO, C mr AO, A slwi T1, LDC , 2 add C, C, T1 srawi. I, M, 4 - ble DGEMM_L4x16_END + ble .LDGEMM_L4x16_END -DGEMM_L4x16_BEGIN: +.LDGEMM_L4x16_BEGIN: mr BO, B srawi. L, K, 3 - ble DGEMM_L4x16_SUB0 + ble .LDGEMM_L4x16_SUB0 cmpwi cr0, L, 1 - ble DGEMM_L4x16_SUB4 + ble .LDGEMM_L4x16_SUB4 -DGEMM_L4x16_LOOP_START: +.LDGEMM_L4x16_LOOP_START: dcbt AO, PRE LOAD4x16_1 @@ -42,11 +78,11 @@ DGEMM_L4x16_LOOP_START: KERNEL4x16_2 addic. L, L, -2 - ble DGEMM_L4x16_LOOP_END + ble .LDGEMM_L4x16_LOOP_END .align 5 -DGEMM_L4x16_LOOP: +.LDGEMM_L4x16_LOOP: dcbt AO, PRE KERNEL4x16_1 @@ -67,9 +103,9 @@ DGEMM_L4x16_LOOP: KERNEL4x16_2 addic. 
L, L, -1 - bgt DGEMM_L4x16_LOOP + bgt .LDGEMM_L4x16_LOOP -DGEMM_L4x16_LOOP_END: +.LDGEMM_L4x16_LOOP_END: dcbt AO, PRE KERNEL4x16_1 @@ -88,9 +124,9 @@ DGEMM_L4x16_LOOP_END: KERNEL4x16_1 KERNEL4x16_E2 - b DGEMM_L4x16_SUB1 + b .LDGEMM_L4x16_SUB1 -DGEMM_L4x16_SUB4: +.LDGEMM_L4x16_SUB4: dcbt AO, PRE KERNEL4x16_SUBI1 @@ -106,53 +142,53 @@ DGEMM_L4x16_SUB4: KERNEL4x16_SUB1 KERNEL4x16_SUB1 - b DGEMM_L4x16_SUB1 + b .LDGEMM_L4x16_SUB1 -DGEMM_L4x16_SUB0: +.LDGEMM_L4x16_SUB0: andi. L, K, 7 KERNEL4x16_SUBI1 addic. L, L, -1 - ble DGEMM_L4x16_SAVE - b DGEMM_L4x16_SUB2 + ble .LDGEMM_L4x16_SAVE + b .LDGEMM_L4x16_SUB2 -DGEMM_L4x16_SUB1: +.LDGEMM_L4x16_SUB1: andi. L, K, 7 - ble DGEMM_L4x16_SAVE + ble .LDGEMM_L4x16_SAVE -DGEMM_L4x16_SUB2: +.LDGEMM_L4x16_SUB2: KERNEL4x16_SUB1 addic. L, L, -1 - bgt DGEMM_L4x16_SUB2 + bgt .LDGEMM_L4x16_SUB2 -DGEMM_L4x16_SAVE: +.LDGEMM_L4x16_SAVE: SAVE4x16 addic. I, I, -1 - bgt DGEMM_L4x16_BEGIN + bgt .LDGEMM_L4x16_BEGIN -DGEMM_L4x16_END: +.LDGEMM_L4x16_END: -DGEMM_L4x8_BEGIN: +.LDGEMM_L4x8_BEGIN: andi. T2, M, 15 - ble DGEMM_L4x1_END + ble .LDGEMM_L4x1_END andi. T1, M, 8 - ble DGEMM_L4x8_END + ble .LDGEMM_L4x8_END mr BO, B srawi. L, K, 3 - ble DGEMM_L4x8_SUB0 + ble .LDGEMM_L4x8_SUB0 cmpwi cr0, L, 1 - ble DGEMM_L4x8_SUB4 + ble .LDGEMM_L4x8_SUB4 -DGEMM_L4x8_LOOP_START: +.LDGEMM_L4x8_LOOP_START: LOAD4x8_1 KERNEL4x8_I1 @@ -166,11 +202,11 @@ DGEMM_L4x8_LOOP_START: KERNEL4x8_2 addic. L, L, -2 - ble DGEMM_L4x8_LOOP_END + ble .LDGEMM_L4x8_LOOP_END .align 5 -DGEMM_L4x8_LOOP: +.LDGEMM_L4x8_LOOP: KERNEL4x8_1 KERNEL4x8_2 @@ -183,9 +219,9 @@ DGEMM_L4x8_LOOP: KERNEL4x8_2 addic. 
L, L, -1 - bgt DGEMM_L4x8_LOOP + bgt .LDGEMM_L4x8_LOOP -DGEMM_L4x8_LOOP_END: +.LDGEMM_L4x8_LOOP_END: KERNEL4x8_1 KERNEL4x8_2 @@ -197,9 +233,9 @@ DGEMM_L4x8_LOOP_END: KERNEL4x8_1 KERNEL4x8_E2 - b DGEMM_L4x8_SUB1 + b .LDGEMM_L4x8_SUB1 -DGEMM_L4x8_SUB4: +.LDGEMM_L4x8_SUB4: KERNEL4x8_SUBI1 KERNEL4x8_SUB1 @@ -211,48 +247,48 @@ DGEMM_L4x8_SUB4: KERNEL4x8_SUB1 KERNEL4x8_SUB1 - b DGEMM_L4x8_SUB1 + b .LDGEMM_L4x8_SUB1 -DGEMM_L4x8_SUB0: +.LDGEMM_L4x8_SUB0: andi. L, K, 7 KERNEL4x8_SUBI1 addic. L, L, -1 - ble DGEMM_L4x8_SAVE - b DGEMM_L4x8_SUB2 + ble .LDGEMM_L4x8_SAVE + b .LDGEMM_L4x8_SUB2 -DGEMM_L4x8_SUB1: +.LDGEMM_L4x8_SUB1: andi. L, K, 7 - ble DGEMM_L4x8_SAVE + ble .LDGEMM_L4x8_SAVE -DGEMM_L4x8_SUB2: +.LDGEMM_L4x8_SUB2: KERNEL4x8_SUB1 addic. L, L, -1 - bgt DGEMM_L4x8_SUB2 + bgt .LDGEMM_L4x8_SUB2 -DGEMM_L4x8_SAVE: +.LDGEMM_L4x8_SAVE: SAVE4x8 -DGEMM_L4x8_END: +.LDGEMM_L4x8_END: -DGEMM_L4x4_BEGIN: +.LDGEMM_L4x4_BEGIN: andi. T1, M, 4 - ble DGEMM_L4x4_END + ble .LDGEMM_L4x4_END mr BO, B srawi. L, K, 3 - ble DGEMM_L4x4_SUB0 + ble .LDGEMM_L4x4_SUB0 cmpwi cr0, L, 1 - ble DGEMM_L4x4_SUB4 + ble .LDGEMM_L4x4_SUB4 -DGEMM_L4x4_LOOP_START: +.LDGEMM_L4x4_LOOP_START: LOAD4x4_1 KERNEL4x4_I1 @@ -266,11 +302,11 @@ DGEMM_L4x4_LOOP_START: KERNEL4x4_2 addic. L, L, -2 - ble DGEMM_L4x4_LOOP_END + ble .LDGEMM_L4x4_LOOP_END .align 5 -DGEMM_L4x4_LOOP: +.LDGEMM_L4x4_LOOP: KERNEL4x4_1 KERNEL4x4_2 @@ -283,9 +319,9 @@ DGEMM_L4x4_LOOP: KERNEL4x4_2 addic. L, L, -1 - bgt DGEMM_L4x4_LOOP + bgt .LDGEMM_L4x4_LOOP -DGEMM_L4x4_LOOP_END: +.LDGEMM_L4x4_LOOP_END: KERNEL4x4_1 KERNEL4x4_2 @@ -297,9 +333,9 @@ DGEMM_L4x4_LOOP_END: KERNEL4x4_1 KERNEL4x4_E2 - b DGEMM_L4x4_SUB1 + b .LDGEMM_L4x4_SUB1 -DGEMM_L4x4_SUB4: +.LDGEMM_L4x4_SUB4: KERNEL4x4_SUBI1 KERNEL4x4_SUB1 @@ -311,48 +347,48 @@ DGEMM_L4x4_SUB4: KERNEL4x4_SUB1 KERNEL4x4_SUB1 - b DGEMM_L4x4_SUB1 + b .LDGEMM_L4x4_SUB1 -DGEMM_L4x4_SUB0: +.LDGEMM_L4x4_SUB0: andi. L, K, 7 KERNEL4x4_SUBI1 addic. 
L, L, -1 - ble DGEMM_L4x4_SAVE - b DGEMM_L4x4_SUB2 + ble .LDGEMM_L4x4_SAVE + b .LDGEMM_L4x4_SUB2 -DGEMM_L4x4_SUB1: +.LDGEMM_L4x4_SUB1: andi. L, K, 7 - ble DGEMM_L4x4_SAVE + ble .LDGEMM_L4x4_SAVE -DGEMM_L4x4_SUB2: +.LDGEMM_L4x4_SUB2: KERNEL4x4_SUB1 addic. L, L, -1 - bgt DGEMM_L4x4_SUB2 + bgt .LDGEMM_L4x4_SUB2 -DGEMM_L4x4_SAVE: +.LDGEMM_L4x4_SAVE: SAVE4x4 -DGEMM_L4x4_END: +.LDGEMM_L4x4_END: -DGEMM_L4x2_BEGIN: +.LDGEMM_L4x2_BEGIN: andi. T1, M, 2 - ble DGEMM_L4x2_END + ble .LDGEMM_L4x2_END mr BO, B srawi. L, K, 3 - ble DGEMM_L4x2_SUB0 + ble .LDGEMM_L4x2_SUB0 cmpwi cr0, L, 1 - ble DGEMM_L4x2_SUB4 + ble .LDGEMM_L4x2_SUB4 -DGEMM_L4x2_LOOP_START: +.LDGEMM_L4x2_LOOP_START: LOAD4x2_1 KERNEL4x2_I1 @@ -366,11 +402,11 @@ DGEMM_L4x2_LOOP_START: KERNEL4x2_2 addic. L, L, -2 - ble DGEMM_L4x2_LOOP_END + ble .LDGEMM_L4x2_LOOP_END .align 5 -DGEMM_L4x2_LOOP: +.LDGEMM_L4x2_LOOP: KERNEL4x2_1 KERNEL4x2_2 @@ -383,9 +419,9 @@ DGEMM_L4x2_LOOP: KERNEL4x2_2 addic. L, L, -1 - bgt DGEMM_L4x2_LOOP + bgt .LDGEMM_L4x2_LOOP -DGEMM_L4x2_LOOP_END: +.LDGEMM_L4x2_LOOP_END: KERNEL4x2_1 KERNEL4x2_2 @@ -397,9 +433,9 @@ DGEMM_L4x2_LOOP_END: KERNEL4x2_1 KERNEL4x2_E2 - b DGEMM_L4x2_SUB1 + b .LDGEMM_L4x2_SUB1 -DGEMM_L4x2_SUB4: +.LDGEMM_L4x2_SUB4: KERNEL4x2_SUBI1 KERNEL4x2_SUB1 @@ -411,48 +447,48 @@ DGEMM_L4x2_SUB4: KERNEL4x2_SUB1 KERNEL4x2_SUB1 - b DGEMM_L4x2_SUB1 + b .LDGEMM_L4x2_SUB1 -DGEMM_L4x2_SUB0: +.LDGEMM_L4x2_SUB0: andi. L, K, 7 KERNEL4x2_SUBI1 addic. L, L, -1 - ble DGEMM_L4x2_SAVE - b DGEMM_L4x2_SUB2 + ble .LDGEMM_L4x2_SAVE + b .LDGEMM_L4x2_SUB2 -DGEMM_L4x2_SUB1: +.LDGEMM_L4x2_SUB1: andi. L, K, 7 - ble DGEMM_L4x2_SAVE + ble .LDGEMM_L4x2_SAVE -DGEMM_L4x2_SUB2: +.LDGEMM_L4x2_SUB2: KERNEL4x2_SUB1 addic. L, L, -1 - bgt DGEMM_L4x2_SUB2 + bgt .LDGEMM_L4x2_SUB2 -DGEMM_L4x2_SAVE: +.LDGEMM_L4x2_SAVE: SAVE4x2 -DGEMM_L4x2_END: +.LDGEMM_L4x2_END: -DGEMM_L4x1_BEGIN: +.LDGEMM_L4x1_BEGIN: andi. T1, M, 1 - ble DGEMM_L4x1_END + ble .LDGEMM_L4x1_END mr BO, B srawi. 
L, K, 3 - ble DGEMM_L4x1_SUB0 + ble .LDGEMM_L4x1_SUB0 cmpwi cr0, L, 1 - ble DGEMM_L4x1_SUB4 + ble .LDGEMM_L4x1_SUB4 -DGEMM_L4x1_LOOP_START: +.LDGEMM_L4x1_LOOP_START: LOAD4x1_1 KERNEL4x1_I1 @@ -466,11 +502,11 @@ DGEMM_L4x1_LOOP_START: KERNEL4x1_2 addic. L, L, -2 - ble DGEMM_L4x1_LOOP_END + ble .LDGEMM_L4x1_LOOP_END .align 5 -DGEMM_L4x1_LOOP: +.LDGEMM_L4x1_LOOP: KERNEL4x1_1 KERNEL4x1_2 @@ -483,9 +519,9 @@ DGEMM_L4x1_LOOP: KERNEL4x1_2 addic. L, L, -1 - bgt DGEMM_L4x1_LOOP + bgt .LDGEMM_L4x1_LOOP -DGEMM_L4x1_LOOP_END: +.LDGEMM_L4x1_LOOP_END: KERNEL4x1_1 KERNEL4x1_2 @@ -497,9 +533,9 @@ DGEMM_L4x1_LOOP_END: KERNEL4x1_1 KERNEL4x1_E2 - b DGEMM_L4x1_SUB1 + b .LDGEMM_L4x1_SUB1 -DGEMM_L4x1_SUB4: +.LDGEMM_L4x1_SUB4: KERNEL4x1_SUBI1 KERNEL4x1_SUB1 @@ -511,74 +547,74 @@ DGEMM_L4x1_SUB4: KERNEL4x1_SUB1 KERNEL4x1_SUB1 - b DGEMM_L4x1_SUB1 + b .LDGEMM_L4x1_SUB1 -DGEMM_L4x1_SUB0: +.LDGEMM_L4x1_SUB0: andi. L, K, 7 KERNEL4x1_SUBI1 addic. L, L, -1 - ble DGEMM_L4x1_SAVE - b DGEMM_L4x1_SUB2 + ble .LDGEMM_L4x1_SAVE + b .LDGEMM_L4x1_SUB2 -DGEMM_L4x1_SUB1: +.LDGEMM_L4x1_SUB1: andi. L, K, 7 - ble DGEMM_L4x1_SAVE + ble .LDGEMM_L4x1_SAVE -DGEMM_L4x1_SUB2: +.LDGEMM_L4x1_SUB2: KERNEL4x1_SUB1 addic. L, L, -1 - bgt DGEMM_L4x1_SUB2 + bgt .LDGEMM_L4x1_SUB2 -DGEMM_L4x1_SAVE: +.LDGEMM_L4x1_SAVE: SAVE4x1 -DGEMM_L4x1_END: +.LDGEMM_L4x1_END: /* end of one L4 pass: B advances by K << 5 bytes, then J is decremented and the pass repeats while J > 0 */ slwi T1, K, 5 add B, B, T1 addic. J, J, -1 - bgt DGEMM_L4_BEGIN + bgt .LDGEMM_L4_BEGIN andi. T2, N, 3 - ble L999 + ble .L999 /* (N & 3) == 0: branch straight to .L999, skipping the L2 and L1 passes below */ -DGEMM_L4_END: +.LDGEMM_L4_END: - b DGEMM_L2_BEGIN + b .LDGEMM_L2_BEGIN /* jumps over the .L999_H1 trampoline that sits in between */ -L999_H1: +.L999_H1: /* trampoline: only forwards to .L999 */ - b L999 + b .L999 -DGEMM_L2_BEGIN: +.LDGEMM_L2_BEGIN: /* L2 pass is entered only when bit 1 of N is set (andi. with 2); otherwise control goes to .LDGEMM_L2_END */ andi. T1, N, 2 - ble DGEMM_L2_END + ble .LDGEMM_L2_END mr CO, C mr AO, A slwi T1, LDC , 1 add C, C, T1 srawi. I, M, 4 - ble DGEMM_L2x16_END + ble .LDGEMM_L2x16_END -DGEMM_L2x16_BEGIN: +.LDGEMM_L2x16_BEGIN: mr BO, B srawi.
L, K, 3 - ble DGEMM_L2x16_SUB0 + ble .LDGEMM_L2x16_SUB0 cmpwi cr0, L, 1 - ble DGEMM_L2x16_SUB4 + ble .LDGEMM_L2x16_SUB4 -DGEMM_L2x16_LOOP_START: +.LDGEMM_L2x16_LOOP_START: dcbt AO, PRE LOAD2x16_1 @@ -601,11 +637,11 @@ DGEMM_L2x16_LOOP_START: KERNEL2x16_2 addic. L, L, -2 - ble DGEMM_L2x16_LOOP_END + ble .LDGEMM_L2x16_LOOP_END .align 5 -DGEMM_L2x16_LOOP: +.LDGEMM_L2x16_LOOP: dcbt AO, PRE KERNEL2x16_1 @@ -626,9 +662,9 @@ DGEMM_L2x16_LOOP: KERNEL2x16_2 addic. L, L, -1 - bgt DGEMM_L2x16_LOOP + bgt .LDGEMM_L2x16_LOOP -DGEMM_L2x16_LOOP_END: +.LDGEMM_L2x16_LOOP_END: dcbt AO, PRE KERNEL2x16_1 @@ -647,9 +683,9 @@ DGEMM_L2x16_LOOP_END: KERNEL2x16_1 KERNEL2x16_E2 - b DGEMM_L2x16_SUB1 + b .LDGEMM_L2x16_SUB1 -DGEMM_L2x16_SUB4: +.LDGEMM_L2x16_SUB4: dcbt AO, PRE KERNEL2x16_SUBI1 @@ -665,53 +701,53 @@ DGEMM_L2x16_SUB4: KERNEL2x16_SUB1 KERNEL2x16_SUB1 - b DGEMM_L2x16_SUB1 + b .LDGEMM_L2x16_SUB1 -DGEMM_L2x16_SUB0: +.LDGEMM_L2x16_SUB0: andi. L, K, 7 KERNEL2x16_SUBI1 addic. L, L, -1 - ble DGEMM_L2x16_SAVE - b DGEMM_L2x16_SUB2 + ble .LDGEMM_L2x16_SAVE + b .LDGEMM_L2x16_SUB2 -DGEMM_L2x16_SUB1: +.LDGEMM_L2x16_SUB1: andi. L, K, 7 - ble DGEMM_L2x16_SAVE + ble .LDGEMM_L2x16_SAVE -DGEMM_L2x16_SUB2: +.LDGEMM_L2x16_SUB2: KERNEL2x16_SUB1 addic. L, L, -1 - bgt DGEMM_L2x16_SUB2 + bgt .LDGEMM_L2x16_SUB2 -DGEMM_L2x16_SAVE: +.LDGEMM_L2x16_SAVE: SAVE2x16 addic. I, I, -1 - bgt DGEMM_L2x16_BEGIN + bgt .LDGEMM_L2x16_BEGIN -DGEMM_L2x16_END: +.LDGEMM_L2x16_END: -DGEMM_L2x8_BEGIN: +.LDGEMM_L2x8_BEGIN: andi. T2, M, 15 - ble DGEMM_L2x1_END + ble .LDGEMM_L2x1_END andi. T1, M, 8 - ble DGEMM_L2x8_END + ble .LDGEMM_L2x8_END mr BO, B srawi. L, K, 3 - ble DGEMM_L2x8_SUB0 + ble .LDGEMM_L2x8_SUB0 cmpwi cr0, L, 1 - ble DGEMM_L2x8_SUB4 + ble .LDGEMM_L2x8_SUB4 -DGEMM_L2x8_LOOP_START: +.LDGEMM_L2x8_LOOP_START: LOAD2x8_1 KERNEL2x8_I1 @@ -725,11 +761,11 @@ DGEMM_L2x8_LOOP_START: KERNEL2x8_2 addic. 
L, L, -2 - ble DGEMM_L2x8_LOOP_END + ble .LDGEMM_L2x8_LOOP_END .align 5 -DGEMM_L2x8_LOOP: +.LDGEMM_L2x8_LOOP: KERNEL2x8_1 KERNEL2x8_2 @@ -742,9 +778,9 @@ DGEMM_L2x8_LOOP: KERNEL2x8_2 addic. L, L, -1 - bgt DGEMM_L2x8_LOOP + bgt .LDGEMM_L2x8_LOOP -DGEMM_L2x8_LOOP_END: +.LDGEMM_L2x8_LOOP_END: KERNEL2x8_1 KERNEL2x8_2 @@ -756,9 +792,9 @@ DGEMM_L2x8_LOOP_END: KERNEL2x8_1 KERNEL2x8_E2 - b DGEMM_L2x8_SUB1 + b .LDGEMM_L2x8_SUB1 -DGEMM_L2x8_SUB4: +.LDGEMM_L2x8_SUB4: KERNEL2x8_SUBI1 KERNEL2x8_SUB1 @@ -770,48 +806,48 @@ DGEMM_L2x8_SUB4: KERNEL2x8_SUB1 KERNEL2x8_SUB1 - b DGEMM_L2x8_SUB1 + b .LDGEMM_L2x8_SUB1 -DGEMM_L2x8_SUB0: +.LDGEMM_L2x8_SUB0: andi. L, K, 7 KERNEL2x8_SUBI1 addic. L, L, -1 - ble DGEMM_L2x8_SAVE - b DGEMM_L2x8_SUB2 + ble .LDGEMM_L2x8_SAVE + b .LDGEMM_L2x8_SUB2 -DGEMM_L2x8_SUB1: +.LDGEMM_L2x8_SUB1: andi. L, K, 7 - ble DGEMM_L2x8_SAVE + ble .LDGEMM_L2x8_SAVE -DGEMM_L2x8_SUB2: +.LDGEMM_L2x8_SUB2: KERNEL2x8_SUB1 addic. L, L, -1 - bgt DGEMM_L2x8_SUB2 + bgt .LDGEMM_L2x8_SUB2 -DGEMM_L2x8_SAVE: +.LDGEMM_L2x8_SAVE: SAVE2x8 -DGEMM_L2x8_END: +.LDGEMM_L2x8_END: -DGEMM_L2x4_BEGIN: +.LDGEMM_L2x4_BEGIN: andi. T1, M, 4 - ble DGEMM_L2x4_END + ble .LDGEMM_L2x4_END mr BO, B srawi. L, K, 3 - ble DGEMM_L2x4_SUB0 + ble .LDGEMM_L2x4_SUB0 cmpwi cr0, L, 1 - ble DGEMM_L2x4_SUB4 + ble .LDGEMM_L2x4_SUB4 -DGEMM_L2x4_LOOP_START: +.LDGEMM_L2x4_LOOP_START: LOAD2x4_1 KERNEL2x4_I1 @@ -825,11 +861,11 @@ DGEMM_L2x4_LOOP_START: KERNEL2x4_2 addic. L, L, -2 - ble DGEMM_L2x4_LOOP_END + ble .LDGEMM_L2x4_LOOP_END .align 5 -DGEMM_L2x4_LOOP: +.LDGEMM_L2x4_LOOP: KERNEL2x4_1 KERNEL2x4_2 @@ -842,9 +878,9 @@ DGEMM_L2x4_LOOP: KERNEL2x4_2 addic. 
L, L, -1 - bgt DGEMM_L2x4_LOOP + bgt .LDGEMM_L2x4_LOOP -DGEMM_L2x4_LOOP_END: +.LDGEMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_2 @@ -856,9 +892,9 @@ DGEMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_E2 - b DGEMM_L2x4_SUB1 + b .LDGEMM_L2x4_SUB1 -DGEMM_L2x4_SUB4: +.LDGEMM_L2x4_SUB4: KERNEL2x4_SUBI1 KERNEL2x4_SUB1 @@ -870,48 +906,48 @@ DGEMM_L2x4_SUB4: KERNEL2x4_SUB1 KERNEL2x4_SUB1 - b DGEMM_L2x4_SUB1 + b .LDGEMM_L2x4_SUB1 -DGEMM_L2x4_SUB0: +.LDGEMM_L2x4_SUB0: andi. L, K, 7 KERNEL2x4_SUBI1 addic. L, L, -1 - ble DGEMM_L2x4_SAVE - b DGEMM_L2x4_SUB2 + ble .LDGEMM_L2x4_SAVE + b .LDGEMM_L2x4_SUB2 -DGEMM_L2x4_SUB1: +.LDGEMM_L2x4_SUB1: andi. L, K, 7 - ble DGEMM_L2x4_SAVE + ble .LDGEMM_L2x4_SAVE -DGEMM_L2x4_SUB2: +.LDGEMM_L2x4_SUB2: KERNEL2x4_SUB1 addic. L, L, -1 - bgt DGEMM_L2x4_SUB2 + bgt .LDGEMM_L2x4_SUB2 -DGEMM_L2x4_SAVE: +.LDGEMM_L2x4_SAVE: SAVE2x4 -DGEMM_L2x4_END: +.LDGEMM_L2x4_END: -DGEMM_L2x2_BEGIN: +.LDGEMM_L2x2_BEGIN: andi. T1, M, 2 - ble DGEMM_L2x2_END + ble .LDGEMM_L2x2_END mr BO, B srawi. L, K, 3 - ble DGEMM_L2x2_SUB0 + ble .LDGEMM_L2x2_SUB0 cmpwi cr0, L, 1 - ble DGEMM_L2x2_SUB4 + ble .LDGEMM_L2x2_SUB4 -DGEMM_L2x2_LOOP_START: +.LDGEMM_L2x2_LOOP_START: LOAD2x2_1 KERNEL2x2_I1 @@ -925,11 +961,11 @@ DGEMM_L2x2_LOOP_START: KERNEL2x2_2 addic. L, L, -2 - ble DGEMM_L2x2_LOOP_END + ble .LDGEMM_L2x2_LOOP_END .align 5 -DGEMM_L2x2_LOOP: +.LDGEMM_L2x2_LOOP: KERNEL2x2_1 KERNEL2x2_2 @@ -942,9 +978,9 @@ DGEMM_L2x2_LOOP: KERNEL2x2_2 addic. L, L, -1 - bgt DGEMM_L2x2_LOOP + bgt .LDGEMM_L2x2_LOOP -DGEMM_L2x2_LOOP_END: +.LDGEMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_2 @@ -956,9 +992,9 @@ DGEMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_E2 - b DGEMM_L2x2_SUB1 + b .LDGEMM_L2x2_SUB1 -DGEMM_L2x2_SUB4: +.LDGEMM_L2x2_SUB4: KERNEL2x2_SUBI1 KERNEL2x2_SUB1 @@ -970,48 +1006,48 @@ DGEMM_L2x2_SUB4: KERNEL2x2_SUB1 KERNEL2x2_SUB1 - b DGEMM_L2x2_SUB1 + b .LDGEMM_L2x2_SUB1 -DGEMM_L2x2_SUB0: +.LDGEMM_L2x2_SUB0: andi. L, K, 7 KERNEL2x2_SUBI1 addic. 
L, L, -1 - ble DGEMM_L2x2_SAVE - b DGEMM_L2x2_SUB2 + ble .LDGEMM_L2x2_SAVE + b .LDGEMM_L2x2_SUB2 -DGEMM_L2x2_SUB1: +.LDGEMM_L2x2_SUB1: andi. L, K, 7 - ble DGEMM_L2x2_SAVE + ble .LDGEMM_L2x2_SAVE -DGEMM_L2x2_SUB2: +.LDGEMM_L2x2_SUB2: KERNEL2x2_SUB1 addic. L, L, -1 - bgt DGEMM_L2x2_SUB2 + bgt .LDGEMM_L2x2_SUB2 -DGEMM_L2x2_SAVE: +.LDGEMM_L2x2_SAVE: SAVE2x2 -DGEMM_L2x2_END: +.LDGEMM_L2x2_END: -DGEMM_L2x1_BEGIN: +.LDGEMM_L2x1_BEGIN: andi. T1, M, 1 - ble DGEMM_L2x1_END + ble .LDGEMM_L2x1_END mr BO, B srawi. L, K, 3 - ble DGEMM_L2x1_SUB0 + ble .LDGEMM_L2x1_SUB0 cmpwi cr0, L, 1 - ble DGEMM_L2x1_SUB4 + ble .LDGEMM_L2x1_SUB4 -DGEMM_L2x1_LOOP_START: +.LDGEMM_L2x1_LOOP_START: LOAD2x1_1 KERNEL2x1_I1 @@ -1025,11 +1061,11 @@ DGEMM_L2x1_LOOP_START: KERNEL2x1_2 addic. L, L, -2 - ble DGEMM_L2x1_LOOP_END + ble .LDGEMM_L2x1_LOOP_END .align 5 -DGEMM_L2x1_LOOP: +.LDGEMM_L2x1_LOOP: KERNEL2x1_1 KERNEL2x1_2 @@ -1042,9 +1078,9 @@ DGEMM_L2x1_LOOP: KERNEL2x1_2 addic. L, L, -1 - bgt DGEMM_L2x1_LOOP + bgt .LDGEMM_L2x1_LOOP -DGEMM_L2x1_LOOP_END: +.LDGEMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_2 @@ -1056,9 +1092,9 @@ DGEMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_E2 - b DGEMM_L2x1_SUB1 + b .LDGEMM_L2x1_SUB1 -DGEMM_L2x1_SUB4: +.LDGEMM_L2x1_SUB4: KERNEL2x1_SUBI1 KERNEL2x1_SUB1 @@ -1070,59 +1106,59 @@ DGEMM_L2x1_SUB4: KERNEL2x1_SUB1 KERNEL2x1_SUB1 - b DGEMM_L2x1_SUB1 + b .LDGEMM_L2x1_SUB1 -DGEMM_L2x1_SUB0: +.LDGEMM_L2x1_SUB0: andi. L, K, 7 KERNEL2x1_SUBI1 addic. L, L, -1 - ble DGEMM_L2x1_SAVE - b DGEMM_L2x1_SUB2 + ble .LDGEMM_L2x1_SAVE + b .LDGEMM_L2x1_SUB2 -DGEMM_L2x1_SUB1: +.LDGEMM_L2x1_SUB1: andi. L, K, 7 - ble DGEMM_L2x1_SAVE + ble .LDGEMM_L2x1_SAVE -DGEMM_L2x1_SUB2: +.LDGEMM_L2x1_SUB2: KERNEL2x1_SUB1 addic. L, L, -1 - bgt DGEMM_L2x1_SUB2 + bgt .LDGEMM_L2x1_SUB2 -DGEMM_L2x1_SAVE: +.LDGEMM_L2x1_SAVE: SAVE2x1 -DGEMM_L2x1_END: +.LDGEMM_L2x1_END: slwi T1, K, 4 add B, B, T1 -DGEMM_L2_END: -DGEMM_L1_BEGIN: +.LDGEMM_L2_END: +.LDGEMM_L1_BEGIN: andi. 
T1, N, 1 - ble DGEMM_L1_END + ble .LDGEMM_L1_END mr CO, C mr AO, A srawi. I, M, 4 - ble DGEMM_L1x16_END + ble .LDGEMM_L1x16_END -DGEMM_L1x16_BEGIN: +.LDGEMM_L1x16_BEGIN: mr BO, B srawi. L, K, 3 - ble DGEMM_L1x16_SUB0 + ble .LDGEMM_L1x16_SUB0 cmpwi cr0, L, 1 - ble DGEMM_L1x16_SUB4 + ble .LDGEMM_L1x16_SUB4 -DGEMM_L1x16_LOOP_START: +.LDGEMM_L1x16_LOOP_START: dcbt AO, PRE LOAD1x16_1 @@ -1145,11 +1181,11 @@ DGEMM_L1x16_LOOP_START: KERNEL1x16_2 addic. L, L, -2 - ble DGEMM_L1x16_LOOP_END + ble .LDGEMM_L1x16_LOOP_END .align 5 -DGEMM_L1x16_LOOP: +.LDGEMM_L1x16_LOOP: dcbt AO, PRE KERNEL1x16_1 @@ -1170,9 +1206,9 @@ DGEMM_L1x16_LOOP: KERNEL1x16_2 addic. L, L, -1 - bgt DGEMM_L1x16_LOOP + bgt .LDGEMM_L1x16_LOOP -DGEMM_L1x16_LOOP_END: +.LDGEMM_L1x16_LOOP_END: dcbt AO, PRE KERNEL1x16_1 @@ -1191,9 +1227,9 @@ DGEMM_L1x16_LOOP_END: KERNEL1x16_1 KERNEL1x16_E2 - b DGEMM_L1x16_SUB1 + b .LDGEMM_L1x16_SUB1 -DGEMM_L1x16_SUB4: +.LDGEMM_L1x16_SUB4: dcbt AO, PRE KERNEL1x16_SUBI1 @@ -1209,53 +1245,53 @@ DGEMM_L1x16_SUB4: KERNEL1x16_SUB1 KERNEL1x16_SUB1 - b DGEMM_L1x16_SUB1 + b .LDGEMM_L1x16_SUB1 -DGEMM_L1x16_SUB0: +.LDGEMM_L1x16_SUB0: andi. L, K, 7 KERNEL1x16_SUBI1 addic. L, L, -1 - ble DGEMM_L1x16_SAVE - b DGEMM_L1x16_SUB2 + ble .LDGEMM_L1x16_SAVE + b .LDGEMM_L1x16_SUB2 -DGEMM_L1x16_SUB1: +.LDGEMM_L1x16_SUB1: andi. L, K, 7 - ble DGEMM_L1x16_SAVE + ble .LDGEMM_L1x16_SAVE -DGEMM_L1x16_SUB2: +.LDGEMM_L1x16_SUB2: KERNEL1x16_SUB1 addic. L, L, -1 - bgt DGEMM_L1x16_SUB2 + bgt .LDGEMM_L1x16_SUB2 -DGEMM_L1x16_SAVE: +.LDGEMM_L1x16_SAVE: SAVE1x16 addic. I, I, -1 - bgt DGEMM_L1x16_BEGIN + bgt .LDGEMM_L1x16_BEGIN -DGEMM_L1x16_END: +.LDGEMM_L1x16_END: -DGEMM_L1x8_BEGIN: +.LDGEMM_L1x8_BEGIN: andi. T2, M, 15 - ble DGEMM_L1x1_END + ble .LDGEMM_L1x1_END andi. T1, M, 8 - ble DGEMM_L1x8_END + ble .LDGEMM_L1x8_END mr BO, B srawi. 
L, K, 3 - ble DGEMM_L1x8_SUB0 + ble .LDGEMM_L1x8_SUB0 cmpwi cr0, L, 1 - ble DGEMM_L1x8_SUB4 + ble .LDGEMM_L1x8_SUB4 -DGEMM_L1x8_LOOP_START: +.LDGEMM_L1x8_LOOP_START: LOAD1x8_1 KERNEL1x8_I1 @@ -1269,11 +1305,11 @@ DGEMM_L1x8_LOOP_START: KERNEL1x8_2 addic. L, L, -2 - ble DGEMM_L1x8_LOOP_END + ble .LDGEMM_L1x8_LOOP_END .align 5 -DGEMM_L1x8_LOOP: +.LDGEMM_L1x8_LOOP: KERNEL1x8_1 KERNEL1x8_2 @@ -1286,9 +1322,9 @@ DGEMM_L1x8_LOOP: KERNEL1x8_2 addic. L, L, -1 - bgt DGEMM_L1x8_LOOP + bgt .LDGEMM_L1x8_LOOP -DGEMM_L1x8_LOOP_END: +.LDGEMM_L1x8_LOOP_END: KERNEL1x8_1 KERNEL1x8_2 @@ -1300,9 +1336,9 @@ DGEMM_L1x8_LOOP_END: KERNEL1x8_1 KERNEL1x8_E2 - b DGEMM_L1x8_SUB1 + b .LDGEMM_L1x8_SUB1 -DGEMM_L1x8_SUB4: +.LDGEMM_L1x8_SUB4: KERNEL1x8_SUBI1 KERNEL1x8_SUB1 @@ -1314,48 +1350,48 @@ DGEMM_L1x8_SUB4: KERNEL1x8_SUB1 KERNEL1x8_SUB1 - b DGEMM_L1x8_SUB1 + b .LDGEMM_L1x8_SUB1 -DGEMM_L1x8_SUB0: +.LDGEMM_L1x8_SUB0: andi. L, K, 7 KERNEL1x8_SUBI1 addic. L, L, -1 - ble DGEMM_L1x8_SAVE - b DGEMM_L1x8_SUB2 + ble .LDGEMM_L1x8_SAVE + b .LDGEMM_L1x8_SUB2 -DGEMM_L1x8_SUB1: +.LDGEMM_L1x8_SUB1: andi. L, K, 7 - ble DGEMM_L1x8_SAVE + ble .LDGEMM_L1x8_SAVE -DGEMM_L1x8_SUB2: +.LDGEMM_L1x8_SUB2: KERNEL1x8_SUB1 addic. L, L, -1 - bgt DGEMM_L1x8_SUB2 + bgt .LDGEMM_L1x8_SUB2 -DGEMM_L1x8_SAVE: +.LDGEMM_L1x8_SAVE: SAVE1x8 -DGEMM_L1x8_END: +.LDGEMM_L1x8_END: -DGEMM_L1x4_BEGIN: +.LDGEMM_L1x4_BEGIN: andi. T1, M, 4 - ble DGEMM_L1x4_END + ble .LDGEMM_L1x4_END mr BO, B srawi. L, K, 3 - ble DGEMM_L1x4_SUB0 + ble .LDGEMM_L1x4_SUB0 cmpwi cr0, L, 1 - ble DGEMM_L1x4_SUB4 + ble .LDGEMM_L1x4_SUB4 -DGEMM_L1x4_LOOP_START: +.LDGEMM_L1x4_LOOP_START: LOAD1x4_1 KERNEL1x4_I1 @@ -1369,11 +1405,11 @@ DGEMM_L1x4_LOOP_START: KERNEL1x4_2 addic. L, L, -2 - ble DGEMM_L1x4_LOOP_END + ble .LDGEMM_L1x4_LOOP_END .align 5 -DGEMM_L1x4_LOOP: +.LDGEMM_L1x4_LOOP: KERNEL1x4_1 KERNEL1x4_2 @@ -1386,9 +1422,9 @@ DGEMM_L1x4_LOOP: KERNEL1x4_2 addic. 
L, L, -1 - bgt DGEMM_L1x4_LOOP + bgt .LDGEMM_L1x4_LOOP -DGEMM_L1x4_LOOP_END: +.LDGEMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_2 @@ -1400,9 +1436,9 @@ DGEMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_E2 - b DGEMM_L1x4_SUB1 + b .LDGEMM_L1x4_SUB1 -DGEMM_L1x4_SUB4: +.LDGEMM_L1x4_SUB4: KERNEL1x4_SUBI1 KERNEL1x4_SUB1 @@ -1414,48 +1450,48 @@ DGEMM_L1x4_SUB4: KERNEL1x4_SUB1 KERNEL1x4_SUB1 - b DGEMM_L1x4_SUB1 + b .LDGEMM_L1x4_SUB1 -DGEMM_L1x4_SUB0: +.LDGEMM_L1x4_SUB0: andi. L, K, 7 KERNEL1x4_SUBI1 addic. L, L, -1 - ble DGEMM_L1x4_SAVE - b DGEMM_L1x4_SUB2 + ble .LDGEMM_L1x4_SAVE + b .LDGEMM_L1x4_SUB2 -DGEMM_L1x4_SUB1: +.LDGEMM_L1x4_SUB1: andi. L, K, 7 - ble DGEMM_L1x4_SAVE + ble .LDGEMM_L1x4_SAVE -DGEMM_L1x4_SUB2: +.LDGEMM_L1x4_SUB2: KERNEL1x4_SUB1 addic. L, L, -1 - bgt DGEMM_L1x4_SUB2 + bgt .LDGEMM_L1x4_SUB2 -DGEMM_L1x4_SAVE: +.LDGEMM_L1x4_SAVE: SAVE1x4 -DGEMM_L1x4_END: +.LDGEMM_L1x4_END: -DGEMM_L1x2_BEGIN: +.LDGEMM_L1x2_BEGIN: andi. T1, M, 2 - ble DGEMM_L1x2_END + ble .LDGEMM_L1x2_END mr BO, B srawi. L, K, 3 - ble DGEMM_L1x2_SUB0 + ble .LDGEMM_L1x2_SUB0 cmpwi cr0, L, 1 - ble DGEMM_L1x2_SUB4 + ble .LDGEMM_L1x2_SUB4 -DGEMM_L1x2_LOOP_START: +.LDGEMM_L1x2_LOOP_START: LOAD1x2_1 KERNEL1x2_I1 @@ -1469,11 +1505,11 @@ DGEMM_L1x2_LOOP_START: KERNEL1x2_2 addic. L, L, -2 - ble DGEMM_L1x2_LOOP_END + ble .LDGEMM_L1x2_LOOP_END .align 5 -DGEMM_L1x2_LOOP: +.LDGEMM_L1x2_LOOP: KERNEL1x2_1 KERNEL1x2_2 @@ -1486,9 +1522,9 @@ DGEMM_L1x2_LOOP: KERNEL1x2_2 addic. L, L, -1 - bgt DGEMM_L1x2_LOOP + bgt .LDGEMM_L1x2_LOOP -DGEMM_L1x2_LOOP_END: +.LDGEMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_2 @@ -1500,9 +1536,9 @@ DGEMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_E2 - b DGEMM_L1x2_SUB1 + b .LDGEMM_L1x2_SUB1 -DGEMM_L1x2_SUB4: +.LDGEMM_L1x2_SUB4: KERNEL1x2_SUBI1 KERNEL1x2_SUB1 @@ -1514,48 +1550,48 @@ DGEMM_L1x2_SUB4: KERNEL1x2_SUB1 KERNEL1x2_SUB1 - b DGEMM_L1x2_SUB1 + b .LDGEMM_L1x2_SUB1 -DGEMM_L1x2_SUB0: +.LDGEMM_L1x2_SUB0: andi. L, K, 7 KERNEL1x2_SUBI1 addic. 
L, L, -1 - ble DGEMM_L1x2_SAVE - b DGEMM_L1x2_SUB2 + ble .LDGEMM_L1x2_SAVE + b .LDGEMM_L1x2_SUB2 -DGEMM_L1x2_SUB1: +.LDGEMM_L1x2_SUB1: andi. L, K, 7 - ble DGEMM_L1x2_SAVE + ble .LDGEMM_L1x2_SAVE -DGEMM_L1x2_SUB2: +.LDGEMM_L1x2_SUB2: KERNEL1x2_SUB1 addic. L, L, -1 - bgt DGEMM_L1x2_SUB2 + bgt .LDGEMM_L1x2_SUB2 -DGEMM_L1x2_SAVE: +.LDGEMM_L1x2_SAVE: SAVE1x2 -DGEMM_L1x2_END: +.LDGEMM_L1x2_END: -DGEMM_L1x1_BEGIN: +.LDGEMM_L1x1_BEGIN: andi. T1, M, 1 - ble DGEMM_L1x1_END + ble .LDGEMM_L1x1_END mr BO, B srawi. L, K, 3 - ble DGEMM_L1x1_SUB0 + ble .LDGEMM_L1x1_SUB0 cmpwi cr0, L, 1 - ble DGEMM_L1x1_SUB4 + ble .LDGEMM_L1x1_SUB4 -DGEMM_L1x1_LOOP_START: +.LDGEMM_L1x1_LOOP_START: LOAD1x1_1 KERNEL1x1_I1 @@ -1569,11 +1605,11 @@ DGEMM_L1x1_LOOP_START: KERNEL1x1_2 addic. L, L, -2 - ble DGEMM_L1x1_LOOP_END + ble .LDGEMM_L1x1_LOOP_END .align 5 -DGEMM_L1x1_LOOP: +.LDGEMM_L1x1_LOOP: KERNEL1x1_1 KERNEL1x1_2 @@ -1586,9 +1622,9 @@ DGEMM_L1x1_LOOP: KERNEL1x1_2 addic. L, L, -1 - bgt DGEMM_L1x1_LOOP + bgt .LDGEMM_L1x1_LOOP -DGEMM_L1x1_LOOP_END: +.LDGEMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_2 @@ -1600,9 +1636,9 @@ DGEMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_E2 - b DGEMM_L1x1_SUB1 + b .LDGEMM_L1x1_SUB1 -DGEMM_L1x1_SUB4: +.LDGEMM_L1x1_SUB4: KERNEL1x1_SUBI1 KERNEL1x1_SUB1 @@ -1614,34 +1650,34 @@ DGEMM_L1x1_SUB4: KERNEL1x1_SUB1 KERNEL1x1_SUB1 - b DGEMM_L1x1_SUB1 + b .LDGEMM_L1x1_SUB1 -DGEMM_L1x1_SUB0: +.LDGEMM_L1x1_SUB0: andi. L, K, 7 KERNEL1x1_SUBI1 addic. L, L, -1 - ble DGEMM_L1x1_SAVE - b DGEMM_L1x1_SUB2 + ble .LDGEMM_L1x1_SAVE + b .LDGEMM_L1x1_SUB2 -DGEMM_L1x1_SUB1: +.LDGEMM_L1x1_SUB1: andi. L, K, 7 - ble DGEMM_L1x1_SAVE + ble .LDGEMM_L1x1_SAVE -DGEMM_L1x1_SUB2: +.LDGEMM_L1x1_SUB2: KERNEL1x1_SUB1 addic. 
L, L, -1 - bgt DGEMM_L1x1_SUB2 + bgt .LDGEMM_L1x1_SUB2 -DGEMM_L1x1_SAVE: +.LDGEMM_L1x1_SAVE: SAVE1x1 -DGEMM_L1x1_END: +.LDGEMM_L1x1_END: -DGEMM_L1_END: +.LDGEMM_L1_END: diff --git a/kernel/power/dgemm_macros_16x4_power8.S b/kernel/power/dgemm_macros_16x4_power8.S index d4090985b..27c05e08e 100644 --- a/kernel/power/dgemm_macros_16x4_power8.S +++ b/kernel/power/dgemm_macros_16x4_power8.S @@ -1,3 +1,38 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/05 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + /********************************************************************* * Macros for N=4, M=16 * *********************************************************************/ diff --git a/kernel/power/dtrmm_kernel_16x4_power8.S b/kernel/power/dtrmm_kernel_16x4_power8.S index c892c65d3..2294128a2 100644 --- a/kernel/power/dtrmm_kernel_16x4_power8.S +++ b/kernel/power/dtrmm_kernel_16x4_power8.S @@ -1,3 +1,38 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/05 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ @@ -228,11 +263,11 @@ #endif cmpwi cr0, M, 0 - ble L999_H1 + ble .L999_H1 /* M <= 0 (signed compare): branch to .L999_H1, which is defined outside this hunk (NOTE(review): presumably in the included dtrmm_logic file - confirm) */ cmpwi cr0, N, 0 - ble L999_H1 + ble .L999_H1 /* N <= 0: same early branch */ cmpwi cr0, K, 0 - ble L999_H1 + ble .L999_H1 /* K <= 0: same early branch */ #ifdef __64BIT__ addi ALPHA, SP, 296 @@ -251,7 +286,7 @@ #include "dtrmm_logic_16x4_power8.S" -L999: +.L999: /* .L prefix marks a non-exported local label. Exit path: r3 = 0 (addi with rA = 0 loads the immediate), then f14 is reloaded from 0(SP); the rest of the epilogue is outside this hunk */ addi r3, 0, 0 lfd f14, 0(SP) diff --git a/kernel/power/dtrmm_logic_16x4_power8.S b/kernel/power/dtrmm_logic_16x4_power8.S index f2886f8d6..a4340c598 100644 --- a/kernel/power/dtrmm_logic_16x4_power8.S +++ b/kernel/power/dtrmm_logic_16x4_power8.S @@ -1,7 +1,44 @@ - srawi. J, N, 2 - ble DTRMM_L4_END +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ -DTRMM_L4_BEGIN: +/************************************************************************************** +* 2016/03/05 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + + srawi. J, N, 2 + ble .LDTRMM_L4_END + +.LDTRMM_L4_BEGIN: mr CO, C mr AO, A @@ -13,9 +50,9 @@ DTRMM_L4_BEGIN: #endif srawi. 
I, M, 4 - ble DTRMM_L4x16_END + ble .LDTRMM_L4x16_END -DTRMM_L4x16_BEGIN: +.LDTRMM_L4x16_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -42,11 +79,11 @@ DTRMM_L4x16_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble DTRMM_L4x16_SUB0 + ble .LDTRMM_L4x16_SUB0 cmpwi cr0, L, 1 - ble DTRMM_L4x16_SUB4 + ble .LDTRMM_L4x16_SUB4 -DTRMM_L4x16_LOOP_START: +.LDTRMM_L4x16_LOOP_START: dcbt AO, PRE LOAD4x16_1 @@ -69,11 +106,11 @@ DTRMM_L4x16_LOOP_START: KERNEL4x16_2 addic. L, L, -2 - ble DTRMM_L4x16_LOOP_END + ble .LDTRMM_L4x16_LOOP_END .align 5 -DTRMM_L4x16_LOOP: +.LDTRMM_L4x16_LOOP: dcbt AO, PRE KERNEL4x16_1 @@ -94,9 +131,9 @@ DTRMM_L4x16_LOOP: KERNEL4x16_2 addic. L, L, -1 - bgt DTRMM_L4x16_LOOP + bgt .LDTRMM_L4x16_LOOP -DTRMM_L4x16_LOOP_END: +.LDTRMM_L4x16_LOOP_END: dcbt AO, PRE KERNEL4x16_1 @@ -115,9 +152,9 @@ DTRMM_L4x16_LOOP_END: KERNEL4x16_1 KERNEL4x16_E2 - b DTRMM_L4x16_SUB1 + b .LDTRMM_L4x16_SUB1 -DTRMM_L4x16_SUB4: +.LDTRMM_L4x16_SUB4: dcbt AO, PRE KERNEL4x16_SUBI1 @@ -133,31 +170,31 @@ DTRMM_L4x16_SUB4: KERNEL4x16_SUB1 KERNEL4x16_SUB1 - b DTRMM_L4x16_SUB1 + b .LDTRMM_L4x16_SUB1 -DTRMM_L4x16_SUB0: +.LDTRMM_L4x16_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x16_SUBI1 addic. L, L, -1 - ble DTRMM_L4x16_SAVE - b DTRMM_L4x16_SUB2 + ble .LDTRMM_L4x16_SAVE + b .LDTRMM_L4x16_SUB2 -DTRMM_L4x16_SUB1: +.LDTRMM_L4x16_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble DTRMM_L4x16_SAVE + ble .LDTRMM_L4x16_SAVE -DTRMM_L4x16_SUB2: +.LDTRMM_L4x16_SUB2: KERNEL4x16_SUB1 addic. L, L, -1 - bgt DTRMM_L4x16_SUB2 + bgt .LDTRMM_L4x16_SUB2 -DTRMM_L4x16_SAVE: +.LDTRMM_L4x16_SAVE: SAVE4x16 @@ -175,16 +212,16 @@ DTRMM_L4x16_SAVE: addic. I, I, -1 - bgt DTRMM_L4x16_BEGIN + bgt .LDTRMM_L4x16_BEGIN -DTRMM_L4x16_END: +.LDTRMM_L4x16_END: -DTRMM_L4x8_BEGIN: +.LDTRMM_L4x8_BEGIN: andi. T2, M, 15 - ble DTRMM_L4x1_END + ble .LDTRMM_L4x1_END andi. 
T1, M, 8 - ble DTRMM_L4x8_END + ble .LDTRMM_L4x8_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -210,11 +247,11 @@ DTRMM_L4x8_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble DTRMM_L4x8_SUB0 + ble .LDTRMM_L4x8_SUB0 cmpwi cr0, L, 1 - ble DTRMM_L4x8_SUB4 + ble .LDTRMM_L4x8_SUB4 -DTRMM_L4x8_LOOP_START: +.LDTRMM_L4x8_LOOP_START: LOAD4x8_1 KERNEL4x8_I1 @@ -228,11 +265,11 @@ DTRMM_L4x8_LOOP_START: KERNEL4x8_2 addic. L, L, -2 - ble DTRMM_L4x8_LOOP_END + ble .LDTRMM_L4x8_LOOP_END .align 5 -DTRMM_L4x8_LOOP: +.LDTRMM_L4x8_LOOP: KERNEL4x8_1 KERNEL4x8_2 @@ -245,9 +282,9 @@ DTRMM_L4x8_LOOP: KERNEL4x8_2 addic. L, L, -1 - bgt DTRMM_L4x8_LOOP + bgt .LDTRMM_L4x8_LOOP -DTRMM_L4x8_LOOP_END: +.LDTRMM_L4x8_LOOP_END: KERNEL4x8_1 KERNEL4x8_2 @@ -259,9 +296,9 @@ DTRMM_L4x8_LOOP_END: KERNEL4x8_1 KERNEL4x8_E2 - b DTRMM_L4x8_SUB1 + b .LDTRMM_L4x8_SUB1 -DTRMM_L4x8_SUB4: +.LDTRMM_L4x8_SUB4: KERNEL4x8_SUBI1 KERNEL4x8_SUB1 @@ -273,31 +310,31 @@ DTRMM_L4x8_SUB4: KERNEL4x8_SUB1 KERNEL4x8_SUB1 - b DTRMM_L4x8_SUB1 + b .LDTRMM_L4x8_SUB1 -DTRMM_L4x8_SUB0: +.LDTRMM_L4x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x8_SUBI1 addic. L, L, -1 - ble DTRMM_L4x8_SAVE - b DTRMM_L4x8_SUB2 + ble .LDTRMM_L4x8_SAVE + b .LDTRMM_L4x8_SUB2 -DTRMM_L4x8_SUB1: +.LDTRMM_L4x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble DTRMM_L4x8_SAVE + ble .LDTRMM_L4x8_SAVE -DTRMM_L4x8_SUB2: +.LDTRMM_L4x8_SUB2: KERNEL4x8_SUB1 addic. L, L, -1 - bgt DTRMM_L4x8_SUB2 + bgt .LDTRMM_L4x8_SUB2 -DTRMM_L4x8_SAVE: +.LDTRMM_L4x8_SAVE: SAVE4x8 @@ -314,12 +351,12 @@ DTRMM_L4x8_SAVE: #endif -DTRMM_L4x8_END: +.LDTRMM_L4x8_END: -DTRMM_L4x4_BEGIN: +.LDTRMM_L4x4_BEGIN: andi. T1, M, 4 - ble DTRMM_L4x4_END + ble .LDTRMM_L4x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -345,11 +382,11 @@ DTRMM_L4x4_BEGIN: mr KKK, T1 mr K1, T1 srawi. 
L, K1, 3 // KTEMP / 8 -> L - ble DTRMM_L4x4_SUB0 + ble .LDTRMM_L4x4_SUB0 cmpwi cr0, L, 1 - ble DTRMM_L4x4_SUB4 + ble .LDTRMM_L4x4_SUB4 -DTRMM_L4x4_LOOP_START: +.LDTRMM_L4x4_LOOP_START: LOAD4x4_1 KERNEL4x4_I1 @@ -363,11 +400,11 @@ DTRMM_L4x4_LOOP_START: KERNEL4x4_2 addic. L, L, -2 - ble DTRMM_L4x4_LOOP_END + ble .LDTRMM_L4x4_LOOP_END .align 5 -DTRMM_L4x4_LOOP: +.LDTRMM_L4x4_LOOP: KERNEL4x4_1 KERNEL4x4_2 @@ -380,9 +417,9 @@ DTRMM_L4x4_LOOP: KERNEL4x4_2 addic. L, L, -1 - bgt DTRMM_L4x4_LOOP + bgt .LDTRMM_L4x4_LOOP -DTRMM_L4x4_LOOP_END: +.LDTRMM_L4x4_LOOP_END: KERNEL4x4_1 KERNEL4x4_2 @@ -394,9 +431,9 @@ DTRMM_L4x4_LOOP_END: KERNEL4x4_1 KERNEL4x4_E2 - b DTRMM_L4x4_SUB1 + b .LDTRMM_L4x4_SUB1 -DTRMM_L4x4_SUB4: +.LDTRMM_L4x4_SUB4: KERNEL4x4_SUBI1 KERNEL4x4_SUB1 @@ -408,31 +445,31 @@ DTRMM_L4x4_SUB4: KERNEL4x4_SUB1 KERNEL4x4_SUB1 - b DTRMM_L4x4_SUB1 + b .LDTRMM_L4x4_SUB1 -DTRMM_L4x4_SUB0: +.LDTRMM_L4x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x4_SUBI1 addic. L, L, -1 - ble DTRMM_L4x4_SAVE - b DTRMM_L4x4_SUB2 + ble .LDTRMM_L4x4_SAVE + b .LDTRMM_L4x4_SUB2 -DTRMM_L4x4_SUB1: +.LDTRMM_L4x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble DTRMM_L4x4_SAVE + ble .LDTRMM_L4x4_SAVE -DTRMM_L4x4_SUB2: +.LDTRMM_L4x4_SUB2: KERNEL4x4_SUB1 addic. L, L, -1 - bgt DTRMM_L4x4_SUB2 + bgt .LDTRMM_L4x4_SUB2 -DTRMM_L4x4_SAVE: +.LDTRMM_L4x4_SAVE: SAVE4x4 @@ -449,12 +486,12 @@ DTRMM_L4x4_SAVE: #endif -DTRMM_L4x4_END: +.LDTRMM_L4x4_END: -DTRMM_L4x2_BEGIN: +.LDTRMM_L4x2_BEGIN: andi. T1, M, 2 - ble DTRMM_L4x2_END + ble .LDTRMM_L4x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -480,11 +517,11 @@ DTRMM_L4x2_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble DTRMM_L4x2_SUB0 + ble .LDTRMM_L4x2_SUB0 cmpwi cr0, L, 1 - ble DTRMM_L4x2_SUB4 + ble .LDTRMM_L4x2_SUB4 -DTRMM_L4x2_LOOP_START: +.LDTRMM_L4x2_LOOP_START: LOAD4x2_1 KERNEL4x2_I1 @@ -498,11 +535,11 @@ DTRMM_L4x2_LOOP_START: KERNEL4x2_2 addic. 
L, L, -2 - ble DTRMM_L4x2_LOOP_END + ble .LDTRMM_L4x2_LOOP_END .align 5 -DTRMM_L4x2_LOOP: +.LDTRMM_L4x2_LOOP: KERNEL4x2_1 KERNEL4x2_2 @@ -515,9 +552,9 @@ DTRMM_L4x2_LOOP: KERNEL4x2_2 addic. L, L, -1 - bgt DTRMM_L4x2_LOOP + bgt .LDTRMM_L4x2_LOOP -DTRMM_L4x2_LOOP_END: +.LDTRMM_L4x2_LOOP_END: KERNEL4x2_1 KERNEL4x2_2 @@ -529,9 +566,9 @@ DTRMM_L4x2_LOOP_END: KERNEL4x2_1 KERNEL4x2_E2 - b DTRMM_L4x2_SUB1 + b .LDTRMM_L4x2_SUB1 -DTRMM_L4x2_SUB4: +.LDTRMM_L4x2_SUB4: KERNEL4x2_SUBI1 KERNEL4x2_SUB1 @@ -543,31 +580,31 @@ DTRMM_L4x2_SUB4: KERNEL4x2_SUB1 KERNEL4x2_SUB1 - b DTRMM_L4x2_SUB1 + b .LDTRMM_L4x2_SUB1 -DTRMM_L4x2_SUB0: +.LDTRMM_L4x2_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x2_SUBI1 addic. L, L, -1 - ble DTRMM_L4x2_SAVE - b DTRMM_L4x2_SUB2 + ble .LDTRMM_L4x2_SAVE + b .LDTRMM_L4x2_SUB2 -DTRMM_L4x2_SUB1: +.LDTRMM_L4x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble DTRMM_L4x2_SAVE + ble .LDTRMM_L4x2_SAVE -DTRMM_L4x2_SUB2: +.LDTRMM_L4x2_SUB2: KERNEL4x2_SUB1 addic. L, L, -1 - bgt DTRMM_L4x2_SUB2 + bgt .LDTRMM_L4x2_SUB2 -DTRMM_L4x2_SAVE: +.LDTRMM_L4x2_SAVE: SAVE4x2 @@ -584,12 +621,12 @@ DTRMM_L4x2_SAVE: #endif -DTRMM_L4x2_END: +.LDTRMM_L4x2_END: -DTRMM_L4x1_BEGIN: +.LDTRMM_L4x1_BEGIN: andi. T1, M, 1 - ble DTRMM_L4x1_END + ble .LDTRMM_L4x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -615,11 +652,11 @@ DTRMM_L4x1_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble DTRMM_L4x1_SUB0 + ble .LDTRMM_L4x1_SUB0 cmpwi cr0, L, 1 - ble DTRMM_L4x1_SUB4 + ble .LDTRMM_L4x1_SUB4 -DTRMM_L4x1_LOOP_START: +.LDTRMM_L4x1_LOOP_START: LOAD4x1_1 KERNEL4x1_I1 @@ -633,11 +670,11 @@ DTRMM_L4x1_LOOP_START: KERNEL4x1_2 addic. L, L, -2 - ble DTRMM_L4x1_LOOP_END + ble .LDTRMM_L4x1_LOOP_END .align 5 -DTRMM_L4x1_LOOP: +.LDTRMM_L4x1_LOOP: KERNEL4x1_1 KERNEL4x1_2 @@ -650,9 +687,9 @@ DTRMM_L4x1_LOOP: KERNEL4x1_2 addic. 
L, L, -1 - bgt DTRMM_L4x1_LOOP + bgt .LDTRMM_L4x1_LOOP -DTRMM_L4x1_LOOP_END: +.LDTRMM_L4x1_LOOP_END: KERNEL4x1_1 KERNEL4x1_2 @@ -664,9 +701,9 @@ DTRMM_L4x1_LOOP_END: KERNEL4x1_1 KERNEL4x1_E2 - b DTRMM_L4x1_SUB1 + b .LDTRMM_L4x1_SUB1 -DTRMM_L4x1_SUB4: +.LDTRMM_L4x1_SUB4: KERNEL4x1_SUBI1 KERNEL4x1_SUB1 @@ -678,31 +715,31 @@ DTRMM_L4x1_SUB4: KERNEL4x1_SUB1 KERNEL4x1_SUB1 - b DTRMM_L4x1_SUB1 + b .LDTRMM_L4x1_SUB1 -DTRMM_L4x1_SUB0: +.LDTRMM_L4x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x1_SUBI1 addic. L, L, -1 - ble DTRMM_L4x1_SAVE - b DTRMM_L4x1_SUB2 + ble .LDTRMM_L4x1_SAVE + b .LDTRMM_L4x1_SUB2 -DTRMM_L4x1_SUB1: +.LDTRMM_L4x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble DTRMM_L4x1_SAVE + ble .LDTRMM_L4x1_SAVE -DTRMM_L4x1_SUB2: +.LDTRMM_L4x1_SUB2: KERNEL4x1_SUB1 addic. L, L, -1 - bgt DTRMM_L4x1_SUB2 + bgt .LDTRMM_L4x1_SUB2 -DTRMM_L4x1_SAVE: +.LDTRMM_L4x1_SAVE: SAVE4x1 @@ -719,7 +756,7 @@ DTRMM_L4x1_SAVE: #endif -DTRMM_L4x1_END: +.LDTRMM_L4x1_END: slwi T1, K, 5 add B, B, T1 @@ -730,23 +767,23 @@ DTRMM_L4x1_END: addic. J, J, -1 - bgt DTRMM_L4_BEGIN + bgt .LDTRMM_L4_BEGIN andi. T2, N, 3 - ble L999 + ble .L999 -DTRMM_L4_END: +.LDTRMM_L4_END: - b DTRMM_L2_BEGIN + b .LDTRMM_L2_BEGIN -L999_H1: +.L999_H1: - b L999 + b .L999 -DTRMM_L2_BEGIN: +.LDTRMM_L2_BEGIN: andi. T1, N, 2 - ble DTRMM_L2_END + ble .LDTRMM_L2_END mr CO, C mr AO, A slwi T1, LDC , 1 @@ -757,9 +794,9 @@ DTRMM_L2_BEGIN: #endif srawi. I, M, 4 - ble DTRMM_L2x16_END + ble .LDTRMM_L2x16_END -DTRMM_L2x16_BEGIN: +.LDTRMM_L2x16_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -786,11 +823,11 @@ DTRMM_L2x16_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble DTRMM_L2x16_SUB0 + ble .LDTRMM_L2x16_SUB0 cmpwi cr0, L, 1 - ble DTRMM_L2x16_SUB4 + ble .LDTRMM_L2x16_SUB4 -DTRMM_L2x16_LOOP_START: +.LDTRMM_L2x16_LOOP_START: dcbt AO, PRE LOAD2x16_1 @@ -813,11 +850,11 @@ DTRMM_L2x16_LOOP_START: KERNEL2x16_2 addic. 
L, L, -2 - ble DTRMM_L2x16_LOOP_END + ble .LDTRMM_L2x16_LOOP_END .align 5 -DTRMM_L2x16_LOOP: +.LDTRMM_L2x16_LOOP: dcbt AO, PRE KERNEL2x16_1 @@ -838,9 +875,9 @@ DTRMM_L2x16_LOOP: KERNEL2x16_2 addic. L, L, -1 - bgt DTRMM_L2x16_LOOP + bgt .LDTRMM_L2x16_LOOP -DTRMM_L2x16_LOOP_END: +.LDTRMM_L2x16_LOOP_END: dcbt AO, PRE KERNEL2x16_1 @@ -859,9 +896,9 @@ DTRMM_L2x16_LOOP_END: KERNEL2x16_1 KERNEL2x16_E2 - b DTRMM_L2x16_SUB1 + b .LDTRMM_L2x16_SUB1 -DTRMM_L2x16_SUB4: +.LDTRMM_L2x16_SUB4: dcbt AO, PRE KERNEL2x16_SUBI1 @@ -877,31 +914,31 @@ DTRMM_L2x16_SUB4: KERNEL2x16_SUB1 KERNEL2x16_SUB1 - b DTRMM_L2x16_SUB1 + b .LDTRMM_L2x16_SUB1 -DTRMM_L2x16_SUB0: +.LDTRMM_L2x16_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x16_SUBI1 addic. L, L, -1 - ble DTRMM_L2x16_SAVE - b DTRMM_L2x16_SUB2 + ble .LDTRMM_L2x16_SAVE + b .LDTRMM_L2x16_SUB2 -DTRMM_L2x16_SUB1: +.LDTRMM_L2x16_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble DTRMM_L2x16_SAVE + ble .LDTRMM_L2x16_SAVE -DTRMM_L2x16_SUB2: +.LDTRMM_L2x16_SUB2: KERNEL2x16_SUB1 addic. L, L, -1 - bgt DTRMM_L2x16_SUB2 + bgt .LDTRMM_L2x16_SUB2 -DTRMM_L2x16_SAVE: +.LDTRMM_L2x16_SAVE: SAVE2x16 @@ -919,16 +956,16 @@ DTRMM_L2x16_SAVE: addic. I, I, -1 - bgt DTRMM_L2x16_BEGIN + bgt .LDTRMM_L2x16_BEGIN -DTRMM_L2x16_END: +.LDTRMM_L2x16_END: -DTRMM_L2x8_BEGIN: +.LDTRMM_L2x8_BEGIN: andi. T2, M, 15 - ble DTRMM_L2x1_END + ble .LDTRMM_L2x1_END andi. T1, M, 8 - ble DTRMM_L2x8_END + ble .LDTRMM_L2x8_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -954,11 +991,11 @@ DTRMM_L2x8_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble DTRMM_L2x8_SUB0 + ble .LDTRMM_L2x8_SUB0 cmpwi cr0, L, 1 - ble DTRMM_L2x8_SUB4 + ble .LDTRMM_L2x8_SUB4 -DTRMM_L2x8_LOOP_START: +.LDTRMM_L2x8_LOOP_START: LOAD2x8_1 KERNEL2x8_I1 @@ -972,11 +1009,11 @@ DTRMM_L2x8_LOOP_START: KERNEL2x8_2 addic. 
L, L, -2 - ble DTRMM_L2x8_LOOP_END + ble .LDTRMM_L2x8_LOOP_END .align 5 -DTRMM_L2x8_LOOP: +.LDTRMM_L2x8_LOOP: KERNEL2x8_1 KERNEL2x8_2 @@ -989,9 +1026,9 @@ DTRMM_L2x8_LOOP: KERNEL2x8_2 addic. L, L, -1 - bgt DTRMM_L2x8_LOOP + bgt .LDTRMM_L2x8_LOOP -DTRMM_L2x8_LOOP_END: +.LDTRMM_L2x8_LOOP_END: KERNEL2x8_1 KERNEL2x8_2 @@ -1003,9 +1040,9 @@ DTRMM_L2x8_LOOP_END: KERNEL2x8_1 KERNEL2x8_E2 - b DTRMM_L2x8_SUB1 + b .LDTRMM_L2x8_SUB1 -DTRMM_L2x8_SUB4: +.LDTRMM_L2x8_SUB4: KERNEL2x8_SUBI1 KERNEL2x8_SUB1 @@ -1017,31 +1054,31 @@ DTRMM_L2x8_SUB4: KERNEL2x8_SUB1 KERNEL2x8_SUB1 - b DTRMM_L2x8_SUB1 + b .LDTRMM_L2x8_SUB1 -DTRMM_L2x8_SUB0: +.LDTRMM_L2x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x8_SUBI1 addic. L, L, -1 - ble DTRMM_L2x8_SAVE - b DTRMM_L2x8_SUB2 + ble .LDTRMM_L2x8_SAVE + b .LDTRMM_L2x8_SUB2 -DTRMM_L2x8_SUB1: +.LDTRMM_L2x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble DTRMM_L2x8_SAVE + ble .LDTRMM_L2x8_SAVE -DTRMM_L2x8_SUB2: +.LDTRMM_L2x8_SUB2: KERNEL2x8_SUB1 addic. L, L, -1 - bgt DTRMM_L2x8_SUB2 + bgt .LDTRMM_L2x8_SUB2 -DTRMM_L2x8_SAVE: +.LDTRMM_L2x8_SAVE: SAVE2x8 @@ -1058,12 +1095,12 @@ DTRMM_L2x8_SAVE: #endif -DTRMM_L2x8_END: +.LDTRMM_L2x8_END: -DTRMM_L2x4_BEGIN: +.LDTRMM_L2x4_BEGIN: andi. T1, M, 4 - ble DTRMM_L2x4_END + ble .LDTRMM_L2x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -1089,11 +1126,11 @@ DTRMM_L2x4_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble DTRMM_L2x4_SUB0 + ble .LDTRMM_L2x4_SUB0 cmpwi cr0, L, 1 - ble DTRMM_L2x4_SUB4 + ble .LDTRMM_L2x4_SUB4 -DTRMM_L2x4_LOOP_START: +.LDTRMM_L2x4_LOOP_START: LOAD2x4_1 KERNEL2x4_I1 @@ -1107,11 +1144,11 @@ DTRMM_L2x4_LOOP_START: KERNEL2x4_2 addic. L, L, -2 - ble DTRMM_L2x4_LOOP_END + ble .LDTRMM_L2x4_LOOP_END .align 5 -DTRMM_L2x4_LOOP: +.LDTRMM_L2x4_LOOP: KERNEL2x4_1 KERNEL2x4_2 @@ -1124,9 +1161,9 @@ DTRMM_L2x4_LOOP: KERNEL2x4_2 addic. 
L, L, -1 - bgt DTRMM_L2x4_LOOP + bgt .LDTRMM_L2x4_LOOP -DTRMM_L2x4_LOOP_END: +.LDTRMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_2 @@ -1138,9 +1175,9 @@ DTRMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_E2 - b DTRMM_L2x4_SUB1 + b .LDTRMM_L2x4_SUB1 -DTRMM_L2x4_SUB4: +.LDTRMM_L2x4_SUB4: KERNEL2x4_SUBI1 KERNEL2x4_SUB1 @@ -1152,31 +1189,31 @@ DTRMM_L2x4_SUB4: KERNEL2x4_SUB1 KERNEL2x4_SUB1 - b DTRMM_L2x4_SUB1 + b .LDTRMM_L2x4_SUB1 -DTRMM_L2x4_SUB0: +.LDTRMM_L2x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x4_SUBI1 addic. L, L, -1 - ble DTRMM_L2x4_SAVE - b DTRMM_L2x4_SUB2 + ble .LDTRMM_L2x4_SAVE + b .LDTRMM_L2x4_SUB2 -DTRMM_L2x4_SUB1: +.LDTRMM_L2x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble DTRMM_L2x4_SAVE + ble .LDTRMM_L2x4_SAVE -DTRMM_L2x4_SUB2: +.LDTRMM_L2x4_SUB2: KERNEL2x4_SUB1 addic. L, L, -1 - bgt DTRMM_L2x4_SUB2 + bgt .LDTRMM_L2x4_SUB2 -DTRMM_L2x4_SAVE: +.LDTRMM_L2x4_SAVE: SAVE2x4 @@ -1193,12 +1230,12 @@ DTRMM_L2x4_SAVE: #endif -DTRMM_L2x4_END: +.LDTRMM_L2x4_END: -DTRMM_L2x2_BEGIN: +.LDTRMM_L2x2_BEGIN: andi. T1, M, 2 - ble DTRMM_L2x2_END + ble .LDTRMM_L2x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -1224,11 +1261,11 @@ DTRMM_L2x2_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble DTRMM_L2x2_SUB0 + ble .LDTRMM_L2x2_SUB0 cmpwi cr0, L, 1 - ble DTRMM_L2x2_SUB4 + ble .LDTRMM_L2x2_SUB4 -DTRMM_L2x2_LOOP_START: +.LDTRMM_L2x2_LOOP_START: LOAD2x2_1 KERNEL2x2_I1 @@ -1242,11 +1279,11 @@ DTRMM_L2x2_LOOP_START: KERNEL2x2_2 addic. L, L, -2 - ble DTRMM_L2x2_LOOP_END + ble .LDTRMM_L2x2_LOOP_END .align 5 -DTRMM_L2x2_LOOP: +.LDTRMM_L2x2_LOOP: KERNEL2x2_1 KERNEL2x2_2 @@ -1259,9 +1296,9 @@ DTRMM_L2x2_LOOP: KERNEL2x2_2 addic. 
L, L, -1 - bgt DTRMM_L2x2_LOOP + bgt .LDTRMM_L2x2_LOOP -DTRMM_L2x2_LOOP_END: +.LDTRMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_2 @@ -1273,9 +1310,9 @@ DTRMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_E2 - b DTRMM_L2x2_SUB1 + b .LDTRMM_L2x2_SUB1 -DTRMM_L2x2_SUB4: +.LDTRMM_L2x2_SUB4: KERNEL2x2_SUBI1 KERNEL2x2_SUB1 @@ -1287,31 +1324,31 @@ DTRMM_L2x2_SUB4: KERNEL2x2_SUB1 KERNEL2x2_SUB1 - b DTRMM_L2x2_SUB1 + b .LDTRMM_L2x2_SUB1 -DTRMM_L2x2_SUB0: +.LDTRMM_L2x2_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x2_SUBI1 addic. L, L, -1 - ble DTRMM_L2x2_SAVE - b DTRMM_L2x2_SUB2 + ble .LDTRMM_L2x2_SAVE + b .LDTRMM_L2x2_SUB2 -DTRMM_L2x2_SUB1: +.LDTRMM_L2x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble DTRMM_L2x2_SAVE + ble .LDTRMM_L2x2_SAVE -DTRMM_L2x2_SUB2: +.LDTRMM_L2x2_SUB2: KERNEL2x2_SUB1 addic. L, L, -1 - bgt DTRMM_L2x2_SUB2 + bgt .LDTRMM_L2x2_SUB2 -DTRMM_L2x2_SAVE: +.LDTRMM_L2x2_SAVE: SAVE2x2 @@ -1328,12 +1365,12 @@ DTRMM_L2x2_SAVE: #endif -DTRMM_L2x2_END: +.LDTRMM_L2x2_END: -DTRMM_L2x1_BEGIN: +.LDTRMM_L2x1_BEGIN: andi. T1, M, 1 - ble DTRMM_L2x1_END + ble .LDTRMM_L2x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -1359,11 +1396,11 @@ DTRMM_L2x1_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble DTRMM_L2x1_SUB0 + ble .LDTRMM_L2x1_SUB0 cmpwi cr0, L, 1 - ble DTRMM_L2x1_SUB4 + ble .LDTRMM_L2x1_SUB4 -DTRMM_L2x1_LOOP_START: +.LDTRMM_L2x1_LOOP_START: LOAD2x1_1 KERNEL2x1_I1 @@ -1377,11 +1414,11 @@ DTRMM_L2x1_LOOP_START: KERNEL2x1_2 addic. L, L, -2 - ble DTRMM_L2x1_LOOP_END + ble .LDTRMM_L2x1_LOOP_END .align 5 -DTRMM_L2x1_LOOP: +.LDTRMM_L2x1_LOOP: KERNEL2x1_1 KERNEL2x1_2 @@ -1394,9 +1431,9 @@ DTRMM_L2x1_LOOP: KERNEL2x1_2 addic. 
L, L, -1 - bgt DTRMM_L2x1_LOOP + bgt .LDTRMM_L2x1_LOOP -DTRMM_L2x1_LOOP_END: +.LDTRMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_2 @@ -1408,9 +1445,9 @@ DTRMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_E2 - b DTRMM_L2x1_SUB1 + b .LDTRMM_L2x1_SUB1 -DTRMM_L2x1_SUB4: +.LDTRMM_L2x1_SUB4: KERNEL2x1_SUBI1 KERNEL2x1_SUB1 @@ -1422,31 +1459,31 @@ DTRMM_L2x1_SUB4: KERNEL2x1_SUB1 KERNEL2x1_SUB1 - b DTRMM_L2x1_SUB1 + b .LDTRMM_L2x1_SUB1 -DTRMM_L2x1_SUB0: +.LDTRMM_L2x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x1_SUBI1 addic. L, L, -1 - ble DTRMM_L2x1_SAVE - b DTRMM_L2x1_SUB2 + ble .LDTRMM_L2x1_SAVE + b .LDTRMM_L2x1_SUB2 -DTRMM_L2x1_SUB1: +.LDTRMM_L2x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble DTRMM_L2x1_SAVE + ble .LDTRMM_L2x1_SAVE -DTRMM_L2x1_SUB2: +.LDTRMM_L2x1_SUB2: KERNEL2x1_SUB1 addic. L, L, -1 - bgt DTRMM_L2x1_SUB2 + bgt .LDTRMM_L2x1_SUB2 -DTRMM_L2x1_SAVE: +.LDTRMM_L2x1_SAVE: SAVE2x1 @@ -1463,7 +1500,7 @@ DTRMM_L2x1_SAVE: #endif -DTRMM_L2x1_END: +.LDTRMM_L2x1_END: slwi T1, K, 4 add B, B, T1 @@ -1473,11 +1510,11 @@ DTRMM_L2x1_END: #endif -DTRMM_L2_END: -DTRMM_L1_BEGIN: +.LDTRMM_L2_END: +.LDTRMM_L1_BEGIN: andi. T1, N, 1 - ble DTRMM_L1_END + ble .LDTRMM_L1_END mr CO, C mr AO, A @@ -1486,9 +1523,9 @@ DTRMM_L1_BEGIN: #endif srawi. I, M, 4 - ble DTRMM_L1x16_END + ble .LDTRMM_L1x16_END -DTRMM_L1x16_BEGIN: +.LDTRMM_L1x16_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1515,11 +1552,11 @@ DTRMM_L1x16_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble DTRMM_L1x16_SUB0 + ble .LDTRMM_L1x16_SUB0 cmpwi cr0, L, 1 - ble DTRMM_L1x16_SUB4 + ble .LDTRMM_L1x16_SUB4 -DTRMM_L1x16_LOOP_START: +.LDTRMM_L1x16_LOOP_START: dcbt AO, PRE LOAD1x16_1 @@ -1542,11 +1579,11 @@ DTRMM_L1x16_LOOP_START: KERNEL1x16_2 addic. L, L, -2 - ble DTRMM_L1x16_LOOP_END + ble .LDTRMM_L1x16_LOOP_END .align 5 -DTRMM_L1x16_LOOP: +.LDTRMM_L1x16_LOOP: dcbt AO, PRE KERNEL1x16_1 @@ -1567,9 +1604,9 @@ DTRMM_L1x16_LOOP: KERNEL1x16_2 addic. 
L, L, -1 - bgt DTRMM_L1x16_LOOP + bgt .LDTRMM_L1x16_LOOP -DTRMM_L1x16_LOOP_END: +.LDTRMM_L1x16_LOOP_END: dcbt AO, PRE KERNEL1x16_1 @@ -1588,9 +1625,9 @@ DTRMM_L1x16_LOOP_END: KERNEL1x16_1 KERNEL1x16_E2 - b DTRMM_L1x16_SUB1 + b .LDTRMM_L1x16_SUB1 -DTRMM_L1x16_SUB4: +.LDTRMM_L1x16_SUB4: dcbt AO, PRE KERNEL1x16_SUBI1 @@ -1606,31 +1643,31 @@ DTRMM_L1x16_SUB4: KERNEL1x16_SUB1 KERNEL1x16_SUB1 - b DTRMM_L1x16_SUB1 + b .LDTRMM_L1x16_SUB1 -DTRMM_L1x16_SUB0: +.LDTRMM_L1x16_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x16_SUBI1 addic. L, L, -1 - ble DTRMM_L1x16_SAVE - b DTRMM_L1x16_SUB2 + ble .LDTRMM_L1x16_SAVE + b .LDTRMM_L1x16_SUB2 -DTRMM_L1x16_SUB1: +.LDTRMM_L1x16_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble DTRMM_L1x16_SAVE + ble .LDTRMM_L1x16_SAVE -DTRMM_L1x16_SUB2: +.LDTRMM_L1x16_SUB2: KERNEL1x16_SUB1 addic. L, L, -1 - bgt DTRMM_L1x16_SUB2 + bgt .LDTRMM_L1x16_SUB2 -DTRMM_L1x16_SAVE: +.LDTRMM_L1x16_SAVE: SAVE1x16 @@ -1648,16 +1685,16 @@ DTRMM_L1x16_SAVE: addic. I, I, -1 - bgt DTRMM_L1x16_BEGIN + bgt .LDTRMM_L1x16_BEGIN -DTRMM_L1x16_END: +.LDTRMM_L1x16_END: -DTRMM_L1x8_BEGIN: +.LDTRMM_L1x8_BEGIN: andi. T2, M, 15 - ble DTRMM_L1x1_END + ble .LDTRMM_L1x1_END andi. T1, M, 8 - ble DTRMM_L1x8_END + ble .LDTRMM_L1x8_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -1683,11 +1720,11 @@ DTRMM_L1x8_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble DTRMM_L1x8_SUB0 + ble .LDTRMM_L1x8_SUB0 cmpwi cr0, L, 1 - ble DTRMM_L1x8_SUB4 + ble .LDTRMM_L1x8_SUB4 -DTRMM_L1x8_LOOP_START: +.LDTRMM_L1x8_LOOP_START: LOAD1x8_1 KERNEL1x8_I1 @@ -1701,11 +1738,11 @@ DTRMM_L1x8_LOOP_START: KERNEL1x8_2 addic. L, L, -2 - ble DTRMM_L1x8_LOOP_END + ble .LDTRMM_L1x8_LOOP_END .align 5 -DTRMM_L1x8_LOOP: +.LDTRMM_L1x8_LOOP: KERNEL1x8_1 KERNEL1x8_2 @@ -1718,9 +1755,9 @@ DTRMM_L1x8_LOOP: KERNEL1x8_2 addic. 
L, L, -1 - bgt DTRMM_L1x8_LOOP + bgt .LDTRMM_L1x8_LOOP -DTRMM_L1x8_LOOP_END: +.LDTRMM_L1x8_LOOP_END: KERNEL1x8_1 KERNEL1x8_2 @@ -1732,9 +1769,9 @@ DTRMM_L1x8_LOOP_END: KERNEL1x8_1 KERNEL1x8_E2 - b DTRMM_L1x8_SUB1 + b .LDTRMM_L1x8_SUB1 -DTRMM_L1x8_SUB4: +.LDTRMM_L1x8_SUB4: KERNEL1x8_SUBI1 KERNEL1x8_SUB1 @@ -1746,31 +1783,31 @@ DTRMM_L1x8_SUB4: KERNEL1x8_SUB1 KERNEL1x8_SUB1 - b DTRMM_L1x8_SUB1 + b .LDTRMM_L1x8_SUB1 -DTRMM_L1x8_SUB0: +.LDTRMM_L1x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x8_SUBI1 addic. L, L, -1 - ble DTRMM_L1x8_SAVE - b DTRMM_L1x8_SUB2 + ble .LDTRMM_L1x8_SAVE + b .LDTRMM_L1x8_SUB2 -DTRMM_L1x8_SUB1: +.LDTRMM_L1x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble DTRMM_L1x8_SAVE + ble .LDTRMM_L1x8_SAVE -DTRMM_L1x8_SUB2: +.LDTRMM_L1x8_SUB2: KERNEL1x8_SUB1 addic. L, L, -1 - bgt DTRMM_L1x8_SUB2 + bgt .LDTRMM_L1x8_SUB2 -DTRMM_L1x8_SAVE: +.LDTRMM_L1x8_SAVE: SAVE1x8 @@ -1787,12 +1824,12 @@ DTRMM_L1x8_SAVE: #endif -DTRMM_L1x8_END: +.LDTRMM_L1x8_END: -DTRMM_L1x4_BEGIN: +.LDTRMM_L1x4_BEGIN: andi. T1, M, 4 - ble DTRMM_L1x4_END + ble .LDTRMM_L1x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -1818,11 +1855,11 @@ DTRMM_L1x4_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble DTRMM_L1x4_SUB0 + ble .LDTRMM_L1x4_SUB0 cmpwi cr0, L, 1 - ble DTRMM_L1x4_SUB4 + ble .LDTRMM_L1x4_SUB4 -DTRMM_L1x4_LOOP_START: +.LDTRMM_L1x4_LOOP_START: LOAD1x4_1 KERNEL1x4_I1 @@ -1836,11 +1873,11 @@ DTRMM_L1x4_LOOP_START: KERNEL1x4_2 addic. L, L, -2 - ble DTRMM_L1x4_LOOP_END + ble .LDTRMM_L1x4_LOOP_END .align 5 -DTRMM_L1x4_LOOP: +.LDTRMM_L1x4_LOOP: KERNEL1x4_1 KERNEL1x4_2 @@ -1853,9 +1890,9 @@ DTRMM_L1x4_LOOP: KERNEL1x4_2 addic. 
L, L, -1 - bgt DTRMM_L1x4_LOOP + bgt .LDTRMM_L1x4_LOOP -DTRMM_L1x4_LOOP_END: +.LDTRMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_2 @@ -1867,9 +1904,9 @@ DTRMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_E2 - b DTRMM_L1x4_SUB1 + b .LDTRMM_L1x4_SUB1 -DTRMM_L1x4_SUB4: +.LDTRMM_L1x4_SUB4: KERNEL1x4_SUBI1 KERNEL1x4_SUB1 @@ -1881,31 +1918,31 @@ DTRMM_L1x4_SUB4: KERNEL1x4_SUB1 KERNEL1x4_SUB1 - b DTRMM_L1x4_SUB1 + b .LDTRMM_L1x4_SUB1 -DTRMM_L1x4_SUB0: +.LDTRMM_L1x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x4_SUBI1 addic. L, L, -1 - ble DTRMM_L1x4_SAVE - b DTRMM_L1x4_SUB2 + ble .LDTRMM_L1x4_SAVE + b .LDTRMM_L1x4_SUB2 -DTRMM_L1x4_SUB1: +.LDTRMM_L1x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble DTRMM_L1x4_SAVE + ble .LDTRMM_L1x4_SAVE -DTRMM_L1x4_SUB2: +.LDTRMM_L1x4_SUB2: KERNEL1x4_SUB1 addic. L, L, -1 - bgt DTRMM_L1x4_SUB2 + bgt .LDTRMM_L1x4_SUB2 -DTRMM_L1x4_SAVE: +.LDTRMM_L1x4_SAVE: SAVE1x4 @@ -1922,12 +1959,12 @@ DTRMM_L1x4_SAVE: #endif -DTRMM_L1x4_END: +.LDTRMM_L1x4_END: -DTRMM_L1x2_BEGIN: +.LDTRMM_L1x2_BEGIN: andi. T1, M, 2 - ble DTRMM_L1x2_END + ble .LDTRMM_L1x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -1953,11 +1990,11 @@ DTRMM_L1x2_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble DTRMM_L1x2_SUB0 + ble .LDTRMM_L1x2_SUB0 cmpwi cr0, L, 1 - ble DTRMM_L1x2_SUB4 + ble .LDTRMM_L1x2_SUB4 -DTRMM_L1x2_LOOP_START: +.LDTRMM_L1x2_LOOP_START: LOAD1x2_1 KERNEL1x2_I1 @@ -1971,11 +2008,11 @@ DTRMM_L1x2_LOOP_START: KERNEL1x2_2 addic. L, L, -2 - ble DTRMM_L1x2_LOOP_END + ble .LDTRMM_L1x2_LOOP_END .align 5 -DTRMM_L1x2_LOOP: +.LDTRMM_L1x2_LOOP: KERNEL1x2_1 KERNEL1x2_2 @@ -1988,9 +2025,9 @@ DTRMM_L1x2_LOOP: KERNEL1x2_2 addic. 
L, L, -1 - bgt DTRMM_L1x2_LOOP + bgt .LDTRMM_L1x2_LOOP -DTRMM_L1x2_LOOP_END: +.LDTRMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_2 @@ -2002,9 +2039,9 @@ DTRMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_E2 - b DTRMM_L1x2_SUB1 + b .LDTRMM_L1x2_SUB1 -DTRMM_L1x2_SUB4: +.LDTRMM_L1x2_SUB4: KERNEL1x2_SUBI1 KERNEL1x2_SUB1 @@ -2016,31 +2053,31 @@ DTRMM_L1x2_SUB4: KERNEL1x2_SUB1 KERNEL1x2_SUB1 - b DTRMM_L1x2_SUB1 + b .LDTRMM_L1x2_SUB1 -DTRMM_L1x2_SUB0: +.LDTRMM_L1x2_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x2_SUBI1 addic. L, L, -1 - ble DTRMM_L1x2_SAVE - b DTRMM_L1x2_SUB2 + ble .LDTRMM_L1x2_SAVE + b .LDTRMM_L1x2_SUB2 -DTRMM_L1x2_SUB1: +.LDTRMM_L1x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble DTRMM_L1x2_SAVE + ble .LDTRMM_L1x2_SAVE -DTRMM_L1x2_SUB2: +.LDTRMM_L1x2_SUB2: KERNEL1x2_SUB1 addic. L, L, -1 - bgt DTRMM_L1x2_SUB2 + bgt .LDTRMM_L1x2_SUB2 -DTRMM_L1x2_SAVE: +.LDTRMM_L1x2_SAVE: SAVE1x2 @@ -2057,12 +2094,12 @@ DTRMM_L1x2_SAVE: #endif -DTRMM_L1x2_END: +.LDTRMM_L1x2_END: -DTRMM_L1x1_BEGIN: +.LDTRMM_L1x1_BEGIN: andi. T1, M, 1 - ble DTRMM_L1x1_END + ble .LDTRMM_L1x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -2088,11 +2125,11 @@ DTRMM_L1x1_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble DTRMM_L1x1_SUB0 + ble .LDTRMM_L1x1_SUB0 cmpwi cr0, L, 1 - ble DTRMM_L1x1_SUB4 + ble .LDTRMM_L1x1_SUB4 -DTRMM_L1x1_LOOP_START: +.LDTRMM_L1x1_LOOP_START: LOAD1x1_1 KERNEL1x1_I1 @@ -2106,11 +2143,11 @@ DTRMM_L1x1_LOOP_START: KERNEL1x1_2 addic. L, L, -2 - ble DTRMM_L1x1_LOOP_END + ble .LDTRMM_L1x1_LOOP_END .align 5 -DTRMM_L1x1_LOOP: +.LDTRMM_L1x1_LOOP: KERNEL1x1_1 KERNEL1x1_2 @@ -2123,9 +2160,9 @@ DTRMM_L1x1_LOOP: KERNEL1x1_2 addic. 
L, L, -1 - bgt DTRMM_L1x1_LOOP + bgt .LDTRMM_L1x1_LOOP -DTRMM_L1x1_LOOP_END: +.LDTRMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_2 @@ -2137,9 +2174,9 @@ DTRMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_E2 - b DTRMM_L1x1_SUB1 + b .LDTRMM_L1x1_SUB1 -DTRMM_L1x1_SUB4: +.LDTRMM_L1x1_SUB4: KERNEL1x1_SUBI1 KERNEL1x1_SUB1 @@ -2151,31 +2188,31 @@ DTRMM_L1x1_SUB4: KERNEL1x1_SUB1 KERNEL1x1_SUB1 - b DTRMM_L1x1_SUB1 + b .LDTRMM_L1x1_SUB1 -DTRMM_L1x1_SUB0: +.LDTRMM_L1x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x1_SUBI1 addic. L, L, -1 - ble DTRMM_L1x1_SAVE - b DTRMM_L1x1_SUB2 + ble .LDTRMM_L1x1_SAVE + b .LDTRMM_L1x1_SUB2 -DTRMM_L1x1_SUB1: +.LDTRMM_L1x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble DTRMM_L1x1_SAVE + ble .LDTRMM_L1x1_SAVE -DTRMM_L1x1_SUB2: +.LDTRMM_L1x1_SUB2: KERNEL1x1_SUB1 addic. L, L, -1 - bgt DTRMM_L1x1_SUB2 + bgt .LDTRMM_L1x1_SUB2 -DTRMM_L1x1_SAVE: +.LDTRMM_L1x1_SAVE: SAVE1x1 @@ -2192,11 +2229,11 @@ DTRMM_L1x1_SAVE: #endif -DTRMM_L1x1_END: +.LDTRMM_L1x1_END: #if !defined(LEFT) addi KK, KK, 1 // KK += Number of values in B #endif -DTRMM_L1_END: +.LDTRMM_L1_END: diff --git a/kernel/power/zgemm_kernel_8x2_power8.S b/kernel/power/zgemm_kernel_8x2_power8.S index 03957f406..a7665f749 100644 --- a/kernel/power/zgemm_kernel_8x2_power8.S +++ b/kernel/power/zgemm_kernel_8x2_power8.S @@ -1,3 +1,38 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/05 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ @@ -233,11 +268,11 @@ #include "zgemm_macros_8x2_power8.S" cmpwi cr0, M, 0 - ble L999 + ble .L999 cmpwi cr0, N, 0 - ble L999 + ble .L999 cmpwi cr0, K, 0 - ble L999 + ble .L999 slwi LDC, LDC, ZBASE_SHIFT li PRE, 256 @@ -260,7 +295,7 @@ #include "zgemm_logic_8x2_power8.S" -L999: +.L999: addi r3, 0, 0 lfd f14, 0(SP) diff --git a/kernel/power/zgemm_logic_8x2_power8.S b/kernel/power/zgemm_logic_8x2_power8.S index e829fd68e..5fcade5bf 100644 --- a/kernel/power/zgemm_logic_8x2_power8.S +++ b/kernel/power/zgemm_logic_8x2_power8.S @@ -1,25 +1,25 @@ srawi. J, N, 1 - ble ZGEMM_L2_END + ble .LZGEMM_L2_END -ZGEMM_L2_BEGIN: +.LZGEMM_L2_BEGIN: mr CO, C mr AO, A slwi T1, LDC , 1 add C, C, T1 srawi. I, M, 3 - ble ZGEMM_L2x8_END + ble .LZGEMM_L2x8_END -ZGEMM_L2x8_BEGIN: +.LZGEMM_L2x8_BEGIN: mr BO, B srawi. L, K, 3 - ble ZGEMM_L2x8_SUB0 + ble .LZGEMM_L2x8_SUB0 cmpwi cr0, L, 1 - ble ZGEMM_L2x8_SUB4 + ble .LZGEMM_L2x8_SUB4 -ZGEMM_L2x8_LOOP_START: +.LZGEMM_L2x8_LOOP_START: dcbt AO, PRE LOAD2x8_1 @@ -42,11 +42,11 @@ ZGEMM_L2x8_LOOP_START: KERNEL2x8_2 addic. L, L, -2 - ble ZGEMM_L2x8_LOOP_END + ble .LZGEMM_L2x8_LOOP_END .align 5 -ZGEMM_L2x8_LOOP: +.LZGEMM_L2x8_LOOP: dcbt AO, PRE KERNEL2x8_1 @@ -67,9 +67,9 @@ ZGEMM_L2x8_LOOP: KERNEL2x8_2 addic. L, L, -1 - bgt ZGEMM_L2x8_LOOP + bgt .LZGEMM_L2x8_LOOP -ZGEMM_L2x8_LOOP_END: +.LZGEMM_L2x8_LOOP_END: dcbt AO, PRE KERNEL2x8_1 @@ -88,9 +88,9 @@ ZGEMM_L2x8_LOOP_END: KERNEL2x8_1 KERNEL2x8_E2 - b ZGEMM_L2x8_SUB1 + b .LZGEMM_L2x8_SUB1 -ZGEMM_L2x8_SUB4: +.LZGEMM_L2x8_SUB4: dcbt AO, PRE KERNEL2x8_SUBI1 @@ -106,53 +106,53 @@ ZGEMM_L2x8_SUB4: KERNEL2x8_SUB1 KERNEL2x8_SUB1 - b ZGEMM_L2x8_SUB1 + b .LZGEMM_L2x8_SUB1 -ZGEMM_L2x8_SUB0: +.LZGEMM_L2x8_SUB0: andi. L, K, 7 KERNEL2x8_SUBI1 addic. L, L, -1 - ble ZGEMM_L2x8_SAVE - b ZGEMM_L2x8_SUB2 + ble .LZGEMM_L2x8_SAVE + b .LZGEMM_L2x8_SUB2 -ZGEMM_L2x8_SUB1: +.LZGEMM_L2x8_SUB1: andi. L, K, 7 - ble ZGEMM_L2x8_SAVE + ble .LZGEMM_L2x8_SAVE -ZGEMM_L2x8_SUB2: +.LZGEMM_L2x8_SUB2: KERNEL2x8_SUB1 addic. 
L, L, -1 - bgt ZGEMM_L2x8_SUB2 + bgt .LZGEMM_L2x8_SUB2 -ZGEMM_L2x8_SAVE: +.LZGEMM_L2x8_SAVE: SAVE2x8 addic. I, I, -1 - bgt ZGEMM_L2x8_BEGIN + bgt .LZGEMM_L2x8_BEGIN -ZGEMM_L2x8_END: +.LZGEMM_L2x8_END: -ZGEMM_L2x4_BEGIN: +.LZGEMM_L2x4_BEGIN: andi. T2, M, 7 - ble ZGEMM_L2x1_END + ble .LZGEMM_L2x1_END andi. T1, M, 4 - ble ZGEMM_L2x4_END + ble .LZGEMM_L2x4_END mr BO, B srawi. L, K, 3 - ble ZGEMM_L2x4_SUB0 + ble .LZGEMM_L2x4_SUB0 cmpwi cr0, L, 1 - ble ZGEMM_L2x4_SUB4 + ble .LZGEMM_L2x4_SUB4 -ZGEMM_L2x4_LOOP_START: +.LZGEMM_L2x4_LOOP_START: LOAD2x4_1 KERNEL2x4_I1 @@ -166,11 +166,11 @@ ZGEMM_L2x4_LOOP_START: KERNEL2x4_2 addic. L, L, -2 - ble ZGEMM_L2x4_LOOP_END + ble .LZGEMM_L2x4_LOOP_END .align 5 -ZGEMM_L2x4_LOOP: +.LZGEMM_L2x4_LOOP: KERNEL2x4_1 KERNEL2x4_2 @@ -183,9 +183,9 @@ ZGEMM_L2x4_LOOP: KERNEL2x4_2 addic. L, L, -1 - bgt ZGEMM_L2x4_LOOP + bgt .LZGEMM_L2x4_LOOP -ZGEMM_L2x4_LOOP_END: +.LZGEMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_2 @@ -197,9 +197,9 @@ ZGEMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_E2 - b ZGEMM_L2x4_SUB1 + b .LZGEMM_L2x4_SUB1 -ZGEMM_L2x4_SUB4: +.LZGEMM_L2x4_SUB4: KERNEL2x4_SUBI1 KERNEL2x4_SUB1 @@ -211,48 +211,48 @@ ZGEMM_L2x4_SUB4: KERNEL2x4_SUB1 KERNEL2x4_SUB1 - b ZGEMM_L2x4_SUB1 + b .LZGEMM_L2x4_SUB1 -ZGEMM_L2x4_SUB0: +.LZGEMM_L2x4_SUB0: andi. L, K, 7 KERNEL2x4_SUBI1 addic. L, L, -1 - ble ZGEMM_L2x4_SAVE - b ZGEMM_L2x4_SUB2 + ble .LZGEMM_L2x4_SAVE + b .LZGEMM_L2x4_SUB2 -ZGEMM_L2x4_SUB1: +.LZGEMM_L2x4_SUB1: andi. L, K, 7 - ble ZGEMM_L2x4_SAVE + ble .LZGEMM_L2x4_SAVE -ZGEMM_L2x4_SUB2: +.LZGEMM_L2x4_SUB2: KERNEL2x4_SUB1 addic. L, L, -1 - bgt ZGEMM_L2x4_SUB2 + bgt .LZGEMM_L2x4_SUB2 -ZGEMM_L2x4_SAVE: +.LZGEMM_L2x4_SAVE: SAVE2x4 -ZGEMM_L2x4_END: +.LZGEMM_L2x4_END: -ZGEMM_L2x2_BEGIN: +.LZGEMM_L2x2_BEGIN: andi. T1, M, 2 - ble ZGEMM_L2x2_END + ble .LZGEMM_L2x2_END mr BO, B srawi. 
L, K, 3 - ble ZGEMM_L2x2_SUB0 + ble .LZGEMM_L2x2_SUB0 cmpwi cr0, L, 1 - ble ZGEMM_L2x2_SUB4 + ble .LZGEMM_L2x2_SUB4 -ZGEMM_L2x2_LOOP_START: +.LZGEMM_L2x2_LOOP_START: LOAD2x2_1 KERNEL2x2_I1 @@ -266,11 +266,11 @@ ZGEMM_L2x2_LOOP_START: KERNEL2x2_2 addic. L, L, -2 - ble ZGEMM_L2x2_LOOP_END + ble .LZGEMM_L2x2_LOOP_END .align 5 -ZGEMM_L2x2_LOOP: +.LZGEMM_L2x2_LOOP: KERNEL2x2_1 KERNEL2x2_2 @@ -283,9 +283,9 @@ ZGEMM_L2x2_LOOP: KERNEL2x2_2 addic. L, L, -1 - bgt ZGEMM_L2x2_LOOP + bgt .LZGEMM_L2x2_LOOP -ZGEMM_L2x2_LOOP_END: +.LZGEMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_2 @@ -297,9 +297,9 @@ ZGEMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_E2 - b ZGEMM_L2x2_SUB1 + b .LZGEMM_L2x2_SUB1 -ZGEMM_L2x2_SUB4: +.LZGEMM_L2x2_SUB4: KERNEL2x2_SUBI1 KERNEL2x2_SUB1 @@ -311,48 +311,48 @@ ZGEMM_L2x2_SUB4: KERNEL2x2_SUB1 KERNEL2x2_SUB1 - b ZGEMM_L2x2_SUB1 + b .LZGEMM_L2x2_SUB1 -ZGEMM_L2x2_SUB0: +.LZGEMM_L2x2_SUB0: andi. L, K, 7 KERNEL2x2_SUBI1 addic. L, L, -1 - ble ZGEMM_L2x2_SAVE - b ZGEMM_L2x2_SUB2 + ble .LZGEMM_L2x2_SAVE + b .LZGEMM_L2x2_SUB2 -ZGEMM_L2x2_SUB1: +.LZGEMM_L2x2_SUB1: andi. L, K, 7 - ble ZGEMM_L2x2_SAVE + ble .LZGEMM_L2x2_SAVE -ZGEMM_L2x2_SUB2: +.LZGEMM_L2x2_SUB2: KERNEL2x2_SUB1 addic. L, L, -1 - bgt ZGEMM_L2x2_SUB2 + bgt .LZGEMM_L2x2_SUB2 -ZGEMM_L2x2_SAVE: +.LZGEMM_L2x2_SAVE: SAVE2x2 -ZGEMM_L2x2_END: +.LZGEMM_L2x2_END: -ZGEMM_L2x1_BEGIN: +.LZGEMM_L2x1_BEGIN: andi. T1, M, 1 - ble ZGEMM_L2x1_END + ble .LZGEMM_L2x1_END mr BO, B srawi. L, K, 3 - ble ZGEMM_L2x1_SUB0 + ble .LZGEMM_L2x1_SUB0 cmpwi cr0, L, 1 - ble ZGEMM_L2x1_SUB4 + ble .LZGEMM_L2x1_SUB4 -ZGEMM_L2x1_LOOP_START: +.LZGEMM_L2x1_LOOP_START: LOAD2x1_1 KERNEL2x1_I1 @@ -366,11 +366,11 @@ ZGEMM_L2x1_LOOP_START: KERNEL2x1_2 addic. L, L, -2 - ble ZGEMM_L2x1_LOOP_END + ble .LZGEMM_L2x1_LOOP_END .align 5 -ZGEMM_L2x1_LOOP: +.LZGEMM_L2x1_LOOP: KERNEL2x1_1 KERNEL2x1_2 @@ -383,9 +383,9 @@ ZGEMM_L2x1_LOOP: KERNEL2x1_2 addic. 
L, L, -1 - bgt ZGEMM_L2x1_LOOP + bgt .LZGEMM_L2x1_LOOP -ZGEMM_L2x1_LOOP_END: +.LZGEMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_2 @@ -397,9 +397,9 @@ ZGEMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_E2 - b ZGEMM_L2x1_SUB1 + b .LZGEMM_L2x1_SUB1 -ZGEMM_L2x1_SUB4: +.LZGEMM_L2x1_SUB4: KERNEL2x1_SUBI1 KERNEL2x1_SUB1 @@ -411,72 +411,72 @@ ZGEMM_L2x1_SUB4: KERNEL2x1_SUB1 KERNEL2x1_SUB1 - b ZGEMM_L2x1_SUB1 + b .LZGEMM_L2x1_SUB1 -ZGEMM_L2x1_SUB0: +.LZGEMM_L2x1_SUB0: andi. L, K, 7 KERNEL2x1_SUBI1 addic. L, L, -1 - ble ZGEMM_L2x1_SAVE - b ZGEMM_L2x1_SUB2 + ble .LZGEMM_L2x1_SAVE + b .LZGEMM_L2x1_SUB2 -ZGEMM_L2x1_SUB1: +.LZGEMM_L2x1_SUB1: andi. L, K, 7 - ble ZGEMM_L2x1_SAVE + ble .LZGEMM_L2x1_SAVE -ZGEMM_L2x1_SUB2: +.LZGEMM_L2x1_SUB2: KERNEL2x1_SUB1 addic. L, L, -1 - bgt ZGEMM_L2x1_SUB2 + bgt .LZGEMM_L2x1_SUB2 -ZGEMM_L2x1_SAVE: +.LZGEMM_L2x1_SAVE: SAVE2x1 -ZGEMM_L2x1_END: +.LZGEMM_L2x1_END: slwi T1, K, 5 add B, B, T1 addic. J, J, -1 - bgt ZGEMM_L2_BEGIN + bgt .LZGEMM_L2_BEGIN andi. T2, N, 1 - ble L999 + ble .L999 -ZGEMM_L2_END: +.LZGEMM_L2_END: - b ZGEMM_L1_BEGIN + b .LZGEMM_L1_BEGIN -L999_H1: +.L999_H1: - b L999 + b .L999 -ZGEMM_L1_BEGIN: +.LZGEMM_L1_BEGIN: andi. T1, N, 1 - ble ZGEMM_L1_END + ble .LZGEMM_L1_END mr CO, C mr AO, A srawi. I, M, 3 - ble ZGEMM_L1x8_END + ble .LZGEMM_L1x8_END -ZGEMM_L1x8_BEGIN: +.LZGEMM_L1x8_BEGIN: mr BO, B srawi. L, K, 3 - ble ZGEMM_L1x8_SUB0 + ble .LZGEMM_L1x8_SUB0 cmpwi cr0, L, 1 - ble ZGEMM_L1x8_SUB4 + ble .LZGEMM_L1x8_SUB4 -ZGEMM_L1x8_LOOP_START: +.LZGEMM_L1x8_LOOP_START: dcbt AO, PRE LOAD1x8_1 @@ -499,11 +499,11 @@ ZGEMM_L1x8_LOOP_START: KERNEL1x8_2 addic. L, L, -2 - ble ZGEMM_L1x8_LOOP_END + ble .LZGEMM_L1x8_LOOP_END .align 5 -ZGEMM_L1x8_LOOP: +.LZGEMM_L1x8_LOOP: dcbt AO, PRE KERNEL1x8_1 @@ -524,9 +524,9 @@ ZGEMM_L1x8_LOOP: KERNEL1x8_2 addic. 
L, L, -1 - bgt ZGEMM_L1x8_LOOP + bgt .LZGEMM_L1x8_LOOP -ZGEMM_L1x8_LOOP_END: +.LZGEMM_L1x8_LOOP_END: dcbt AO, PRE KERNEL1x8_1 @@ -545,9 +545,9 @@ ZGEMM_L1x8_LOOP_END: KERNEL1x8_1 KERNEL1x8_E2 - b ZGEMM_L1x8_SUB1 + b .LZGEMM_L1x8_SUB1 -ZGEMM_L1x8_SUB4: +.LZGEMM_L1x8_SUB4: dcbt AO, PRE KERNEL1x8_SUBI1 @@ -563,53 +563,53 @@ ZGEMM_L1x8_SUB4: KERNEL1x8_SUB1 KERNEL1x8_SUB1 - b ZGEMM_L1x8_SUB1 + b .LZGEMM_L1x8_SUB1 -ZGEMM_L1x8_SUB0: +.LZGEMM_L1x8_SUB0: andi. L, K, 7 KERNEL1x8_SUBI1 addic. L, L, -1 - ble ZGEMM_L1x8_SAVE - b ZGEMM_L1x8_SUB2 + ble .LZGEMM_L1x8_SAVE + b .LZGEMM_L1x8_SUB2 -ZGEMM_L1x8_SUB1: +.LZGEMM_L1x8_SUB1: andi. L, K, 7 - ble ZGEMM_L1x8_SAVE + ble .LZGEMM_L1x8_SAVE -ZGEMM_L1x8_SUB2: +.LZGEMM_L1x8_SUB2: KERNEL1x8_SUB1 addic. L, L, -1 - bgt ZGEMM_L1x8_SUB2 + bgt .LZGEMM_L1x8_SUB2 -ZGEMM_L1x8_SAVE: +.LZGEMM_L1x8_SAVE: SAVE1x8 addic. I, I, -1 - bgt ZGEMM_L1x8_BEGIN + bgt .LZGEMM_L1x8_BEGIN -ZGEMM_L1x8_END: +.LZGEMM_L1x8_END: -ZGEMM_L1x4_BEGIN: +.LZGEMM_L1x4_BEGIN: andi. T2, M, 7 - ble ZGEMM_L1x1_END + ble .LZGEMM_L1x1_END andi. T1, M, 4 - ble ZGEMM_L1x4_END + ble .LZGEMM_L1x4_END mr BO, B srawi. L, K, 3 - ble ZGEMM_L1x4_SUB0 + ble .LZGEMM_L1x4_SUB0 cmpwi cr0, L, 1 - ble ZGEMM_L1x4_SUB4 + ble .LZGEMM_L1x4_SUB4 -ZGEMM_L1x4_LOOP_START: +.LZGEMM_L1x4_LOOP_START: LOAD1x4_1 KERNEL1x4_I1 @@ -623,11 +623,11 @@ ZGEMM_L1x4_LOOP_START: KERNEL1x4_2 addic. L, L, -2 - ble ZGEMM_L1x4_LOOP_END + ble .LZGEMM_L1x4_LOOP_END .align 5 -ZGEMM_L1x4_LOOP: +.LZGEMM_L1x4_LOOP: KERNEL1x4_1 KERNEL1x4_2 @@ -640,9 +640,9 @@ ZGEMM_L1x4_LOOP: KERNEL1x4_2 addic. 
L, L, -1 - bgt ZGEMM_L1x4_LOOP + bgt .LZGEMM_L1x4_LOOP -ZGEMM_L1x4_LOOP_END: +.LZGEMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_2 @@ -654,9 +654,9 @@ ZGEMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_E2 - b ZGEMM_L1x4_SUB1 + b .LZGEMM_L1x4_SUB1 -ZGEMM_L1x4_SUB4: +.LZGEMM_L1x4_SUB4: KERNEL1x4_SUBI1 KERNEL1x4_SUB1 @@ -668,48 +668,48 @@ ZGEMM_L1x4_SUB4: KERNEL1x4_SUB1 KERNEL1x4_SUB1 - b ZGEMM_L1x4_SUB1 + b .LZGEMM_L1x4_SUB1 -ZGEMM_L1x4_SUB0: +.LZGEMM_L1x4_SUB0: andi. L, K, 7 KERNEL1x4_SUBI1 addic. L, L, -1 - ble ZGEMM_L1x4_SAVE - b ZGEMM_L1x4_SUB2 + ble .LZGEMM_L1x4_SAVE + b .LZGEMM_L1x4_SUB2 -ZGEMM_L1x4_SUB1: +.LZGEMM_L1x4_SUB1: andi. L, K, 7 - ble ZGEMM_L1x4_SAVE + ble .LZGEMM_L1x4_SAVE -ZGEMM_L1x4_SUB2: +.LZGEMM_L1x4_SUB2: KERNEL1x4_SUB1 addic. L, L, -1 - bgt ZGEMM_L1x4_SUB2 + bgt .LZGEMM_L1x4_SUB2 -ZGEMM_L1x4_SAVE: +.LZGEMM_L1x4_SAVE: SAVE1x4 -ZGEMM_L1x4_END: +.LZGEMM_L1x4_END: -ZGEMM_L1x2_BEGIN: +.LZGEMM_L1x2_BEGIN: andi. T1, M, 2 - ble ZGEMM_L1x2_END + ble .LZGEMM_L1x2_END mr BO, B srawi. L, K, 3 - ble ZGEMM_L1x2_SUB0 + ble .LZGEMM_L1x2_SUB0 cmpwi cr0, L, 1 - ble ZGEMM_L1x2_SUB4 + ble .LZGEMM_L1x2_SUB4 -ZGEMM_L1x2_LOOP_START: +.LZGEMM_L1x2_LOOP_START: LOAD1x2_1 KERNEL1x2_I1 @@ -723,11 +723,11 @@ ZGEMM_L1x2_LOOP_START: KERNEL1x2_2 addic. L, L, -2 - ble ZGEMM_L1x2_LOOP_END + ble .LZGEMM_L1x2_LOOP_END .align 5 -ZGEMM_L1x2_LOOP: +.LZGEMM_L1x2_LOOP: KERNEL1x2_1 KERNEL1x2_2 @@ -740,9 +740,9 @@ ZGEMM_L1x2_LOOP: KERNEL1x2_2 addic. L, L, -1 - bgt ZGEMM_L1x2_LOOP + bgt .LZGEMM_L1x2_LOOP -ZGEMM_L1x2_LOOP_END: +.LZGEMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_2 @@ -754,9 +754,9 @@ ZGEMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_E2 - b ZGEMM_L1x2_SUB1 + b .LZGEMM_L1x2_SUB1 -ZGEMM_L1x2_SUB4: +.LZGEMM_L1x2_SUB4: KERNEL1x2_SUBI1 KERNEL1x2_SUB1 @@ -768,48 +768,48 @@ ZGEMM_L1x2_SUB4: KERNEL1x2_SUB1 KERNEL1x2_SUB1 - b ZGEMM_L1x2_SUB1 + b .LZGEMM_L1x2_SUB1 -ZGEMM_L1x2_SUB0: +.LZGEMM_L1x2_SUB0: andi. L, K, 7 KERNEL1x2_SUBI1 addic. 
L, L, -1 - ble ZGEMM_L1x2_SAVE - b ZGEMM_L1x2_SUB2 + ble .LZGEMM_L1x2_SAVE + b .LZGEMM_L1x2_SUB2 -ZGEMM_L1x2_SUB1: +.LZGEMM_L1x2_SUB1: andi. L, K, 7 - ble ZGEMM_L1x2_SAVE + ble .LZGEMM_L1x2_SAVE -ZGEMM_L1x2_SUB2: +.LZGEMM_L1x2_SUB2: KERNEL1x2_SUB1 addic. L, L, -1 - bgt ZGEMM_L1x2_SUB2 + bgt .LZGEMM_L1x2_SUB2 -ZGEMM_L1x2_SAVE: +.LZGEMM_L1x2_SAVE: SAVE1x2 -ZGEMM_L1x2_END: +.LZGEMM_L1x2_END: -ZGEMM_L1x1_BEGIN: +.LZGEMM_L1x1_BEGIN: andi. T1, M, 1 - ble ZGEMM_L1x1_END + ble .LZGEMM_L1x1_END mr BO, B srawi. L, K, 3 - ble ZGEMM_L1x1_SUB0 + ble .LZGEMM_L1x1_SUB0 cmpwi cr0, L, 1 - ble ZGEMM_L1x1_SUB4 + ble .LZGEMM_L1x1_SUB4 -ZGEMM_L1x1_LOOP_START: +.LZGEMM_L1x1_LOOP_START: LOAD1x1_1 KERNEL1x1_I1 @@ -823,11 +823,11 @@ ZGEMM_L1x1_LOOP_START: KERNEL1x1_2 addic. L, L, -2 - ble ZGEMM_L1x1_LOOP_END + ble .LZGEMM_L1x1_LOOP_END .align 5 -ZGEMM_L1x1_LOOP: +.LZGEMM_L1x1_LOOP: KERNEL1x1_1 KERNEL1x1_2 @@ -840,9 +840,9 @@ ZGEMM_L1x1_LOOP: KERNEL1x1_2 addic. L, L, -1 - bgt ZGEMM_L1x1_LOOP + bgt .LZGEMM_L1x1_LOOP -ZGEMM_L1x1_LOOP_END: +.LZGEMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_2 @@ -854,9 +854,9 @@ ZGEMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_E2 - b ZGEMM_L1x1_SUB1 + b .LZGEMM_L1x1_SUB1 -ZGEMM_L1x1_SUB4: +.LZGEMM_L1x1_SUB4: KERNEL1x1_SUBI1 KERNEL1x1_SUB1 @@ -868,34 +868,34 @@ ZGEMM_L1x1_SUB4: KERNEL1x1_SUB1 KERNEL1x1_SUB1 - b ZGEMM_L1x1_SUB1 + b .LZGEMM_L1x1_SUB1 -ZGEMM_L1x1_SUB0: +.LZGEMM_L1x1_SUB0: andi. L, K, 7 KERNEL1x1_SUBI1 addic. L, L, -1 - ble ZGEMM_L1x1_SAVE - b ZGEMM_L1x1_SUB2 + ble .LZGEMM_L1x1_SAVE + b .LZGEMM_L1x1_SUB2 -ZGEMM_L1x1_SUB1: +.LZGEMM_L1x1_SUB1: andi. L, K, 7 - ble ZGEMM_L1x1_SAVE + ble .LZGEMM_L1x1_SAVE -ZGEMM_L1x1_SUB2: +.LZGEMM_L1x1_SUB2: KERNEL1x1_SUB1 addic. 
L, L, -1 - bgt ZGEMM_L1x1_SUB2 + bgt .LZGEMM_L1x1_SUB2 -ZGEMM_L1x1_SAVE: +.LZGEMM_L1x1_SAVE: SAVE1x1 -ZGEMM_L1x1_END: +.LZGEMM_L1x1_END: -ZGEMM_L1_END: +.LZGEMM_L1_END: diff --git a/kernel/power/zgemm_macros_8x2_power8.S b/kernel/power/zgemm_macros_8x2_power8.S index 3e5ea9ce8..701ec65c8 100644 --- a/kernel/power/zgemm_macros_8x2_power8.S +++ b/kernel/power/zgemm_macros_8x2_power8.S @@ -1,3 +1,39 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/05 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define XSFADD_R1 xsadddp diff --git a/kernel/power/ztrmm_kernel_8x2_power8.S b/kernel/power/ztrmm_kernel_8x2_power8.S index dbbc8f9ac..8b953765e 100644 --- a/kernel/power/ztrmm_kernel_8x2_power8.S +++ b/kernel/power/ztrmm_kernel_8x2_power8.S @@ -1,3 +1,38 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/05 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ @@ -239,11 +274,11 @@ #include "zgemm_macros_8x2_power8.S" cmpwi cr0, M, 0 - ble L999 + ble .L999 cmpwi cr0, N, 0 - ble L999 + ble .L999 cmpwi cr0, K, 0 - ble L999 + ble .L999 slwi LDC, LDC, ZBASE_SHIFT li PRE, 256 @@ -266,7 +301,7 @@ #include "ztrmm_logic_8x2_power8.S" -L999: +.L999: addi r3, 0, 0 lfd f14, 0(SP) diff --git a/kernel/power/ztrmm_logic_8x2_power8.S b/kernel/power/ztrmm_logic_8x2_power8.S index e250dfac5..f422b17b1 100644 --- a/kernel/power/ztrmm_logic_8x2_power8.S +++ b/kernel/power/ztrmm_logic_8x2_power8.S @@ -1,7 +1,43 @@ - srawi. J, N, 1 - ble ZTRMM_L2_END +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ -ZTRMM_L2_BEGIN: +/************************************************************************************** +* 2016/03/05 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + srawi. J, N, 1 + ble .LZTRMM_L2_END + +.LZTRMM_L2_BEGIN: mr CO, C mr AO, A @@ -13,9 +49,9 @@ ZTRMM_L2_BEGIN: #endif srawi. 
I, M, 3 - ble ZTRMM_L2x8_END + ble .LZTRMM_L2x8_END -ZTRMM_L2x8_BEGIN: +.LZTRMM_L2x8_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -42,11 +78,11 @@ ZTRMM_L2x8_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble ZTRMM_L2x8_SUB0 + ble .LZTRMM_L2x8_SUB0 cmpwi cr0, L, 1 - ble ZTRMM_L2x8_SUB4 + ble .LZTRMM_L2x8_SUB4 -ZTRMM_L2x8_LOOP_START: +.LZTRMM_L2x8_LOOP_START: dcbt AO, PRE LOAD2x8_1 @@ -69,11 +105,11 @@ ZTRMM_L2x8_LOOP_START: KERNEL2x8_2 addic. L, L, -2 - ble ZTRMM_L2x8_LOOP_END + ble .LZTRMM_L2x8_LOOP_END .align 5 -ZTRMM_L2x8_LOOP: +.LZTRMM_L2x8_LOOP: dcbt AO, PRE KERNEL2x8_1 @@ -94,9 +130,9 @@ ZTRMM_L2x8_LOOP: KERNEL2x8_2 addic. L, L, -1 - bgt ZTRMM_L2x8_LOOP + bgt .LZTRMM_L2x8_LOOP -ZTRMM_L2x8_LOOP_END: +.LZTRMM_L2x8_LOOP_END: dcbt AO, PRE KERNEL2x8_1 @@ -115,9 +151,9 @@ ZTRMM_L2x8_LOOP_END: KERNEL2x8_1 KERNEL2x8_E2 - b ZTRMM_L2x8_SUB1 + b .LZTRMM_L2x8_SUB1 -ZTRMM_L2x8_SUB4: +.LZTRMM_L2x8_SUB4: dcbt AO, PRE KERNEL2x8_SUBI1 @@ -133,31 +169,31 @@ ZTRMM_L2x8_SUB4: KERNEL2x8_SUB1 KERNEL2x8_SUB1 - b ZTRMM_L2x8_SUB1 + b .LZTRMM_L2x8_SUB1 -ZTRMM_L2x8_SUB0: +.LZTRMM_L2x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x8_SUBI1 addic. L, L, -1 - ble ZTRMM_L2x8_SAVE - b ZTRMM_L2x8_SUB2 + ble .LZTRMM_L2x8_SAVE + b .LZTRMM_L2x8_SUB2 -ZTRMM_L2x8_SUB1: +.LZTRMM_L2x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble ZTRMM_L2x8_SAVE + ble .LZTRMM_L2x8_SAVE -ZTRMM_L2x8_SUB2: +.LZTRMM_L2x8_SUB2: KERNEL2x8_SUB1 addic. L, L, -1 - bgt ZTRMM_L2x8_SUB2 + bgt .LZTRMM_L2x8_SUB2 -ZTRMM_L2x8_SAVE: +.LZTRMM_L2x8_SAVE: SAVE2x8 @@ -175,16 +211,16 @@ ZTRMM_L2x8_SAVE: addic. I, I, -1 - bgt ZTRMM_L2x8_BEGIN + bgt .LZTRMM_L2x8_BEGIN -ZTRMM_L2x8_END: +.LZTRMM_L2x8_END: -ZTRMM_L2x4_BEGIN: +.LZTRMM_L2x4_BEGIN: andi. T2, M, 7 - ble ZTRMM_L2x1_END + ble .LZTRMM_L2x1_END andi. 
T1, M, 4 - ble ZTRMM_L2x4_END + ble .LZTRMM_L2x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -210,11 +246,11 @@ ZTRMM_L2x4_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble ZTRMM_L2x4_SUB0 + ble .LZTRMM_L2x4_SUB0 cmpwi cr0, L, 1 - ble ZTRMM_L2x4_SUB4 + ble .LZTRMM_L2x4_SUB4 -ZTRMM_L2x4_LOOP_START: +.LZTRMM_L2x4_LOOP_START: LOAD2x4_1 KERNEL2x4_I1 @@ -228,11 +264,11 @@ ZTRMM_L2x4_LOOP_START: KERNEL2x4_2 addic. L, L, -2 - ble ZTRMM_L2x4_LOOP_END + ble .LZTRMM_L2x4_LOOP_END .align 5 -ZTRMM_L2x4_LOOP: +.LZTRMM_L2x4_LOOP: KERNEL2x4_1 KERNEL2x4_2 @@ -245,9 +281,9 @@ ZTRMM_L2x4_LOOP: KERNEL2x4_2 addic. L, L, -1 - bgt ZTRMM_L2x4_LOOP + bgt .LZTRMM_L2x4_LOOP -ZTRMM_L2x4_LOOP_END: +.LZTRMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_2 @@ -259,9 +295,9 @@ ZTRMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_E2 - b ZTRMM_L2x4_SUB1 + b .LZTRMM_L2x4_SUB1 -ZTRMM_L2x4_SUB4: +.LZTRMM_L2x4_SUB4: KERNEL2x4_SUBI1 KERNEL2x4_SUB1 @@ -273,31 +309,31 @@ ZTRMM_L2x4_SUB4: KERNEL2x4_SUB1 KERNEL2x4_SUB1 - b ZTRMM_L2x4_SUB1 + b .LZTRMM_L2x4_SUB1 -ZTRMM_L2x4_SUB0: +.LZTRMM_L2x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x4_SUBI1 addic. L, L, -1 - ble ZTRMM_L2x4_SAVE - b ZTRMM_L2x4_SUB2 + ble .LZTRMM_L2x4_SAVE + b .LZTRMM_L2x4_SUB2 -ZTRMM_L2x4_SUB1: +.LZTRMM_L2x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble ZTRMM_L2x4_SAVE + ble .LZTRMM_L2x4_SAVE -ZTRMM_L2x4_SUB2: +.LZTRMM_L2x4_SUB2: KERNEL2x4_SUB1 addic. L, L, -1 - bgt ZTRMM_L2x4_SUB2 + bgt .LZTRMM_L2x4_SUB2 -ZTRMM_L2x4_SAVE: +.LZTRMM_L2x4_SAVE: SAVE2x4 @@ -314,12 +350,12 @@ ZTRMM_L2x4_SAVE: #endif -ZTRMM_L2x4_END: +.LZTRMM_L2x4_END: -ZTRMM_L2x2_BEGIN: +.LZTRMM_L2x2_BEGIN: andi. T1, M, 2 - ble ZTRMM_L2x2_END + ble .LZTRMM_L2x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -345,11 +381,11 @@ ZTRMM_L2x2_BEGIN: mr KKK, T1 mr K1, T1 srawi. 
L, K1, 3 // KTEMP / 8 -> L - ble ZTRMM_L2x2_SUB0 + ble .LZTRMM_L2x2_SUB0 cmpwi cr0, L, 1 - ble ZTRMM_L2x2_SUB4 + ble .LZTRMM_L2x2_SUB4 -ZTRMM_L2x2_LOOP_START: +.LZTRMM_L2x2_LOOP_START: LOAD2x2_1 KERNEL2x2_I1 @@ -363,11 +399,11 @@ ZTRMM_L2x2_LOOP_START: KERNEL2x2_2 addic. L, L, -2 - ble ZTRMM_L2x2_LOOP_END + ble .LZTRMM_L2x2_LOOP_END .align 5 -ZTRMM_L2x2_LOOP: +.LZTRMM_L2x2_LOOP: KERNEL2x2_1 KERNEL2x2_2 @@ -380,9 +416,9 @@ ZTRMM_L2x2_LOOP: KERNEL2x2_2 addic. L, L, -1 - bgt ZTRMM_L2x2_LOOP + bgt .LZTRMM_L2x2_LOOP -ZTRMM_L2x2_LOOP_END: +.LZTRMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_2 @@ -394,9 +430,9 @@ ZTRMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_E2 - b ZTRMM_L2x2_SUB1 + b .LZTRMM_L2x2_SUB1 -ZTRMM_L2x2_SUB4: +.LZTRMM_L2x2_SUB4: KERNEL2x2_SUBI1 KERNEL2x2_SUB1 @@ -408,31 +444,31 @@ ZTRMM_L2x2_SUB4: KERNEL2x2_SUB1 KERNEL2x2_SUB1 - b ZTRMM_L2x2_SUB1 + b .LZTRMM_L2x2_SUB1 -ZTRMM_L2x2_SUB0: +.LZTRMM_L2x2_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x2_SUBI1 addic. L, L, -1 - ble ZTRMM_L2x2_SAVE - b ZTRMM_L2x2_SUB2 + ble .LZTRMM_L2x2_SAVE + b .LZTRMM_L2x2_SUB2 -ZTRMM_L2x2_SUB1: +.LZTRMM_L2x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble ZTRMM_L2x2_SAVE + ble .LZTRMM_L2x2_SAVE -ZTRMM_L2x2_SUB2: +.LZTRMM_L2x2_SUB2: KERNEL2x2_SUB1 addic. L, L, -1 - bgt ZTRMM_L2x2_SUB2 + bgt .LZTRMM_L2x2_SUB2 -ZTRMM_L2x2_SAVE: +.LZTRMM_L2x2_SAVE: SAVE2x2 @@ -449,12 +485,12 @@ ZTRMM_L2x2_SAVE: #endif -ZTRMM_L2x2_END: +.LZTRMM_L2x2_END: -ZTRMM_L2x1_BEGIN: +.LZTRMM_L2x1_BEGIN: andi. T1, M, 1 - ble ZTRMM_L2x1_END + ble .LZTRMM_L2x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -480,11 +516,11 @@ ZTRMM_L2x1_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble ZTRMM_L2x1_SUB0 + ble .LZTRMM_L2x1_SUB0 cmpwi cr0, L, 1 - ble ZTRMM_L2x1_SUB4 + ble .LZTRMM_L2x1_SUB4 -ZTRMM_L2x1_LOOP_START: +.LZTRMM_L2x1_LOOP_START: LOAD2x1_1 KERNEL2x1_I1 @@ -498,11 +534,11 @@ ZTRMM_L2x1_LOOP_START: KERNEL2x1_2 addic. 
L, L, -2 - ble ZTRMM_L2x1_LOOP_END + ble .LZTRMM_L2x1_LOOP_END .align 5 -ZTRMM_L2x1_LOOP: +.LZTRMM_L2x1_LOOP: KERNEL2x1_1 KERNEL2x1_2 @@ -515,9 +551,9 @@ ZTRMM_L2x1_LOOP: KERNEL2x1_2 addic. L, L, -1 - bgt ZTRMM_L2x1_LOOP + bgt .LZTRMM_L2x1_LOOP -ZTRMM_L2x1_LOOP_END: +.LZTRMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_2 @@ -529,9 +565,9 @@ ZTRMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_E2 - b ZTRMM_L2x1_SUB1 + b .LZTRMM_L2x1_SUB1 -ZTRMM_L2x1_SUB4: +.LZTRMM_L2x1_SUB4: KERNEL2x1_SUBI1 KERNEL2x1_SUB1 @@ -543,31 +579,31 @@ ZTRMM_L2x1_SUB4: KERNEL2x1_SUB1 KERNEL2x1_SUB1 - b ZTRMM_L2x1_SUB1 + b .LZTRMM_L2x1_SUB1 -ZTRMM_L2x1_SUB0: +.LZTRMM_L2x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x1_SUBI1 addic. L, L, -1 - ble ZTRMM_L2x1_SAVE - b ZTRMM_L2x1_SUB2 + ble .LZTRMM_L2x1_SAVE + b .LZTRMM_L2x1_SUB2 -ZTRMM_L2x1_SUB1: +.LZTRMM_L2x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble ZTRMM_L2x1_SAVE + ble .LZTRMM_L2x1_SAVE -ZTRMM_L2x1_SUB2: +.LZTRMM_L2x1_SUB2: KERNEL2x1_SUB1 addic. L, L, -1 - bgt ZTRMM_L2x1_SUB2 + bgt .LZTRMM_L2x1_SUB2 -ZTRMM_L2x1_SAVE: +.LZTRMM_L2x1_SAVE: SAVE2x1 @@ -584,7 +620,7 @@ ZTRMM_L2x1_SAVE: #endif -ZTRMM_L2x1_END: +.LZTRMM_L2x1_END: slwi T1, K, 5 add B, B, T1 @@ -595,23 +631,23 @@ ZTRMM_L2x1_END: addic. J, J, -1 - bgt ZTRMM_L2_BEGIN + bgt .LZTRMM_L2_BEGIN andi. T2, N, 1 - ble L999 + ble .L999 -ZTRMM_L2_END: +.LZTRMM_L2_END: - b ZTRMM_L1_BEGIN + b .LZTRMM_L1_BEGIN -L999_H1: +.L999_H1: - b L999 + b .L999 -ZTRMM_L1_BEGIN: +.LZTRMM_L1_BEGIN: andi. T1, N, 1 - ble ZTRMM_L1_END + ble .LZTRMM_L1_END mr CO, C mr AO, A @@ -620,9 +656,9 @@ ZTRMM_L1_BEGIN: #endif srawi. I, M, 3 - ble ZTRMM_L1x8_END + ble .LZTRMM_L1x8_END -ZTRMM_L1x8_BEGIN: +.LZTRMM_L1x8_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -649,11 +685,11 @@ ZTRMM_L1x8_BEGIN: mr KKK, T1 mr K1, T1 srawi. 
L, K1, 3 // KTEMP / 8 -> L - ble ZTRMM_L1x8_SUB0 + ble .LZTRMM_L1x8_SUB0 cmpwi cr0, L, 1 - ble ZTRMM_L1x8_SUB4 + ble .LZTRMM_L1x8_SUB4 -ZTRMM_L1x8_LOOP_START: +.LZTRMM_L1x8_LOOP_START: dcbt AO, PRE LOAD1x8_1 @@ -676,11 +712,11 @@ ZTRMM_L1x8_LOOP_START: KERNEL1x8_2 addic. L, L, -2 - ble ZTRMM_L1x8_LOOP_END + ble .LZTRMM_L1x8_LOOP_END .align 5 -ZTRMM_L1x8_LOOP: +.LZTRMM_L1x8_LOOP: dcbt AO, PRE KERNEL1x8_1 @@ -701,9 +737,9 @@ ZTRMM_L1x8_LOOP: KERNEL1x8_2 addic. L, L, -1 - bgt ZTRMM_L1x8_LOOP + bgt .LZTRMM_L1x8_LOOP -ZTRMM_L1x8_LOOP_END: +.LZTRMM_L1x8_LOOP_END: dcbt AO, PRE KERNEL1x8_1 @@ -722,9 +758,9 @@ ZTRMM_L1x8_LOOP_END: KERNEL1x8_1 KERNEL1x8_E2 - b ZTRMM_L1x8_SUB1 + b .LZTRMM_L1x8_SUB1 -ZTRMM_L1x8_SUB4: +.LZTRMM_L1x8_SUB4: dcbt AO, PRE KERNEL1x8_SUBI1 @@ -740,31 +776,31 @@ ZTRMM_L1x8_SUB4: KERNEL1x8_SUB1 KERNEL1x8_SUB1 - b ZTRMM_L1x8_SUB1 + b .LZTRMM_L1x8_SUB1 -ZTRMM_L1x8_SUB0: +.LZTRMM_L1x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x8_SUBI1 addic. L, L, -1 - ble ZTRMM_L1x8_SAVE - b ZTRMM_L1x8_SUB2 + ble .LZTRMM_L1x8_SAVE + b .LZTRMM_L1x8_SUB2 -ZTRMM_L1x8_SUB1: +.LZTRMM_L1x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble ZTRMM_L1x8_SAVE + ble .LZTRMM_L1x8_SAVE -ZTRMM_L1x8_SUB2: +.LZTRMM_L1x8_SUB2: KERNEL1x8_SUB1 addic. L, L, -1 - bgt ZTRMM_L1x8_SUB2 + bgt .LZTRMM_L1x8_SUB2 -ZTRMM_L1x8_SAVE: +.LZTRMM_L1x8_SAVE: SAVE1x8 @@ -782,16 +818,16 @@ ZTRMM_L1x8_SAVE: addic. I, I, -1 - bgt ZTRMM_L1x8_BEGIN + bgt .LZTRMM_L1x8_BEGIN -ZTRMM_L1x8_END: +.LZTRMM_L1x8_END: -ZTRMM_L1x4_BEGIN: +.LZTRMM_L1x4_BEGIN: andi. T2, M, 7 - ble ZTRMM_L1x1_END + ble .LZTRMM_L1x1_END andi. T1, M, 4 - ble ZTRMM_L1x4_END + ble .LZTRMM_L1x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -817,11 +853,11 @@ ZTRMM_L1x4_BEGIN: mr KKK, T1 mr K1, T1 srawi. 
L, K1, 3 // KTEMP / 8 -> L - ble ZTRMM_L1x4_SUB0 + ble .LZTRMM_L1x4_SUB0 cmpwi cr0, L, 1 - ble ZTRMM_L1x4_SUB4 + ble .LZTRMM_L1x4_SUB4 -ZTRMM_L1x4_LOOP_START: +.LZTRMM_L1x4_LOOP_START: LOAD1x4_1 KERNEL1x4_I1 @@ -835,11 +871,11 @@ ZTRMM_L1x4_LOOP_START: KERNEL1x4_2 addic. L, L, -2 - ble ZTRMM_L1x4_LOOP_END + ble .LZTRMM_L1x4_LOOP_END .align 5 -ZTRMM_L1x4_LOOP: +.LZTRMM_L1x4_LOOP: KERNEL1x4_1 KERNEL1x4_2 @@ -852,9 +888,9 @@ ZTRMM_L1x4_LOOP: KERNEL1x4_2 addic. L, L, -1 - bgt ZTRMM_L1x4_LOOP + bgt .LZTRMM_L1x4_LOOP -ZTRMM_L1x4_LOOP_END: +.LZTRMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_2 @@ -866,9 +902,9 @@ ZTRMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_E2 - b ZTRMM_L1x4_SUB1 + b .LZTRMM_L1x4_SUB1 -ZTRMM_L1x4_SUB4: +.LZTRMM_L1x4_SUB4: KERNEL1x4_SUBI1 KERNEL1x4_SUB1 @@ -880,31 +916,31 @@ ZTRMM_L1x4_SUB4: KERNEL1x4_SUB1 KERNEL1x4_SUB1 - b ZTRMM_L1x4_SUB1 + b .LZTRMM_L1x4_SUB1 -ZTRMM_L1x4_SUB0: +.LZTRMM_L1x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x4_SUBI1 addic. L, L, -1 - ble ZTRMM_L1x4_SAVE - b ZTRMM_L1x4_SUB2 + ble .LZTRMM_L1x4_SAVE + b .LZTRMM_L1x4_SUB2 -ZTRMM_L1x4_SUB1: +.LZTRMM_L1x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble ZTRMM_L1x4_SAVE + ble .LZTRMM_L1x4_SAVE -ZTRMM_L1x4_SUB2: +.LZTRMM_L1x4_SUB2: KERNEL1x4_SUB1 addic. L, L, -1 - bgt ZTRMM_L1x4_SUB2 + bgt .LZTRMM_L1x4_SUB2 -ZTRMM_L1x4_SAVE: +.LZTRMM_L1x4_SAVE: SAVE1x4 @@ -921,12 +957,12 @@ ZTRMM_L1x4_SAVE: #endif -ZTRMM_L1x4_END: +.LZTRMM_L1x4_END: -ZTRMM_L1x2_BEGIN: +.LZTRMM_L1x2_BEGIN: andi. T1, M, 2 - ble ZTRMM_L1x2_END + ble .LZTRMM_L1x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -952,11 +988,11 @@ ZTRMM_L1x2_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble ZTRMM_L1x2_SUB0 + ble .LZTRMM_L1x2_SUB0 cmpwi cr0, L, 1 - ble ZTRMM_L1x2_SUB4 + ble .LZTRMM_L1x2_SUB4 -ZTRMM_L1x2_LOOP_START: +.LZTRMM_L1x2_LOOP_START: LOAD1x2_1 KERNEL1x2_I1 @@ -970,11 +1006,11 @@ ZTRMM_L1x2_LOOP_START: KERNEL1x2_2 addic. 
L, L, -2 - ble ZTRMM_L1x2_LOOP_END + ble .LZTRMM_L1x2_LOOP_END .align 5 -ZTRMM_L1x2_LOOP: +.LZTRMM_L1x2_LOOP: KERNEL1x2_1 KERNEL1x2_2 @@ -987,9 +1023,9 @@ ZTRMM_L1x2_LOOP: KERNEL1x2_2 addic. L, L, -1 - bgt ZTRMM_L1x2_LOOP + bgt .LZTRMM_L1x2_LOOP -ZTRMM_L1x2_LOOP_END: +.LZTRMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_2 @@ -1001,9 +1037,9 @@ ZTRMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_E2 - b ZTRMM_L1x2_SUB1 + b .LZTRMM_L1x2_SUB1 -ZTRMM_L1x2_SUB4: +.LZTRMM_L1x2_SUB4: KERNEL1x2_SUBI1 KERNEL1x2_SUB1 @@ -1015,31 +1051,31 @@ ZTRMM_L1x2_SUB4: KERNEL1x2_SUB1 KERNEL1x2_SUB1 - b ZTRMM_L1x2_SUB1 + b .LZTRMM_L1x2_SUB1 -ZTRMM_L1x2_SUB0: +.LZTRMM_L1x2_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x2_SUBI1 addic. L, L, -1 - ble ZTRMM_L1x2_SAVE - b ZTRMM_L1x2_SUB2 + ble .LZTRMM_L1x2_SAVE + b .LZTRMM_L1x2_SUB2 -ZTRMM_L1x2_SUB1: +.LZTRMM_L1x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble ZTRMM_L1x2_SAVE + ble .LZTRMM_L1x2_SAVE -ZTRMM_L1x2_SUB2: +.LZTRMM_L1x2_SUB2: KERNEL1x2_SUB1 addic. L, L, -1 - bgt ZTRMM_L1x2_SUB2 + bgt .LZTRMM_L1x2_SUB2 -ZTRMM_L1x2_SAVE: +.LZTRMM_L1x2_SAVE: SAVE1x2 @@ -1056,12 +1092,12 @@ ZTRMM_L1x2_SAVE: #endif -ZTRMM_L1x2_END: +.LZTRMM_L1x2_END: -ZTRMM_L1x1_BEGIN: +.LZTRMM_L1x1_BEGIN: andi. T1, M, 1 - ble ZTRMM_L1x1_END + ble .LZTRMM_L1x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -1087,11 +1123,11 @@ ZTRMM_L1x1_BEGIN: mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble ZTRMM_L1x1_SUB0 + ble .LZTRMM_L1x1_SUB0 cmpwi cr0, L, 1 - ble ZTRMM_L1x1_SUB4 + ble .LZTRMM_L1x1_SUB4 -ZTRMM_L1x1_LOOP_START: +.LZTRMM_L1x1_LOOP_START: LOAD1x1_1 KERNEL1x1_I1 @@ -1105,11 +1141,11 @@ ZTRMM_L1x1_LOOP_START: KERNEL1x1_2 addic. L, L, -2 - ble ZTRMM_L1x1_LOOP_END + ble .LZTRMM_L1x1_LOOP_END .align 5 -ZTRMM_L1x1_LOOP: +.LZTRMM_L1x1_LOOP: KERNEL1x1_1 KERNEL1x1_2 @@ -1122,9 +1158,9 @@ ZTRMM_L1x1_LOOP: KERNEL1x1_2 addic. 
L, L, -1 - bgt ZTRMM_L1x1_LOOP + bgt .LZTRMM_L1x1_LOOP -ZTRMM_L1x1_LOOP_END: +.LZTRMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_2 @@ -1136,9 +1172,9 @@ ZTRMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_E2 - b ZTRMM_L1x1_SUB1 + b .LZTRMM_L1x1_SUB1 -ZTRMM_L1x1_SUB4: +.LZTRMM_L1x1_SUB4: KERNEL1x1_SUBI1 KERNEL1x1_SUB1 @@ -1150,31 +1186,31 @@ ZTRMM_L1x1_SUB4: KERNEL1x1_SUB1 KERNEL1x1_SUB1 - b ZTRMM_L1x1_SUB1 + b .LZTRMM_L1x1_SUB1 -ZTRMM_L1x1_SUB0: +.LZTRMM_L1x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x1_SUBI1 addic. L, L, -1 - ble ZTRMM_L1x1_SAVE - b ZTRMM_L1x1_SUB2 + ble .LZTRMM_L1x1_SAVE + b .LZTRMM_L1x1_SUB2 -ZTRMM_L1x1_SUB1: +.LZTRMM_L1x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble ZTRMM_L1x1_SAVE + ble .LZTRMM_L1x1_SAVE -ZTRMM_L1x1_SUB2: +.LZTRMM_L1x1_SUB2: KERNEL1x1_SUB1 addic. L, L, -1 - bgt ZTRMM_L1x1_SUB2 + bgt .LZTRMM_L1x1_SUB2 -ZTRMM_L1x1_SAVE: +.LZTRMM_L1x1_SAVE: SAVE1x1 @@ -1191,11 +1227,11 @@ ZTRMM_L1x1_SAVE: #endif -ZTRMM_L1x1_END: +.LZTRMM_L1x1_END: #if !defined(LEFT) addi KK, KK, 1 // KK += Number of values in B #endif -ZTRMM_L1_END: +.LZTRMM_L1_END: