Modified assembly label names (now prefixed with `.L`) so that they are hidden.

Added license information.
This commit is contained in:
Werner Saar 2016-03-05 10:27:27 +01:00
parent 0afc76fd65
commit 085f215257
10 changed files with 1375 additions and 1055 deletions

View File

@ -1,3 +1,38 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */ /* All rights reserved. */
@ -218,11 +253,11 @@
cmpwi cr0, M, 0 cmpwi cr0, M, 0
ble L999_H1 ble .L999_H1
cmpwi cr0, N, 0 cmpwi cr0, N, 0
ble L999_H1 ble .L999_H1
cmpwi cr0, K, 0 cmpwi cr0, K, 0
ble L999_H1 ble .L999_H1
#ifdef __64BIT__ #ifdef __64BIT__
addi ALPHA, SP, 296 addi ALPHA, SP, 296
@ -241,7 +276,7 @@
#include "dgemm_logic_16x4_power8.S" #include "dgemm_logic_16x4_power8.S"
L999: .L999:
addi r3, 0, 0 addi r3, 0, 0
lfd f14, 0(SP) lfd f14, 0(SP)

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,38 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/********************************************************************* /*********************************************************************
* Macros for N=4, M=16 * * Macros for N=4, M=16 *
*********************************************************************/ *********************************************************************/

View File

@ -1,3 +1,38 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */ /* All rights reserved. */
@ -228,11 +263,11 @@
#endif #endif
cmpwi cr0, M, 0 cmpwi cr0, M, 0
ble L999_H1 ble .L999_H1
cmpwi cr0, N, 0 cmpwi cr0, N, 0
ble L999_H1 ble .L999_H1
cmpwi cr0, K, 0 cmpwi cr0, K, 0
ble L999_H1 ble .L999_H1
#ifdef __64BIT__ #ifdef __64BIT__
addi ALPHA, SP, 296 addi ALPHA, SP, 296
@ -251,7 +286,7 @@
#include "dtrmm_logic_16x4_power8.S" #include "dtrmm_logic_16x4_power8.S"
L999: .L999:
addi r3, 0, 0 addi r3, 0, 0
lfd f14, 0(SP) lfd f14, 0(SP)

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,38 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */ /* All rights reserved. */
@ -233,11 +268,11 @@
#include "zgemm_macros_8x2_power8.S" #include "zgemm_macros_8x2_power8.S"
cmpwi cr0, M, 0 cmpwi cr0, M, 0
ble L999 ble .L999
cmpwi cr0, N, 0 cmpwi cr0, N, 0
ble L999 ble .L999
cmpwi cr0, K, 0 cmpwi cr0, K, 0
ble L999 ble .L999
slwi LDC, LDC, ZBASE_SHIFT slwi LDC, LDC, ZBASE_SHIFT
li PRE, 256 li PRE, 256
@ -260,7 +295,7 @@
#include "zgemm_logic_8x2_power8.S" #include "zgemm_logic_8x2_power8.S"
L999: .L999:
addi r3, 0, 0 addi r3, 0, 0
lfd f14, 0(SP) lfd f14, 0(SP)

View File

@ -1,25 +1,25 @@
srawi. J, N, 1 srawi. J, N, 1
ble ZGEMM_L2_END ble .LZGEMM_L2_END
ZGEMM_L2_BEGIN: .LZGEMM_L2_BEGIN:
mr CO, C mr CO, C
mr AO, A mr AO, A
slwi T1, LDC , 1 slwi T1, LDC , 1
add C, C, T1 add C, C, T1
srawi. I, M, 3 srawi. I, M, 3
ble ZGEMM_L2x8_END ble .LZGEMM_L2x8_END
ZGEMM_L2x8_BEGIN: .LZGEMM_L2x8_BEGIN:
mr BO, B mr BO, B
srawi. L, K, 3 srawi. L, K, 3
ble ZGEMM_L2x8_SUB0 ble .LZGEMM_L2x8_SUB0
cmpwi cr0, L, 1 cmpwi cr0, L, 1
ble ZGEMM_L2x8_SUB4 ble .LZGEMM_L2x8_SUB4
ZGEMM_L2x8_LOOP_START: .LZGEMM_L2x8_LOOP_START:
dcbt AO, PRE dcbt AO, PRE
LOAD2x8_1 LOAD2x8_1
@ -42,11 +42,11 @@ ZGEMM_L2x8_LOOP_START:
KERNEL2x8_2 KERNEL2x8_2
addic. L, L, -2 addic. L, L, -2
ble ZGEMM_L2x8_LOOP_END ble .LZGEMM_L2x8_LOOP_END
.align 5 .align 5
ZGEMM_L2x8_LOOP: .LZGEMM_L2x8_LOOP:
dcbt AO, PRE dcbt AO, PRE
KERNEL2x8_1 KERNEL2x8_1
@ -67,9 +67,9 @@ ZGEMM_L2x8_LOOP:
KERNEL2x8_2 KERNEL2x8_2
addic. L, L, -1 addic. L, L, -1
bgt ZGEMM_L2x8_LOOP bgt .LZGEMM_L2x8_LOOP
ZGEMM_L2x8_LOOP_END: .LZGEMM_L2x8_LOOP_END:
dcbt AO, PRE dcbt AO, PRE
KERNEL2x8_1 KERNEL2x8_1
@ -88,9 +88,9 @@ ZGEMM_L2x8_LOOP_END:
KERNEL2x8_1 KERNEL2x8_1
KERNEL2x8_E2 KERNEL2x8_E2
b ZGEMM_L2x8_SUB1 b .LZGEMM_L2x8_SUB1
ZGEMM_L2x8_SUB4: .LZGEMM_L2x8_SUB4:
dcbt AO, PRE dcbt AO, PRE
KERNEL2x8_SUBI1 KERNEL2x8_SUBI1
@ -106,53 +106,53 @@ ZGEMM_L2x8_SUB4:
KERNEL2x8_SUB1 KERNEL2x8_SUB1
KERNEL2x8_SUB1 KERNEL2x8_SUB1
b ZGEMM_L2x8_SUB1 b .LZGEMM_L2x8_SUB1
ZGEMM_L2x8_SUB0: .LZGEMM_L2x8_SUB0:
andi. L, K, 7 andi. L, K, 7
KERNEL2x8_SUBI1 KERNEL2x8_SUBI1
addic. L, L, -1 addic. L, L, -1
ble ZGEMM_L2x8_SAVE ble .LZGEMM_L2x8_SAVE
b ZGEMM_L2x8_SUB2 b .LZGEMM_L2x8_SUB2
ZGEMM_L2x8_SUB1: .LZGEMM_L2x8_SUB1:
andi. L, K, 7 andi. L, K, 7
ble ZGEMM_L2x8_SAVE ble .LZGEMM_L2x8_SAVE
ZGEMM_L2x8_SUB2: .LZGEMM_L2x8_SUB2:
KERNEL2x8_SUB1 KERNEL2x8_SUB1
addic. L, L, -1 addic. L, L, -1
bgt ZGEMM_L2x8_SUB2 bgt .LZGEMM_L2x8_SUB2
ZGEMM_L2x8_SAVE: .LZGEMM_L2x8_SAVE:
SAVE2x8 SAVE2x8
addic. I, I, -1 addic. I, I, -1
bgt ZGEMM_L2x8_BEGIN bgt .LZGEMM_L2x8_BEGIN
ZGEMM_L2x8_END: .LZGEMM_L2x8_END:
ZGEMM_L2x4_BEGIN: .LZGEMM_L2x4_BEGIN:
andi. T2, M, 7 andi. T2, M, 7
ble ZGEMM_L2x1_END ble .LZGEMM_L2x1_END
andi. T1, M, 4 andi. T1, M, 4
ble ZGEMM_L2x4_END ble .LZGEMM_L2x4_END
mr BO, B mr BO, B
srawi. L, K, 3 srawi. L, K, 3
ble ZGEMM_L2x4_SUB0 ble .LZGEMM_L2x4_SUB0
cmpwi cr0, L, 1 cmpwi cr0, L, 1
ble ZGEMM_L2x4_SUB4 ble .LZGEMM_L2x4_SUB4
ZGEMM_L2x4_LOOP_START: .LZGEMM_L2x4_LOOP_START:
LOAD2x4_1 LOAD2x4_1
KERNEL2x4_I1 KERNEL2x4_I1
@ -166,11 +166,11 @@ ZGEMM_L2x4_LOOP_START:
KERNEL2x4_2 KERNEL2x4_2
addic. L, L, -2 addic. L, L, -2
ble ZGEMM_L2x4_LOOP_END ble .LZGEMM_L2x4_LOOP_END
.align 5 .align 5
ZGEMM_L2x4_LOOP: .LZGEMM_L2x4_LOOP:
KERNEL2x4_1 KERNEL2x4_1
KERNEL2x4_2 KERNEL2x4_2
@ -183,9 +183,9 @@ ZGEMM_L2x4_LOOP:
KERNEL2x4_2 KERNEL2x4_2
addic. L, L, -1 addic. L, L, -1
bgt ZGEMM_L2x4_LOOP bgt .LZGEMM_L2x4_LOOP
ZGEMM_L2x4_LOOP_END: .LZGEMM_L2x4_LOOP_END:
KERNEL2x4_1 KERNEL2x4_1
KERNEL2x4_2 KERNEL2x4_2
@ -197,9 +197,9 @@ ZGEMM_L2x4_LOOP_END:
KERNEL2x4_1 KERNEL2x4_1
KERNEL2x4_E2 KERNEL2x4_E2
b ZGEMM_L2x4_SUB1 b .LZGEMM_L2x4_SUB1
ZGEMM_L2x4_SUB4: .LZGEMM_L2x4_SUB4:
KERNEL2x4_SUBI1 KERNEL2x4_SUBI1
KERNEL2x4_SUB1 KERNEL2x4_SUB1
@ -211,48 +211,48 @@ ZGEMM_L2x4_SUB4:
KERNEL2x4_SUB1 KERNEL2x4_SUB1
KERNEL2x4_SUB1 KERNEL2x4_SUB1
b ZGEMM_L2x4_SUB1 b .LZGEMM_L2x4_SUB1
ZGEMM_L2x4_SUB0: .LZGEMM_L2x4_SUB0:
andi. L, K, 7 andi. L, K, 7
KERNEL2x4_SUBI1 KERNEL2x4_SUBI1
addic. L, L, -1 addic. L, L, -1
ble ZGEMM_L2x4_SAVE ble .LZGEMM_L2x4_SAVE
b ZGEMM_L2x4_SUB2 b .LZGEMM_L2x4_SUB2
ZGEMM_L2x4_SUB1: .LZGEMM_L2x4_SUB1:
andi. L, K, 7 andi. L, K, 7
ble ZGEMM_L2x4_SAVE ble .LZGEMM_L2x4_SAVE
ZGEMM_L2x4_SUB2: .LZGEMM_L2x4_SUB2:
KERNEL2x4_SUB1 KERNEL2x4_SUB1
addic. L, L, -1 addic. L, L, -1
bgt ZGEMM_L2x4_SUB2 bgt .LZGEMM_L2x4_SUB2
ZGEMM_L2x4_SAVE: .LZGEMM_L2x4_SAVE:
SAVE2x4 SAVE2x4
ZGEMM_L2x4_END: .LZGEMM_L2x4_END:
ZGEMM_L2x2_BEGIN: .LZGEMM_L2x2_BEGIN:
andi. T1, M, 2 andi. T1, M, 2
ble ZGEMM_L2x2_END ble .LZGEMM_L2x2_END
mr BO, B mr BO, B
srawi. L, K, 3 srawi. L, K, 3
ble ZGEMM_L2x2_SUB0 ble .LZGEMM_L2x2_SUB0
cmpwi cr0, L, 1 cmpwi cr0, L, 1
ble ZGEMM_L2x2_SUB4 ble .LZGEMM_L2x2_SUB4
ZGEMM_L2x2_LOOP_START: .LZGEMM_L2x2_LOOP_START:
LOAD2x2_1 LOAD2x2_1
KERNEL2x2_I1 KERNEL2x2_I1
@ -266,11 +266,11 @@ ZGEMM_L2x2_LOOP_START:
KERNEL2x2_2 KERNEL2x2_2
addic. L, L, -2 addic. L, L, -2
ble ZGEMM_L2x2_LOOP_END ble .LZGEMM_L2x2_LOOP_END
.align 5 .align 5
ZGEMM_L2x2_LOOP: .LZGEMM_L2x2_LOOP:
KERNEL2x2_1 KERNEL2x2_1
KERNEL2x2_2 KERNEL2x2_2
@ -283,9 +283,9 @@ ZGEMM_L2x2_LOOP:
KERNEL2x2_2 KERNEL2x2_2
addic. L, L, -1 addic. L, L, -1
bgt ZGEMM_L2x2_LOOP bgt .LZGEMM_L2x2_LOOP
ZGEMM_L2x2_LOOP_END: .LZGEMM_L2x2_LOOP_END:
KERNEL2x2_1 KERNEL2x2_1
KERNEL2x2_2 KERNEL2x2_2
@ -297,9 +297,9 @@ ZGEMM_L2x2_LOOP_END:
KERNEL2x2_1 KERNEL2x2_1
KERNEL2x2_E2 KERNEL2x2_E2
b ZGEMM_L2x2_SUB1 b .LZGEMM_L2x2_SUB1
ZGEMM_L2x2_SUB4: .LZGEMM_L2x2_SUB4:
KERNEL2x2_SUBI1 KERNEL2x2_SUBI1
KERNEL2x2_SUB1 KERNEL2x2_SUB1
@ -311,48 +311,48 @@ ZGEMM_L2x2_SUB4:
KERNEL2x2_SUB1 KERNEL2x2_SUB1
KERNEL2x2_SUB1 KERNEL2x2_SUB1
b ZGEMM_L2x2_SUB1 b .LZGEMM_L2x2_SUB1
ZGEMM_L2x2_SUB0: .LZGEMM_L2x2_SUB0:
andi. L, K, 7 andi. L, K, 7
KERNEL2x2_SUBI1 KERNEL2x2_SUBI1
addic. L, L, -1 addic. L, L, -1
ble ZGEMM_L2x2_SAVE ble .LZGEMM_L2x2_SAVE
b ZGEMM_L2x2_SUB2 b .LZGEMM_L2x2_SUB2
ZGEMM_L2x2_SUB1: .LZGEMM_L2x2_SUB1:
andi. L, K, 7 andi. L, K, 7
ble ZGEMM_L2x2_SAVE ble .LZGEMM_L2x2_SAVE
ZGEMM_L2x2_SUB2: .LZGEMM_L2x2_SUB2:
KERNEL2x2_SUB1 KERNEL2x2_SUB1
addic. L, L, -1 addic. L, L, -1
bgt ZGEMM_L2x2_SUB2 bgt .LZGEMM_L2x2_SUB2
ZGEMM_L2x2_SAVE: .LZGEMM_L2x2_SAVE:
SAVE2x2 SAVE2x2
ZGEMM_L2x2_END: .LZGEMM_L2x2_END:
ZGEMM_L2x1_BEGIN: .LZGEMM_L2x1_BEGIN:
andi. T1, M, 1 andi. T1, M, 1
ble ZGEMM_L2x1_END ble .LZGEMM_L2x1_END
mr BO, B mr BO, B
srawi. L, K, 3 srawi. L, K, 3
ble ZGEMM_L2x1_SUB0 ble .LZGEMM_L2x1_SUB0
cmpwi cr0, L, 1 cmpwi cr0, L, 1
ble ZGEMM_L2x1_SUB4 ble .LZGEMM_L2x1_SUB4
ZGEMM_L2x1_LOOP_START: .LZGEMM_L2x1_LOOP_START:
LOAD2x1_1 LOAD2x1_1
KERNEL2x1_I1 KERNEL2x1_I1
@ -366,11 +366,11 @@ ZGEMM_L2x1_LOOP_START:
KERNEL2x1_2 KERNEL2x1_2
addic. L, L, -2 addic. L, L, -2
ble ZGEMM_L2x1_LOOP_END ble .LZGEMM_L2x1_LOOP_END
.align 5 .align 5
ZGEMM_L2x1_LOOP: .LZGEMM_L2x1_LOOP:
KERNEL2x1_1 KERNEL2x1_1
KERNEL2x1_2 KERNEL2x1_2
@ -383,9 +383,9 @@ ZGEMM_L2x1_LOOP:
KERNEL2x1_2 KERNEL2x1_2
addic. L, L, -1 addic. L, L, -1
bgt ZGEMM_L2x1_LOOP bgt .LZGEMM_L2x1_LOOP
ZGEMM_L2x1_LOOP_END: .LZGEMM_L2x1_LOOP_END:
KERNEL2x1_1 KERNEL2x1_1
KERNEL2x1_2 KERNEL2x1_2
@ -397,9 +397,9 @@ ZGEMM_L2x1_LOOP_END:
KERNEL2x1_1 KERNEL2x1_1
KERNEL2x1_E2 KERNEL2x1_E2
b ZGEMM_L2x1_SUB1 b .LZGEMM_L2x1_SUB1
ZGEMM_L2x1_SUB4: .LZGEMM_L2x1_SUB4:
KERNEL2x1_SUBI1 KERNEL2x1_SUBI1
KERNEL2x1_SUB1 KERNEL2x1_SUB1
@ -411,72 +411,72 @@ ZGEMM_L2x1_SUB4:
KERNEL2x1_SUB1 KERNEL2x1_SUB1
KERNEL2x1_SUB1 KERNEL2x1_SUB1
b ZGEMM_L2x1_SUB1 b .LZGEMM_L2x1_SUB1
ZGEMM_L2x1_SUB0: .LZGEMM_L2x1_SUB0:
andi. L, K, 7 andi. L, K, 7
KERNEL2x1_SUBI1 KERNEL2x1_SUBI1
addic. L, L, -1 addic. L, L, -1
ble ZGEMM_L2x1_SAVE ble .LZGEMM_L2x1_SAVE
b ZGEMM_L2x1_SUB2 b .LZGEMM_L2x1_SUB2
ZGEMM_L2x1_SUB1: .LZGEMM_L2x1_SUB1:
andi. L, K, 7 andi. L, K, 7
ble ZGEMM_L2x1_SAVE ble .LZGEMM_L2x1_SAVE
ZGEMM_L2x1_SUB2: .LZGEMM_L2x1_SUB2:
KERNEL2x1_SUB1 KERNEL2x1_SUB1
addic. L, L, -1 addic. L, L, -1
bgt ZGEMM_L2x1_SUB2 bgt .LZGEMM_L2x1_SUB2
ZGEMM_L2x1_SAVE: .LZGEMM_L2x1_SAVE:
SAVE2x1 SAVE2x1
ZGEMM_L2x1_END: .LZGEMM_L2x1_END:
slwi T1, K, 5 slwi T1, K, 5
add B, B, T1 add B, B, T1
addic. J, J, -1 addic. J, J, -1
bgt ZGEMM_L2_BEGIN bgt .LZGEMM_L2_BEGIN
andi. T2, N, 1 andi. T2, N, 1
ble L999 ble .L999
ZGEMM_L2_END: .LZGEMM_L2_END:
b ZGEMM_L1_BEGIN b .LZGEMM_L1_BEGIN
L999_H1: .L999_H1:
b L999 b .L999
ZGEMM_L1_BEGIN: .LZGEMM_L1_BEGIN:
andi. T1, N, 1 andi. T1, N, 1
ble ZGEMM_L1_END ble .LZGEMM_L1_END
mr CO, C mr CO, C
mr AO, A mr AO, A
srawi. I, M, 3 srawi. I, M, 3
ble ZGEMM_L1x8_END ble .LZGEMM_L1x8_END
ZGEMM_L1x8_BEGIN: .LZGEMM_L1x8_BEGIN:
mr BO, B mr BO, B
srawi. L, K, 3 srawi. L, K, 3
ble ZGEMM_L1x8_SUB0 ble .LZGEMM_L1x8_SUB0
cmpwi cr0, L, 1 cmpwi cr0, L, 1
ble ZGEMM_L1x8_SUB4 ble .LZGEMM_L1x8_SUB4
ZGEMM_L1x8_LOOP_START: .LZGEMM_L1x8_LOOP_START:
dcbt AO, PRE dcbt AO, PRE
LOAD1x8_1 LOAD1x8_1
@ -499,11 +499,11 @@ ZGEMM_L1x8_LOOP_START:
KERNEL1x8_2 KERNEL1x8_2
addic. L, L, -2 addic. L, L, -2
ble ZGEMM_L1x8_LOOP_END ble .LZGEMM_L1x8_LOOP_END
.align 5 .align 5
ZGEMM_L1x8_LOOP: .LZGEMM_L1x8_LOOP:
dcbt AO, PRE dcbt AO, PRE
KERNEL1x8_1 KERNEL1x8_1
@ -524,9 +524,9 @@ ZGEMM_L1x8_LOOP:
KERNEL1x8_2 KERNEL1x8_2
addic. L, L, -1 addic. L, L, -1
bgt ZGEMM_L1x8_LOOP bgt .LZGEMM_L1x8_LOOP
ZGEMM_L1x8_LOOP_END: .LZGEMM_L1x8_LOOP_END:
dcbt AO, PRE dcbt AO, PRE
KERNEL1x8_1 KERNEL1x8_1
@ -545,9 +545,9 @@ ZGEMM_L1x8_LOOP_END:
KERNEL1x8_1 KERNEL1x8_1
KERNEL1x8_E2 KERNEL1x8_E2
b ZGEMM_L1x8_SUB1 b .LZGEMM_L1x8_SUB1
ZGEMM_L1x8_SUB4: .LZGEMM_L1x8_SUB4:
dcbt AO, PRE dcbt AO, PRE
KERNEL1x8_SUBI1 KERNEL1x8_SUBI1
@ -563,53 +563,53 @@ ZGEMM_L1x8_SUB4:
KERNEL1x8_SUB1 KERNEL1x8_SUB1
KERNEL1x8_SUB1 KERNEL1x8_SUB1
b ZGEMM_L1x8_SUB1 b .LZGEMM_L1x8_SUB1
ZGEMM_L1x8_SUB0: .LZGEMM_L1x8_SUB0:
andi. L, K, 7 andi. L, K, 7
KERNEL1x8_SUBI1 KERNEL1x8_SUBI1
addic. L, L, -1 addic. L, L, -1
ble ZGEMM_L1x8_SAVE ble .LZGEMM_L1x8_SAVE
b ZGEMM_L1x8_SUB2 b .LZGEMM_L1x8_SUB2
ZGEMM_L1x8_SUB1: .LZGEMM_L1x8_SUB1:
andi. L, K, 7 andi. L, K, 7
ble ZGEMM_L1x8_SAVE ble .LZGEMM_L1x8_SAVE
ZGEMM_L1x8_SUB2: .LZGEMM_L1x8_SUB2:
KERNEL1x8_SUB1 KERNEL1x8_SUB1
addic. L, L, -1 addic. L, L, -1
bgt ZGEMM_L1x8_SUB2 bgt .LZGEMM_L1x8_SUB2
ZGEMM_L1x8_SAVE: .LZGEMM_L1x8_SAVE:
SAVE1x8 SAVE1x8
addic. I, I, -1 addic. I, I, -1
bgt ZGEMM_L1x8_BEGIN bgt .LZGEMM_L1x8_BEGIN
ZGEMM_L1x8_END: .LZGEMM_L1x8_END:
ZGEMM_L1x4_BEGIN: .LZGEMM_L1x4_BEGIN:
andi. T2, M, 7 andi. T2, M, 7
ble ZGEMM_L1x1_END ble .LZGEMM_L1x1_END
andi. T1, M, 4 andi. T1, M, 4
ble ZGEMM_L1x4_END ble .LZGEMM_L1x4_END
mr BO, B mr BO, B
srawi. L, K, 3 srawi. L, K, 3
ble ZGEMM_L1x4_SUB0 ble .LZGEMM_L1x4_SUB0
cmpwi cr0, L, 1 cmpwi cr0, L, 1
ble ZGEMM_L1x4_SUB4 ble .LZGEMM_L1x4_SUB4
ZGEMM_L1x4_LOOP_START: .LZGEMM_L1x4_LOOP_START:
LOAD1x4_1 LOAD1x4_1
KERNEL1x4_I1 KERNEL1x4_I1
@ -623,11 +623,11 @@ ZGEMM_L1x4_LOOP_START:
KERNEL1x4_2 KERNEL1x4_2
addic. L, L, -2 addic. L, L, -2
ble ZGEMM_L1x4_LOOP_END ble .LZGEMM_L1x4_LOOP_END
.align 5 .align 5
ZGEMM_L1x4_LOOP: .LZGEMM_L1x4_LOOP:
KERNEL1x4_1 KERNEL1x4_1
KERNEL1x4_2 KERNEL1x4_2
@ -640,9 +640,9 @@ ZGEMM_L1x4_LOOP:
KERNEL1x4_2 KERNEL1x4_2
addic. L, L, -1 addic. L, L, -1
bgt ZGEMM_L1x4_LOOP bgt .LZGEMM_L1x4_LOOP
ZGEMM_L1x4_LOOP_END: .LZGEMM_L1x4_LOOP_END:
KERNEL1x4_1 KERNEL1x4_1
KERNEL1x4_2 KERNEL1x4_2
@ -654,9 +654,9 @@ ZGEMM_L1x4_LOOP_END:
KERNEL1x4_1 KERNEL1x4_1
KERNEL1x4_E2 KERNEL1x4_E2
b ZGEMM_L1x4_SUB1 b .LZGEMM_L1x4_SUB1
ZGEMM_L1x4_SUB4: .LZGEMM_L1x4_SUB4:
KERNEL1x4_SUBI1 KERNEL1x4_SUBI1
KERNEL1x4_SUB1 KERNEL1x4_SUB1
@ -668,48 +668,48 @@ ZGEMM_L1x4_SUB4:
KERNEL1x4_SUB1 KERNEL1x4_SUB1
KERNEL1x4_SUB1 KERNEL1x4_SUB1
b ZGEMM_L1x4_SUB1 b .LZGEMM_L1x4_SUB1
ZGEMM_L1x4_SUB0: .LZGEMM_L1x4_SUB0:
andi. L, K, 7 andi. L, K, 7
KERNEL1x4_SUBI1 KERNEL1x4_SUBI1
addic. L, L, -1 addic. L, L, -1
ble ZGEMM_L1x4_SAVE ble .LZGEMM_L1x4_SAVE
b ZGEMM_L1x4_SUB2 b .LZGEMM_L1x4_SUB2
ZGEMM_L1x4_SUB1: .LZGEMM_L1x4_SUB1:
andi. L, K, 7 andi. L, K, 7
ble ZGEMM_L1x4_SAVE ble .LZGEMM_L1x4_SAVE
ZGEMM_L1x4_SUB2: .LZGEMM_L1x4_SUB2:
KERNEL1x4_SUB1 KERNEL1x4_SUB1
addic. L, L, -1 addic. L, L, -1
bgt ZGEMM_L1x4_SUB2 bgt .LZGEMM_L1x4_SUB2
ZGEMM_L1x4_SAVE: .LZGEMM_L1x4_SAVE:
SAVE1x4 SAVE1x4
ZGEMM_L1x4_END: .LZGEMM_L1x4_END:
ZGEMM_L1x2_BEGIN: .LZGEMM_L1x2_BEGIN:
andi. T1, M, 2 andi. T1, M, 2
ble ZGEMM_L1x2_END ble .LZGEMM_L1x2_END
mr BO, B mr BO, B
srawi. L, K, 3 srawi. L, K, 3
ble ZGEMM_L1x2_SUB0 ble .LZGEMM_L1x2_SUB0
cmpwi cr0, L, 1 cmpwi cr0, L, 1
ble ZGEMM_L1x2_SUB4 ble .LZGEMM_L1x2_SUB4
ZGEMM_L1x2_LOOP_START: .LZGEMM_L1x2_LOOP_START:
LOAD1x2_1 LOAD1x2_1
KERNEL1x2_I1 KERNEL1x2_I1
@ -723,11 +723,11 @@ ZGEMM_L1x2_LOOP_START:
KERNEL1x2_2 KERNEL1x2_2
addic. L, L, -2 addic. L, L, -2
ble ZGEMM_L1x2_LOOP_END ble .LZGEMM_L1x2_LOOP_END
.align 5 .align 5
ZGEMM_L1x2_LOOP: .LZGEMM_L1x2_LOOP:
KERNEL1x2_1 KERNEL1x2_1
KERNEL1x2_2 KERNEL1x2_2
@ -740,9 +740,9 @@ ZGEMM_L1x2_LOOP:
KERNEL1x2_2 KERNEL1x2_2
addic. L, L, -1 addic. L, L, -1
bgt ZGEMM_L1x2_LOOP bgt .LZGEMM_L1x2_LOOP
ZGEMM_L1x2_LOOP_END: .LZGEMM_L1x2_LOOP_END:
KERNEL1x2_1 KERNEL1x2_1
KERNEL1x2_2 KERNEL1x2_2
@ -754,9 +754,9 @@ ZGEMM_L1x2_LOOP_END:
KERNEL1x2_1 KERNEL1x2_1
KERNEL1x2_E2 KERNEL1x2_E2
b ZGEMM_L1x2_SUB1 b .LZGEMM_L1x2_SUB1
ZGEMM_L1x2_SUB4: .LZGEMM_L1x2_SUB4:
KERNEL1x2_SUBI1 KERNEL1x2_SUBI1
KERNEL1x2_SUB1 KERNEL1x2_SUB1
@ -768,48 +768,48 @@ ZGEMM_L1x2_SUB4:
KERNEL1x2_SUB1 KERNEL1x2_SUB1
KERNEL1x2_SUB1 KERNEL1x2_SUB1
b ZGEMM_L1x2_SUB1 b .LZGEMM_L1x2_SUB1
ZGEMM_L1x2_SUB0: .LZGEMM_L1x2_SUB0:
andi. L, K, 7 andi. L, K, 7
KERNEL1x2_SUBI1 KERNEL1x2_SUBI1
addic. L, L, -1 addic. L, L, -1
ble ZGEMM_L1x2_SAVE ble .LZGEMM_L1x2_SAVE
b ZGEMM_L1x2_SUB2 b .LZGEMM_L1x2_SUB2
ZGEMM_L1x2_SUB1: .LZGEMM_L1x2_SUB1:
andi. L, K, 7 andi. L, K, 7
ble ZGEMM_L1x2_SAVE ble .LZGEMM_L1x2_SAVE
ZGEMM_L1x2_SUB2: .LZGEMM_L1x2_SUB2:
KERNEL1x2_SUB1 KERNEL1x2_SUB1
addic. L, L, -1 addic. L, L, -1
bgt ZGEMM_L1x2_SUB2 bgt .LZGEMM_L1x2_SUB2
ZGEMM_L1x2_SAVE: .LZGEMM_L1x2_SAVE:
SAVE1x2 SAVE1x2
ZGEMM_L1x2_END: .LZGEMM_L1x2_END:
ZGEMM_L1x1_BEGIN: .LZGEMM_L1x1_BEGIN:
andi. T1, M, 1 andi. T1, M, 1
ble ZGEMM_L1x1_END ble .LZGEMM_L1x1_END
mr BO, B mr BO, B
srawi. L, K, 3 srawi. L, K, 3
ble ZGEMM_L1x1_SUB0 ble .LZGEMM_L1x1_SUB0
cmpwi cr0, L, 1 cmpwi cr0, L, 1
ble ZGEMM_L1x1_SUB4 ble .LZGEMM_L1x1_SUB4
ZGEMM_L1x1_LOOP_START: .LZGEMM_L1x1_LOOP_START:
LOAD1x1_1 LOAD1x1_1
KERNEL1x1_I1 KERNEL1x1_I1
@ -823,11 +823,11 @@ ZGEMM_L1x1_LOOP_START:
KERNEL1x1_2 KERNEL1x1_2
addic. L, L, -2 addic. L, L, -2
ble ZGEMM_L1x1_LOOP_END ble .LZGEMM_L1x1_LOOP_END
.align 5 .align 5
ZGEMM_L1x1_LOOP: .LZGEMM_L1x1_LOOP:
KERNEL1x1_1 KERNEL1x1_1
KERNEL1x1_2 KERNEL1x1_2
@ -840,9 +840,9 @@ ZGEMM_L1x1_LOOP:
KERNEL1x1_2 KERNEL1x1_2
addic. L, L, -1 addic. L, L, -1
bgt ZGEMM_L1x1_LOOP bgt .LZGEMM_L1x1_LOOP
ZGEMM_L1x1_LOOP_END: .LZGEMM_L1x1_LOOP_END:
KERNEL1x1_1 KERNEL1x1_1
KERNEL1x1_2 KERNEL1x1_2
@ -854,9 +854,9 @@ ZGEMM_L1x1_LOOP_END:
KERNEL1x1_1 KERNEL1x1_1
KERNEL1x1_E2 KERNEL1x1_E2
b ZGEMM_L1x1_SUB1 b .LZGEMM_L1x1_SUB1
ZGEMM_L1x1_SUB4: .LZGEMM_L1x1_SUB4:
KERNEL1x1_SUBI1 KERNEL1x1_SUBI1
KERNEL1x1_SUB1 KERNEL1x1_SUB1
@ -868,34 +868,34 @@ ZGEMM_L1x1_SUB4:
KERNEL1x1_SUB1 KERNEL1x1_SUB1
KERNEL1x1_SUB1 KERNEL1x1_SUB1
b ZGEMM_L1x1_SUB1 b .LZGEMM_L1x1_SUB1
ZGEMM_L1x1_SUB0: .LZGEMM_L1x1_SUB0:
andi. L, K, 7 andi. L, K, 7
KERNEL1x1_SUBI1 KERNEL1x1_SUBI1
addic. L, L, -1 addic. L, L, -1
ble ZGEMM_L1x1_SAVE ble .LZGEMM_L1x1_SAVE
b ZGEMM_L1x1_SUB2 b .LZGEMM_L1x1_SUB2
ZGEMM_L1x1_SUB1: .LZGEMM_L1x1_SUB1:
andi. L, K, 7 andi. L, K, 7
ble ZGEMM_L1x1_SAVE ble .LZGEMM_L1x1_SAVE
ZGEMM_L1x1_SUB2: .LZGEMM_L1x1_SUB2:
KERNEL1x1_SUB1 KERNEL1x1_SUB1
addic. L, L, -1 addic. L, L, -1
bgt ZGEMM_L1x1_SUB2 bgt .LZGEMM_L1x1_SUB2
ZGEMM_L1x1_SAVE: .LZGEMM_L1x1_SAVE:
SAVE1x1 SAVE1x1
ZGEMM_L1x1_END: .LZGEMM_L1x1_END:
ZGEMM_L1_END: .LZGEMM_L1_END:

View File

@ -1,3 +1,39 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define XSFADD_R1 xsadddp #define XSFADD_R1 xsadddp

View File

@ -1,3 +1,38 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */ /* All rights reserved. */
@ -239,11 +274,11 @@
#include "zgemm_macros_8x2_power8.S" #include "zgemm_macros_8x2_power8.S"
cmpwi cr0, M, 0 cmpwi cr0, M, 0
ble L999 ble .L999
cmpwi cr0, N, 0 cmpwi cr0, N, 0
ble L999 ble .L999
cmpwi cr0, K, 0 cmpwi cr0, K, 0
ble L999 ble .L999
slwi LDC, LDC, ZBASE_SHIFT slwi LDC, LDC, ZBASE_SHIFT
li PRE, 256 li PRE, 256
@ -266,7 +301,7 @@
#include "ztrmm_logic_8x2_power8.S" #include "ztrmm_logic_8x2_power8.S"
L999: .L999:
addi r3, 0, 0 addi r3, 0, 0
lfd f14, 0(SP) lfd f14, 0(SP)

View File

@ -1,7 +1,43 @@
srawi. J, N, 1 /***************************************************************************
ble ZTRMM_L2_END Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
ZTRMM_L2_BEGIN: /**************************************************************************************
* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
srawi. J, N, 1
ble .LZTRMM_L2_END
.LZTRMM_L2_BEGIN:
mr CO, C mr CO, C
mr AO, A mr AO, A
@ -13,9 +49,9 @@ ZTRMM_L2_BEGIN:
#endif #endif
srawi. I, M, 3 srawi. I, M, 3
ble ZTRMM_L2x8_END ble .LZTRMM_L2x8_END
ZTRMM_L2x8_BEGIN: .LZTRMM_L2x8_BEGIN:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@ -42,11 +78,11 @@ ZTRMM_L2x8_BEGIN:
mr KKK, T1 mr KKK, T1
mr K1, T1 mr K1, T1
srawi. L, K1, 3 // KTEMP / 8 -> L srawi. L, K1, 3 // KTEMP / 8 -> L
ble ZTRMM_L2x8_SUB0 ble .LZTRMM_L2x8_SUB0
cmpwi cr0, L, 1 cmpwi cr0, L, 1
ble ZTRMM_L2x8_SUB4 ble .LZTRMM_L2x8_SUB4
ZTRMM_L2x8_LOOP_START: .LZTRMM_L2x8_LOOP_START:
dcbt AO, PRE dcbt AO, PRE
LOAD2x8_1 LOAD2x8_1
@ -69,11 +105,11 @@ ZTRMM_L2x8_LOOP_START:
KERNEL2x8_2 KERNEL2x8_2
addic. L, L, -2 addic. L, L, -2
ble ZTRMM_L2x8_LOOP_END ble .LZTRMM_L2x8_LOOP_END
.align 5 .align 5
ZTRMM_L2x8_LOOP: .LZTRMM_L2x8_LOOP:
dcbt AO, PRE dcbt AO, PRE
KERNEL2x8_1 KERNEL2x8_1
@ -94,9 +130,9 @@ ZTRMM_L2x8_LOOP:
KERNEL2x8_2 KERNEL2x8_2
addic. L, L, -1 addic. L, L, -1
bgt ZTRMM_L2x8_LOOP bgt .LZTRMM_L2x8_LOOP
ZTRMM_L2x8_LOOP_END: .LZTRMM_L2x8_LOOP_END:
dcbt AO, PRE dcbt AO, PRE
KERNEL2x8_1 KERNEL2x8_1
@ -115,9 +151,9 @@ ZTRMM_L2x8_LOOP_END:
KERNEL2x8_1 KERNEL2x8_1
KERNEL2x8_E2 KERNEL2x8_E2
b ZTRMM_L2x8_SUB1 b .LZTRMM_L2x8_SUB1
ZTRMM_L2x8_SUB4: .LZTRMM_L2x8_SUB4:
dcbt AO, PRE dcbt AO, PRE
KERNEL2x8_SUBI1 KERNEL2x8_SUBI1
@ -133,31 +169,31 @@ ZTRMM_L2x8_SUB4:
KERNEL2x8_SUB1 KERNEL2x8_SUB1
KERNEL2x8_SUB1 KERNEL2x8_SUB1
b ZTRMM_L2x8_SUB1 b .LZTRMM_L2x8_SUB1
ZTRMM_L2x8_SUB0: .LZTRMM_L2x8_SUB0:
andi. L, K1, 7 // K1 & 7 -> L andi. L, K1, 7 // K1 & 7 -> L
KERNEL2x8_SUBI1 KERNEL2x8_SUBI1
addic. L, L, -1 addic. L, L, -1
ble ZTRMM_L2x8_SAVE ble .LZTRMM_L2x8_SAVE
b ZTRMM_L2x8_SUB2 b .LZTRMM_L2x8_SUB2
ZTRMM_L2x8_SUB1: .LZTRMM_L2x8_SUB1:
andi. L, K1, 7 // K1 & 7 -> L andi. L, K1, 7 // K1 & 7 -> L
ble ZTRMM_L2x8_SAVE ble .LZTRMM_L2x8_SAVE
ZTRMM_L2x8_SUB2: .LZTRMM_L2x8_SUB2:
KERNEL2x8_SUB1 KERNEL2x8_SUB1
addic. L, L, -1 addic. L, L, -1
bgt ZTRMM_L2x8_SUB2 bgt .LZTRMM_L2x8_SUB2
ZTRMM_L2x8_SAVE: .LZTRMM_L2x8_SAVE:
SAVE2x8 SAVE2x8
@ -175,16 +211,16 @@ ZTRMM_L2x8_SAVE:
addic. I, I, -1 addic. I, I, -1
bgt ZTRMM_L2x8_BEGIN bgt .LZTRMM_L2x8_BEGIN
ZTRMM_L2x8_END: .LZTRMM_L2x8_END:
ZTRMM_L2x4_BEGIN: .LZTRMM_L2x4_BEGIN:
andi. T2, M, 7 andi. T2, M, 7
ble ZTRMM_L2x1_END ble .LZTRMM_L2x1_END
andi. T1, M, 4 andi. T1, M, 4
ble ZTRMM_L2x4_END ble .LZTRMM_L2x4_END
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mr BO, B // B -> BO mr BO, B // B -> BO
@ -210,11 +246,11 @@ ZTRMM_L2x4_BEGIN:
mr KKK, T1 mr KKK, T1
mr K1, T1 mr K1, T1
srawi. L, K1, 3 // KTEMP / 8 -> L srawi. L, K1, 3 // KTEMP / 8 -> L
ble ZTRMM_L2x4_SUB0 ble .LZTRMM_L2x4_SUB0
cmpwi cr0, L, 1 cmpwi cr0, L, 1
ble ZTRMM_L2x4_SUB4 ble .LZTRMM_L2x4_SUB4
ZTRMM_L2x4_LOOP_START: .LZTRMM_L2x4_LOOP_START:
LOAD2x4_1 LOAD2x4_1
KERNEL2x4_I1 KERNEL2x4_I1
@ -228,11 +264,11 @@ ZTRMM_L2x4_LOOP_START:
KERNEL2x4_2 KERNEL2x4_2
addic. L, L, -2 addic. L, L, -2
ble ZTRMM_L2x4_LOOP_END ble .LZTRMM_L2x4_LOOP_END
.align 5 .align 5
ZTRMM_L2x4_LOOP: .LZTRMM_L2x4_LOOP:
KERNEL2x4_1 KERNEL2x4_1
KERNEL2x4_2 KERNEL2x4_2
@ -245,9 +281,9 @@ ZTRMM_L2x4_LOOP:
KERNEL2x4_2 KERNEL2x4_2
addic. L, L, -1 addic. L, L, -1
bgt ZTRMM_L2x4_LOOP bgt .LZTRMM_L2x4_LOOP
ZTRMM_L2x4_LOOP_END: .LZTRMM_L2x4_LOOP_END:
KERNEL2x4_1 KERNEL2x4_1
KERNEL2x4_2 KERNEL2x4_2
@ -259,9 +295,9 @@ ZTRMM_L2x4_LOOP_END:
KERNEL2x4_1 KERNEL2x4_1
KERNEL2x4_E2 KERNEL2x4_E2
b ZTRMM_L2x4_SUB1 b .LZTRMM_L2x4_SUB1
ZTRMM_L2x4_SUB4: .LZTRMM_L2x4_SUB4:
KERNEL2x4_SUBI1 KERNEL2x4_SUBI1
KERNEL2x4_SUB1 KERNEL2x4_SUB1
@ -273,31 +309,31 @@ ZTRMM_L2x4_SUB4:
KERNEL2x4_SUB1 KERNEL2x4_SUB1
KERNEL2x4_SUB1 KERNEL2x4_SUB1
b ZTRMM_L2x4_SUB1 b .LZTRMM_L2x4_SUB1
ZTRMM_L2x4_SUB0: .LZTRMM_L2x4_SUB0:
andi. L, K1, 7 // K1 & 7 -> L andi. L, K1, 7 // K1 & 7 -> L
KERNEL2x4_SUBI1 KERNEL2x4_SUBI1
addic. L, L, -1 addic. L, L, -1
ble ZTRMM_L2x4_SAVE ble .LZTRMM_L2x4_SAVE
b ZTRMM_L2x4_SUB2 b .LZTRMM_L2x4_SUB2
ZTRMM_L2x4_SUB1: .LZTRMM_L2x4_SUB1:
andi. L, K1, 7 // K1 & 7 -> L andi. L, K1, 7 // K1 & 7 -> L
ble ZTRMM_L2x4_SAVE ble .LZTRMM_L2x4_SAVE
ZTRMM_L2x4_SUB2: .LZTRMM_L2x4_SUB2:
KERNEL2x4_SUB1 KERNEL2x4_SUB1
addic. L, L, -1 addic. L, L, -1
bgt ZTRMM_L2x4_SUB2 bgt .LZTRMM_L2x4_SUB2
ZTRMM_L2x4_SAVE: .LZTRMM_L2x4_SAVE:
SAVE2x4 SAVE2x4
@ -314,12 +350,12 @@ ZTRMM_L2x4_SAVE:
#endif #endif
ZTRMM_L2x4_END: .LZTRMM_L2x4_END:
ZTRMM_L2x2_BEGIN: .LZTRMM_L2x2_BEGIN:
andi. T1, M, 2 andi. T1, M, 2
ble ZTRMM_L2x2_END ble .LZTRMM_L2x2_END
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mr BO, B // B -> BO mr BO, B // B -> BO
@ -345,11 +381,11 @@ ZTRMM_L2x2_BEGIN:
mr KKK, T1 mr KKK, T1
mr K1, T1 mr K1, T1
srawi. L, K1, 3 // KTEMP / 8 -> L srawi. L, K1, 3 // KTEMP / 8 -> L
ble ZTRMM_L2x2_SUB0 ble .LZTRMM_L2x2_SUB0
cmpwi cr0, L, 1 cmpwi cr0, L, 1
ble ZTRMM_L2x2_SUB4 ble .LZTRMM_L2x2_SUB4
ZTRMM_L2x2_LOOP_START: .LZTRMM_L2x2_LOOP_START:
LOAD2x2_1 LOAD2x2_1
KERNEL2x2_I1 KERNEL2x2_I1
@ -363,11 +399,11 @@ ZTRMM_L2x2_LOOP_START:
KERNEL2x2_2 KERNEL2x2_2
addic. L, L, -2 addic. L, L, -2
ble ZTRMM_L2x2_LOOP_END ble .LZTRMM_L2x2_LOOP_END
.align 5 .align 5
ZTRMM_L2x2_LOOP: .LZTRMM_L2x2_LOOP:
KERNEL2x2_1 KERNEL2x2_1
KERNEL2x2_2 KERNEL2x2_2
@ -380,9 +416,9 @@ ZTRMM_L2x2_LOOP:
KERNEL2x2_2 KERNEL2x2_2
addic. L, L, -1 addic. L, L, -1
bgt ZTRMM_L2x2_LOOP bgt .LZTRMM_L2x2_LOOP
ZTRMM_L2x2_LOOP_END: .LZTRMM_L2x2_LOOP_END:
KERNEL2x2_1 KERNEL2x2_1
KERNEL2x2_2 KERNEL2x2_2
@ -394,9 +430,9 @@ ZTRMM_L2x2_LOOP_END:
KERNEL2x2_1 KERNEL2x2_1
KERNEL2x2_E2 KERNEL2x2_E2
b ZTRMM_L2x2_SUB1 b .LZTRMM_L2x2_SUB1
ZTRMM_L2x2_SUB4: .LZTRMM_L2x2_SUB4:
KERNEL2x2_SUBI1 KERNEL2x2_SUBI1
KERNEL2x2_SUB1 KERNEL2x2_SUB1
@ -408,31 +444,31 @@ ZTRMM_L2x2_SUB4:
KERNEL2x2_SUB1 KERNEL2x2_SUB1
KERNEL2x2_SUB1 KERNEL2x2_SUB1
b ZTRMM_L2x2_SUB1 b .LZTRMM_L2x2_SUB1
ZTRMM_L2x2_SUB0: .LZTRMM_L2x2_SUB0:
andi. L, K1, 7 // K1 & 7 -> L andi. L, K1, 7 // K1 & 7 -> L
KERNEL2x2_SUBI1 KERNEL2x2_SUBI1
addic. L, L, -1 addic. L, L, -1
ble ZTRMM_L2x2_SAVE ble .LZTRMM_L2x2_SAVE
b ZTRMM_L2x2_SUB2 b .LZTRMM_L2x2_SUB2
ZTRMM_L2x2_SUB1: .LZTRMM_L2x2_SUB1:
andi. L, K1, 7 // K1 & 7 -> L andi. L, K1, 7 // K1 & 7 -> L
ble ZTRMM_L2x2_SAVE ble .LZTRMM_L2x2_SAVE
ZTRMM_L2x2_SUB2: .LZTRMM_L2x2_SUB2:
KERNEL2x2_SUB1 KERNEL2x2_SUB1
addic. L, L, -1 addic. L, L, -1
bgt ZTRMM_L2x2_SUB2 bgt .LZTRMM_L2x2_SUB2
ZTRMM_L2x2_SAVE: .LZTRMM_L2x2_SAVE:
SAVE2x2 SAVE2x2
@ -449,12 +485,12 @@ ZTRMM_L2x2_SAVE:
#endif #endif
ZTRMM_L2x2_END: .LZTRMM_L2x2_END:
ZTRMM_L2x1_BEGIN: .LZTRMM_L2x1_BEGIN:
andi. T1, M, 1 andi. T1, M, 1
ble ZTRMM_L2x1_END ble .LZTRMM_L2x1_END
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mr BO, B // B -> BO mr BO, B // B -> BO
@ -480,11 +516,11 @@ ZTRMM_L2x1_BEGIN:
mr KKK, T1 mr KKK, T1
mr K1, T1 mr K1, T1
srawi. L, K1, 3 // KTEMP / 8 -> L srawi. L, K1, 3 // KTEMP / 8 -> L
ble ZTRMM_L2x1_SUB0 ble .LZTRMM_L2x1_SUB0
cmpwi cr0, L, 1 cmpwi cr0, L, 1
ble ZTRMM_L2x1_SUB4 ble .LZTRMM_L2x1_SUB4
ZTRMM_L2x1_LOOP_START: .LZTRMM_L2x1_LOOP_START:
LOAD2x1_1 LOAD2x1_1
KERNEL2x1_I1 KERNEL2x1_I1
@ -498,11 +534,11 @@ ZTRMM_L2x1_LOOP_START:
KERNEL2x1_2 KERNEL2x1_2
addic. L, L, -2 addic. L, L, -2
ble ZTRMM_L2x1_LOOP_END ble .LZTRMM_L2x1_LOOP_END
.align 5 .align 5
ZTRMM_L2x1_LOOP: .LZTRMM_L2x1_LOOP:
KERNEL2x1_1 KERNEL2x1_1
KERNEL2x1_2 KERNEL2x1_2
@ -515,9 +551,9 @@ ZTRMM_L2x1_LOOP:
KERNEL2x1_2 KERNEL2x1_2
addic. L, L, -1 addic. L, L, -1
bgt ZTRMM_L2x1_LOOP bgt .LZTRMM_L2x1_LOOP
ZTRMM_L2x1_LOOP_END: .LZTRMM_L2x1_LOOP_END:
KERNEL2x1_1 KERNEL2x1_1
KERNEL2x1_2 KERNEL2x1_2
@ -529,9 +565,9 @@ ZTRMM_L2x1_LOOP_END:
KERNEL2x1_1 KERNEL2x1_1
KERNEL2x1_E2 KERNEL2x1_E2
b ZTRMM_L2x1_SUB1 b .LZTRMM_L2x1_SUB1
ZTRMM_L2x1_SUB4: .LZTRMM_L2x1_SUB4:
KERNEL2x1_SUBI1 KERNEL2x1_SUBI1
KERNEL2x1_SUB1 KERNEL2x1_SUB1
@ -543,31 +579,31 @@ ZTRMM_L2x1_SUB4:
KERNEL2x1_SUB1 KERNEL2x1_SUB1
KERNEL2x1_SUB1 KERNEL2x1_SUB1
b ZTRMM_L2x1_SUB1 b .LZTRMM_L2x1_SUB1
ZTRMM_L2x1_SUB0: .LZTRMM_L2x1_SUB0:
andi. L, K1, 7 // K1 & 7 -> L andi. L, K1, 7 // K1 & 7 -> L
KERNEL2x1_SUBI1 KERNEL2x1_SUBI1
addic. L, L, -1 addic. L, L, -1
ble ZTRMM_L2x1_SAVE ble .LZTRMM_L2x1_SAVE
b ZTRMM_L2x1_SUB2 b .LZTRMM_L2x1_SUB2
ZTRMM_L2x1_SUB1: .LZTRMM_L2x1_SUB1:
andi. L, K1, 7 // K1 & 7 -> L andi. L, K1, 7 // K1 & 7 -> L
ble ZTRMM_L2x1_SAVE ble .LZTRMM_L2x1_SAVE
ZTRMM_L2x1_SUB2: .LZTRMM_L2x1_SUB2:
KERNEL2x1_SUB1 KERNEL2x1_SUB1
addic. L, L, -1 addic. L, L, -1
bgt ZTRMM_L2x1_SUB2 bgt .LZTRMM_L2x1_SUB2
ZTRMM_L2x1_SAVE: .LZTRMM_L2x1_SAVE:
SAVE2x1 SAVE2x1
@ -584,7 +620,7 @@ ZTRMM_L2x1_SAVE:
#endif #endif
ZTRMM_L2x1_END: .LZTRMM_L2x1_END:
slwi T1, K, 5 slwi T1, K, 5
add B, B, T1 add B, B, T1
@ -595,23 +631,23 @@ ZTRMM_L2x1_END:
addic. J, J, -1 addic. J, J, -1
bgt ZTRMM_L2_BEGIN bgt .LZTRMM_L2_BEGIN
andi. T2, N, 1 andi. T2, N, 1
ble L999 ble .L999
ZTRMM_L2_END: .LZTRMM_L2_END:
b ZTRMM_L1_BEGIN b .LZTRMM_L1_BEGIN
L999_H1: .L999_H1:
b L999 b .L999
ZTRMM_L1_BEGIN: .LZTRMM_L1_BEGIN:
andi. T1, N, 1 andi. T1, N, 1
ble ZTRMM_L1_END ble .LZTRMM_L1_END
mr CO, C mr CO, C
mr AO, A mr AO, A
@ -620,9 +656,9 @@ ZTRMM_L1_BEGIN:
#endif #endif
srawi. I, M, 3 srawi. I, M, 3
ble ZTRMM_L1x8_END ble .LZTRMM_L1x8_END
ZTRMM_L1x8_BEGIN: .LZTRMM_L1x8_BEGIN:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@ -649,11 +685,11 @@ ZTRMM_L1x8_BEGIN:
mr KKK, T1 mr KKK, T1
mr K1, T1 mr K1, T1
srawi. L, K1, 3 // KTEMP / 8 -> L srawi. L, K1, 3 // KTEMP / 8 -> L
ble ZTRMM_L1x8_SUB0 ble .LZTRMM_L1x8_SUB0
cmpwi cr0, L, 1 cmpwi cr0, L, 1
ble ZTRMM_L1x8_SUB4 ble .LZTRMM_L1x8_SUB4
ZTRMM_L1x8_LOOP_START: .LZTRMM_L1x8_LOOP_START:
dcbt AO, PRE dcbt AO, PRE
LOAD1x8_1 LOAD1x8_1
@ -676,11 +712,11 @@ ZTRMM_L1x8_LOOP_START:
KERNEL1x8_2 KERNEL1x8_2
addic. L, L, -2 addic. L, L, -2
ble ZTRMM_L1x8_LOOP_END ble .LZTRMM_L1x8_LOOP_END
.align 5 .align 5
ZTRMM_L1x8_LOOP: .LZTRMM_L1x8_LOOP:
dcbt AO, PRE dcbt AO, PRE
KERNEL1x8_1 KERNEL1x8_1
@ -701,9 +737,9 @@ ZTRMM_L1x8_LOOP:
KERNEL1x8_2 KERNEL1x8_2
addic. L, L, -1 addic. L, L, -1
bgt ZTRMM_L1x8_LOOP bgt .LZTRMM_L1x8_LOOP
ZTRMM_L1x8_LOOP_END: .LZTRMM_L1x8_LOOP_END:
dcbt AO, PRE dcbt AO, PRE
KERNEL1x8_1 KERNEL1x8_1
@ -722,9 +758,9 @@ ZTRMM_L1x8_LOOP_END:
KERNEL1x8_1 KERNEL1x8_1
KERNEL1x8_E2 KERNEL1x8_E2
b ZTRMM_L1x8_SUB1 b .LZTRMM_L1x8_SUB1
ZTRMM_L1x8_SUB4: .LZTRMM_L1x8_SUB4:
dcbt AO, PRE dcbt AO, PRE
KERNEL1x8_SUBI1 KERNEL1x8_SUBI1
@ -740,31 +776,31 @@ ZTRMM_L1x8_SUB4:
KERNEL1x8_SUB1 KERNEL1x8_SUB1
KERNEL1x8_SUB1 KERNEL1x8_SUB1
b ZTRMM_L1x8_SUB1 b .LZTRMM_L1x8_SUB1
ZTRMM_L1x8_SUB0: .LZTRMM_L1x8_SUB0:
andi. L, K1, 7 // K1 & 7 -> L andi. L, K1, 7 // K1 & 7 -> L
KERNEL1x8_SUBI1 KERNEL1x8_SUBI1
addic. L, L, -1 addic. L, L, -1
ble ZTRMM_L1x8_SAVE ble .LZTRMM_L1x8_SAVE
b ZTRMM_L1x8_SUB2 b .LZTRMM_L1x8_SUB2
ZTRMM_L1x8_SUB1: .LZTRMM_L1x8_SUB1:
andi. L, K1, 7 // K1 & 7 -> L andi. L, K1, 7 // K1 & 7 -> L
ble ZTRMM_L1x8_SAVE ble .LZTRMM_L1x8_SAVE
ZTRMM_L1x8_SUB2: .LZTRMM_L1x8_SUB2:
KERNEL1x8_SUB1 KERNEL1x8_SUB1
addic. L, L, -1 addic. L, L, -1
bgt ZTRMM_L1x8_SUB2 bgt .LZTRMM_L1x8_SUB2
ZTRMM_L1x8_SAVE: .LZTRMM_L1x8_SAVE:
SAVE1x8 SAVE1x8
@ -782,16 +818,16 @@ ZTRMM_L1x8_SAVE:
addic. I, I, -1 addic. I, I, -1
bgt ZTRMM_L1x8_BEGIN bgt .LZTRMM_L1x8_BEGIN
ZTRMM_L1x8_END: .LZTRMM_L1x8_END:
ZTRMM_L1x4_BEGIN: .LZTRMM_L1x4_BEGIN:
andi. T2, M, 7 andi. T2, M, 7
ble ZTRMM_L1x1_END ble .LZTRMM_L1x1_END
andi. T1, M, 4 andi. T1, M, 4
ble ZTRMM_L1x4_END ble .LZTRMM_L1x4_END
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mr BO, B // B -> BO mr BO, B // B -> BO
@ -817,11 +853,11 @@ ZTRMM_L1x4_BEGIN:
mr KKK, T1 mr KKK, T1
mr K1, T1 mr K1, T1
srawi. L, K1, 3 // KTEMP / 8 -> L srawi. L, K1, 3 // KTEMP / 8 -> L
ble ZTRMM_L1x4_SUB0 ble .LZTRMM_L1x4_SUB0
cmpwi cr0, L, 1 cmpwi cr0, L, 1
ble ZTRMM_L1x4_SUB4 ble .LZTRMM_L1x4_SUB4
ZTRMM_L1x4_LOOP_START: .LZTRMM_L1x4_LOOP_START:
LOAD1x4_1 LOAD1x4_1
KERNEL1x4_I1 KERNEL1x4_I1
@ -835,11 +871,11 @@ ZTRMM_L1x4_LOOP_START:
KERNEL1x4_2 KERNEL1x4_2
addic. L, L, -2 addic. L, L, -2
ble ZTRMM_L1x4_LOOP_END ble .LZTRMM_L1x4_LOOP_END
.align 5 .align 5
ZTRMM_L1x4_LOOP: .LZTRMM_L1x4_LOOP:
KERNEL1x4_1 KERNEL1x4_1
KERNEL1x4_2 KERNEL1x4_2
@ -852,9 +888,9 @@ ZTRMM_L1x4_LOOP:
KERNEL1x4_2 KERNEL1x4_2
addic. L, L, -1 addic. L, L, -1
bgt ZTRMM_L1x4_LOOP bgt .LZTRMM_L1x4_LOOP
ZTRMM_L1x4_LOOP_END: .LZTRMM_L1x4_LOOP_END:
KERNEL1x4_1 KERNEL1x4_1
KERNEL1x4_2 KERNEL1x4_2
@ -866,9 +902,9 @@ ZTRMM_L1x4_LOOP_END:
KERNEL1x4_1 KERNEL1x4_1
KERNEL1x4_E2 KERNEL1x4_E2
b ZTRMM_L1x4_SUB1 b .LZTRMM_L1x4_SUB1
ZTRMM_L1x4_SUB4: .LZTRMM_L1x4_SUB4:
KERNEL1x4_SUBI1 KERNEL1x4_SUBI1
KERNEL1x4_SUB1 KERNEL1x4_SUB1
@ -880,31 +916,31 @@ ZTRMM_L1x4_SUB4:
KERNEL1x4_SUB1 KERNEL1x4_SUB1
KERNEL1x4_SUB1 KERNEL1x4_SUB1
b ZTRMM_L1x4_SUB1 b .LZTRMM_L1x4_SUB1
ZTRMM_L1x4_SUB0: .LZTRMM_L1x4_SUB0:
andi. L, K1, 7 // K1 & 7 -> L andi. L, K1, 7 // K1 & 7 -> L
KERNEL1x4_SUBI1 KERNEL1x4_SUBI1
addic. L, L, -1 addic. L, L, -1
ble ZTRMM_L1x4_SAVE ble .LZTRMM_L1x4_SAVE
b ZTRMM_L1x4_SUB2 b .LZTRMM_L1x4_SUB2
ZTRMM_L1x4_SUB1: .LZTRMM_L1x4_SUB1:
andi. L, K1, 7 // K1 & 7 -> L andi. L, K1, 7 // K1 & 7 -> L
ble ZTRMM_L1x4_SAVE ble .LZTRMM_L1x4_SAVE
ZTRMM_L1x4_SUB2: .LZTRMM_L1x4_SUB2:
KERNEL1x4_SUB1 KERNEL1x4_SUB1
addic. L, L, -1 addic. L, L, -1
bgt ZTRMM_L1x4_SUB2 bgt .LZTRMM_L1x4_SUB2
ZTRMM_L1x4_SAVE: .LZTRMM_L1x4_SAVE:
SAVE1x4 SAVE1x4
@ -921,12 +957,12 @@ ZTRMM_L1x4_SAVE:
#endif #endif
ZTRMM_L1x4_END: .LZTRMM_L1x4_END:
ZTRMM_L1x2_BEGIN: .LZTRMM_L1x2_BEGIN:
andi. T1, M, 2 andi. T1, M, 2
ble ZTRMM_L1x2_END ble .LZTRMM_L1x2_END
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mr BO, B // B -> BO mr BO, B // B -> BO
@ -952,11 +988,11 @@ ZTRMM_L1x2_BEGIN:
mr KKK, T1 mr KKK, T1
mr K1, T1 mr K1, T1
srawi. L, K1, 3 // KTEMP / 8 -> L srawi. L, K1, 3 // KTEMP / 8 -> L
ble ZTRMM_L1x2_SUB0 ble .LZTRMM_L1x2_SUB0
cmpwi cr0, L, 1 cmpwi cr0, L, 1
ble ZTRMM_L1x2_SUB4 ble .LZTRMM_L1x2_SUB4
ZTRMM_L1x2_LOOP_START: .LZTRMM_L1x2_LOOP_START:
LOAD1x2_1 LOAD1x2_1
KERNEL1x2_I1 KERNEL1x2_I1
@ -970,11 +1006,11 @@ ZTRMM_L1x2_LOOP_START:
KERNEL1x2_2 KERNEL1x2_2
addic. L, L, -2 addic. L, L, -2
ble ZTRMM_L1x2_LOOP_END ble .LZTRMM_L1x2_LOOP_END
.align 5 .align 5
ZTRMM_L1x2_LOOP: .LZTRMM_L1x2_LOOP:
KERNEL1x2_1 KERNEL1x2_1
KERNEL1x2_2 KERNEL1x2_2
@ -987,9 +1023,9 @@ ZTRMM_L1x2_LOOP:
KERNEL1x2_2 KERNEL1x2_2
addic. L, L, -1 addic. L, L, -1
bgt ZTRMM_L1x2_LOOP bgt .LZTRMM_L1x2_LOOP
ZTRMM_L1x2_LOOP_END: .LZTRMM_L1x2_LOOP_END:
KERNEL1x2_1 KERNEL1x2_1
KERNEL1x2_2 KERNEL1x2_2
@ -1001,9 +1037,9 @@ ZTRMM_L1x2_LOOP_END:
KERNEL1x2_1 KERNEL1x2_1
KERNEL1x2_E2 KERNEL1x2_E2
b ZTRMM_L1x2_SUB1 b .LZTRMM_L1x2_SUB1
ZTRMM_L1x2_SUB4: .LZTRMM_L1x2_SUB4:
KERNEL1x2_SUBI1 KERNEL1x2_SUBI1
KERNEL1x2_SUB1 KERNEL1x2_SUB1
@ -1015,31 +1051,31 @@ ZTRMM_L1x2_SUB4:
KERNEL1x2_SUB1 KERNEL1x2_SUB1
KERNEL1x2_SUB1 KERNEL1x2_SUB1
b ZTRMM_L1x2_SUB1 b .LZTRMM_L1x2_SUB1
ZTRMM_L1x2_SUB0: .LZTRMM_L1x2_SUB0:
andi. L, K1, 7 // K1 & 7 -> L andi. L, K1, 7 // K1 & 7 -> L
KERNEL1x2_SUBI1 KERNEL1x2_SUBI1
addic. L, L, -1 addic. L, L, -1
ble ZTRMM_L1x2_SAVE ble .LZTRMM_L1x2_SAVE
b ZTRMM_L1x2_SUB2 b .LZTRMM_L1x2_SUB2
ZTRMM_L1x2_SUB1: .LZTRMM_L1x2_SUB1:
andi. L, K1, 7 // K1 & 7 -> L andi. L, K1, 7 // K1 & 7 -> L
ble ZTRMM_L1x2_SAVE ble .LZTRMM_L1x2_SAVE
ZTRMM_L1x2_SUB2: .LZTRMM_L1x2_SUB2:
KERNEL1x2_SUB1 KERNEL1x2_SUB1
addic. L, L, -1 addic. L, L, -1
bgt ZTRMM_L1x2_SUB2 bgt .LZTRMM_L1x2_SUB2
ZTRMM_L1x2_SAVE: .LZTRMM_L1x2_SAVE:
SAVE1x2 SAVE1x2
@ -1056,12 +1092,12 @@ ZTRMM_L1x2_SAVE:
#endif #endif
ZTRMM_L1x2_END: .LZTRMM_L1x2_END:
ZTRMM_L1x1_BEGIN: .LZTRMM_L1x1_BEGIN:
andi. T1, M, 1 andi. T1, M, 1
ble ZTRMM_L1x1_END ble .LZTRMM_L1x1_END
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mr BO, B // B -> BO mr BO, B // B -> BO
@ -1087,11 +1123,11 @@ ZTRMM_L1x1_BEGIN:
mr KKK, T1 mr KKK, T1
mr K1, T1 mr K1, T1
srawi. L, K1, 3 // KTEMP / 8 -> L srawi. L, K1, 3 // KTEMP / 8 -> L
ble ZTRMM_L1x1_SUB0 ble .LZTRMM_L1x1_SUB0
cmpwi cr0, L, 1 cmpwi cr0, L, 1
ble ZTRMM_L1x1_SUB4 ble .LZTRMM_L1x1_SUB4
ZTRMM_L1x1_LOOP_START: .LZTRMM_L1x1_LOOP_START:
LOAD1x1_1 LOAD1x1_1
KERNEL1x1_I1 KERNEL1x1_I1
@ -1105,11 +1141,11 @@ ZTRMM_L1x1_LOOP_START:
KERNEL1x1_2 KERNEL1x1_2
addic. L, L, -2 addic. L, L, -2
ble ZTRMM_L1x1_LOOP_END ble .LZTRMM_L1x1_LOOP_END
.align 5 .align 5
ZTRMM_L1x1_LOOP: .LZTRMM_L1x1_LOOP:
KERNEL1x1_1 KERNEL1x1_1
KERNEL1x1_2 KERNEL1x1_2
@ -1122,9 +1158,9 @@ ZTRMM_L1x1_LOOP:
KERNEL1x1_2 KERNEL1x1_2
addic. L, L, -1 addic. L, L, -1
bgt ZTRMM_L1x1_LOOP bgt .LZTRMM_L1x1_LOOP
ZTRMM_L1x1_LOOP_END: .LZTRMM_L1x1_LOOP_END:
KERNEL1x1_1 KERNEL1x1_1
KERNEL1x1_2 KERNEL1x1_2
@ -1136,9 +1172,9 @@ ZTRMM_L1x1_LOOP_END:
KERNEL1x1_1 KERNEL1x1_1
KERNEL1x1_E2 KERNEL1x1_E2
b ZTRMM_L1x1_SUB1 b .LZTRMM_L1x1_SUB1
ZTRMM_L1x1_SUB4: .LZTRMM_L1x1_SUB4:
KERNEL1x1_SUBI1 KERNEL1x1_SUBI1
KERNEL1x1_SUB1 KERNEL1x1_SUB1
@ -1150,31 +1186,31 @@ ZTRMM_L1x1_SUB4:
KERNEL1x1_SUB1 KERNEL1x1_SUB1
KERNEL1x1_SUB1 KERNEL1x1_SUB1
b ZTRMM_L1x1_SUB1 b .LZTRMM_L1x1_SUB1
ZTRMM_L1x1_SUB0: .LZTRMM_L1x1_SUB0:
andi. L, K1, 7 // K1 & 7 -> L andi. L, K1, 7 // K1 & 7 -> L
KERNEL1x1_SUBI1 KERNEL1x1_SUBI1
addic. L, L, -1 addic. L, L, -1
ble ZTRMM_L1x1_SAVE ble .LZTRMM_L1x1_SAVE
b ZTRMM_L1x1_SUB2 b .LZTRMM_L1x1_SUB2
ZTRMM_L1x1_SUB1: .LZTRMM_L1x1_SUB1:
andi. L, K1, 7 // K1 & 7 -> L andi. L, K1, 7 // K1 & 7 -> L
ble ZTRMM_L1x1_SAVE ble .LZTRMM_L1x1_SAVE
ZTRMM_L1x1_SUB2: .LZTRMM_L1x1_SUB2:
KERNEL1x1_SUB1 KERNEL1x1_SUB1
addic. L, L, -1 addic. L, L, -1
bgt ZTRMM_L1x1_SUB2 bgt .LZTRMM_L1x1_SUB2
ZTRMM_L1x1_SAVE: .LZTRMM_L1x1_SAVE:
SAVE1x1 SAVE1x1
@ -1191,11 +1227,11 @@ ZTRMM_L1x1_SAVE:
#endif #endif
ZTRMM_L1x1_END: .LZTRMM_L1x1_END:
#if !defined(LEFT) #if !defined(LEFT)
addi KK, KK, 1 // KK += Number of values in B addi KK, KK, 1 // KK += Number of values in B
#endif #endif
ZTRMM_L1_END: .LZTRMM_L1_END: