/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define unit_size 4
#define DISP64(ind,disp) (ind*unit_size*64+disp)
#define DISP32(ind,disp) (ind*unit_size*32+disp)
#define DISP16(ind,disp) (ind*unit_size*16+disp)
#define DISP8(ind,disp) (ind*unit_size*8+disp)
#define DISP4(ind,disp) (ind*unit_size*4+disp)
#define DISP2(ind,disp) (ind*unit_size*2+disp)
#define DISP1(ind,disp) (ind*unit_size+disp)
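/*
 * Reading aid (annotation, not part of the original kernel): with
 * unit_size = 4 bytes per float, DISP32(ind,disp) expands to
 * (ind*4*32 + disp), the byte offset of the ind-th 32-float panel
 * plus disp.  For example
 *
 *     lxv vs4, DISP32(2,16)(AO)
 *
 * loads the 16-byte vector at AO + 2*128 + 16 = AO + 272.
 */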
/**********************************************************************************************
* Macros for N=8 and M=16
**********************************************************************************************/
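/*
 * Register usage of the 8x16 kernels, summarized from the macros below
 * (permute_mask and save_permute_1/2 are defined by the including
 * kernel file):
 *   vs0-vs3   - 16 floats of A for one K iteration
 *   vs24,vs28 - 8 floats of B; vs25-vs27 and vs29-vs31 are their
 *               xxpermdi/xxperm "butterfly" replicas
 *   vs32-vs63 - the 8x16 accumulator tile, cleared by Zero8X16
 * The unrolled-by-two variants stage the next iteration's A in vs4-vs7
 * and its B replicas in vs8-vs15.
 */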
.macro KERNEL8x16_L1_L4  Index,IsLast
    KERNEL8x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
.endm

.macro KERNEL8x16_I1_L4  OffsetA,OffsetB, Index,IsLast
    KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
.endm

.macro KERNEL8x16_I1_L4_2  OffsetA,OffsetB, Index,IsLast
    KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
.endm

.macro KERNEL8x16_I1_L4_3  OffsetA,OffsetB, Index,IsLast
    KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
.endm

.macro KERNEL8x16_I2_L4_2  AREG,BREG,OffsetA,OffsetB, Index,IsLast
    KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
.endm

.macro KERNEL8x16_I2_L4_3  AREG,BREG,OffsetA,OffsetB, Index,IsLast
    KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1
.endm

.macro Zero8X16
    xxlxor vs32, vs32, vs32
    xxlxor vs33, vs33, vs33
    xxlxor vs34, vs34, vs34
    xxlxor vs35, vs35, vs35
    xxlxor vs36, vs36, vs36
    xxlxor vs37, vs37, vs37
    xxlxor vs38, vs38, vs38
    xxlxor vs39, vs39, vs39
    xxlxor vs40, vs40, vs40
    xxlxor vs41, vs41, vs41
    xxlxor vs42, vs42, vs42
    xxlxor vs43, vs43, vs43
    xxlxor vs44, vs44, vs44
    xxlxor vs45, vs45, vs45
    xxlxor vs46, vs46, vs46
    xxlxor vs47, vs47, vs47
    xxlxor vs48, vs48, vs48
    xxlxor vs49, vs49, vs49
    xxlxor vs50, vs50, vs50
    xxlxor vs51, vs51, vs51
    xxlxor vs52, vs52, vs52
    xxlxor vs53, vs53, vs53
    xxlxor vs54, vs54, vs54
    xxlxor vs55, vs55, vs55
    xxlxor vs56, vs56, vs56
    xxlxor vs57, vs57, vs57
    xxlxor vs58, vs58, vs58
    xxlxor vs59, vs59, vs59
    xxlxor vs60, vs60, vs60
    xxlxor vs61, vs61, vs61
    xxlxor vs62, vs62, vs62
    xxlxor vs63, vs63, vs63
.endm

.macro LOAD8x16  OffsetA,OffsetB
    lxv vs24, (\OffsetB+0)(BO)
    lxv vs28, (\OffsetB+16)(BO)
    xxperm vs26, vs24, permute_mask
    xxperm vs30, vs28, permute_mask
    lxv vs0, (\OffsetA+0)(AO)
    lxv vs1, (\OffsetA+16)(AO)
    xxpermdi vs25, vs24, vs24, 2
    xxpermdi vs29, vs28, vs28, 2
    lxv vs2, (\OffsetA+32)(AO)
    lxv vs3, (\OffsetA+48)(AO)
    xxpermdi vs27, vs26, vs26, 2
    xxpermdi vs31, vs30, vs30, 2
.endm

.macro END8x16_NORMAL
    END8x16 0, AO, BO, 64,32
.endm

.macro END8x16_WITHOUT_ADD
    END8x16 0, AO,BO,0,0
.endm

.macro END8x16 First, AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
    addi \BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
    addi \AREG, \AREG, \OffsetA
.endif

.if \First==1
    xvmulsp vs32, vs0, vs24
    xvmulsp vs33, vs1, vs24
    xvmulsp vs34, vs2, vs24
    xvmulsp vs35, vs3, vs24

    xvmulsp vs36, vs0, vs25
    xvmulsp vs37, vs1, vs25
    xvmulsp vs38, vs2, vs25
    xvmulsp vs39, vs3, vs25

    xvmulsp vs40, vs0, vs26
    xvmulsp vs41, vs1, vs26
    xvmulsp vs42, vs2, vs26
    xvmulsp vs43, vs3, vs26

    xvmulsp vs44, vs0, vs27
    xvmulsp vs45, vs1, vs27
    xvmulsp vs46, vs2, vs27
    xvmulsp vs47, vs3, vs27

    xvmulsp vs48, vs0, vs28
    xvmulsp vs49, vs1, vs28
    xvmulsp vs50, vs2, vs28
    xvmulsp vs51, vs3, vs28

    xvmulsp vs52, vs0, vs29
    xvmulsp vs53, vs1, vs29
    xvmulsp vs54, vs2, vs29
    xvmulsp vs55, vs3, vs29

    xvmulsp vs56, vs0, vs30
    xvmulsp vs57, vs1, vs30
    xvmulsp vs58, vs2, vs30
    xvmulsp vs59, vs3, vs30

    xvmulsp vs60, vs0, vs31
    xvmulsp vs61, vs1, vs31
    xvmulsp vs62, vs2, vs31
    xvmulsp vs63, vs3, vs31
.else
    xvmaddasp vs32, vs0, vs24
    xvmaddasp vs33, vs1, vs24
    xvmaddasp vs34, vs2, vs24
    xvmaddasp vs35, vs3, vs24

    xvmaddasp vs36, vs0, vs25
    xvmaddasp vs37, vs1, vs25
    xvmaddasp vs38, vs2, vs25
    xvmaddasp vs39, vs3, vs25

    xvmaddasp vs40, vs0, vs26
    xvmaddasp vs41, vs1, vs26
    xvmaddasp vs42, vs2, vs26
    xvmaddasp vs43, vs3, vs26

    xvmaddasp vs44, vs0, vs27
    xvmaddasp vs45, vs1, vs27
    xvmaddasp vs46, vs2, vs27
    xvmaddasp vs47, vs3, vs27

    xvmaddasp vs48, vs0, vs28
    xvmaddasp vs49, vs1, vs28
    xvmaddasp vs50, vs2, vs28
    xvmaddasp vs51, vs3, vs28

    xvmaddasp vs52, vs0, vs29
    xvmaddasp vs53, vs1, vs29
    xvmaddasp vs54, vs2, vs29
    xvmaddasp vs55, vs3, vs29

    xvmaddasp vs56, vs0, vs30
    xvmaddasp vs57, vs1, vs30
    xvmaddasp vs58, vs2, vs30
    xvmaddasp vs59, vs3, vs30

    xvmaddasp vs60, vs0, vs31
    xvmaddasp vs61, vs1, vs31
    xvmaddasp vs62, vs2, vs31
    xvmaddasp vs63, vs3, vs31
.endif
.endm

.macro KERNEL8x16_L1_L4_I  AREG,BREG, OffsetA,OffsetB, Index,IsLast, Complete
    KERNEL8x16_2 \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0
    KERNEL8x16_2 \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete
.endm
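/*
 * Note (annotation): KERNEL8x16_L1_L4_I above covers four K
 * iterations.  Each KERNEL8x16_2 call consumes two of them (128 bytes
 * of A, 64 bytes of B), hence the (\Index*2) and (\Index*2+1)
 * sub-indices; only the second call may advance the pointers
 * (\IsLast) or suppress the lookahead loads (\Complete).
 */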
.macro KERNEL8x16 First
    LOAD8x16 0,0
    END8x16 \First, AO, BO, 64,32
.endm

.macro LOAD8x16_2
    LOAD8x16_2O AO,BO, 0,0
.endm

.macro LOAD8x16_2O  AREG,BREG, OffsetA,OffsetB
    lxv vs8, (\OffsetB)(\BREG)
    lxv vs12, (16+\OffsetB)(\BREG)
    lxv vs24, (32+\OffsetB)(\BREG)
    lxv vs28, (32+16+\OffsetB)(\BREG)
    lxv vs4, (0+\OffsetA)(\AREG)
    lxv vs5, (16+\OffsetA)(\AREG)
    xxperm vs10, vs8, permute_mask
    xxperm vs14, vs12, permute_mask
    lxv vs6, (32+\OffsetA)(\AREG)
    lxv vs7, (48+\OffsetA)(\AREG)
    xxpermdi vs9, vs8, vs8, 2
    xxpermdi vs13, vs12, vs12, 2
    lxv vs0, (64+\OffsetA)(\AREG)
    lxv vs1, (64+16+\OffsetA)(\AREG)
    xxpermdi vs11, vs10, vs10, 2
    xxpermdi vs15, vs14, vs14, 2
    lxv vs2, (64+32+\OffsetA)(\AREG)
    lxv vs3, (64+48+\OffsetA)(\AREG)

    xxperm vs26, vs24, permute_mask
    xxperm vs30, vs28, permute_mask
    xxpermdi vs25, vs24, vs24, 2
    xxpermdi vs29, vs28, vs28, 2
    xxpermdi vs27, vs26, vs26, 2
    xxpermdi vs31, vs30, vs30, 2
.endm
.macro END8x16_2
    /* for the unrolled-by-two flow the preload offsets are 128 (A) and 64 (B) */
    KERNEL8x16_2 AO,BO, 128,64,0 ,1,1
.endm
.macro KERNEL8x16_E2  OffsetA,OffsetB, Index,IsLast
    KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
.endm

.macro KERNEL8x16_L2  OffsetA,OffsetB, Index,IsLast
    KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
.endm
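/*
 * Note (annotation): KERNEL8x16_2 below is software-pipelined over two
 * K iterations: the multiply-adds on the current vs4-vs7/vs8-vs15 and
 * vs0-vs3/vs24-vs31 data are interleaved with the lxv/xxperm/xxpermdi
 * that prepare the next pair, and those lookahead operations are
 * skipped when \Complete==1 (the final trip, where nothing further
 * should be read).
 */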
.macro KERNEL8x16_2  AREG,BREG, OffsetA,OffsetB, Index,IsLast, Complete
    xvmaddasp vs32, vs4, vs8
    xvmaddasp vs33, vs5, vs8
    xvmaddasp vs48, vs4, vs12
    xvmaddasp vs49, vs5, vs12

    xvmaddasp vs40, vs4, vs10
    xvmaddasp vs41, vs5, vs10
    xvmaddasp vs56, vs4, vs14
    xvmaddasp vs57, vs5, vs14

    xvmaddasp vs36, vs4, vs9
    xvmaddasp vs37, vs5, vs9
    xvmaddasp vs52, vs4, vs13
    xvmaddasp vs53, vs5, vs13

    xvmaddasp vs44, vs4, vs11
    xvmaddasp vs45, vs5, vs11
    xvmaddasp vs60, vs4, vs15
    xvmaddasp vs61, vs5, vs15

.if \Complete==0
    lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG)
    lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)
.endif

    xvmaddasp vs34, vs6, vs8
    xvmaddasp vs35, vs7, vs8
    xvmaddasp vs50, vs6, vs12
    xvmaddasp vs51, vs7, vs12
.if \Complete==0
    lxv vs8, DISP16(\Index,\OffsetB)(\BREG)
    lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG)
.endif
    xvmaddasp vs42, vs6, vs10
    xvmaddasp vs43, vs7, vs10
    xvmaddasp vs58, vs6, vs14
    xvmaddasp vs59, vs7, vs14
.if \Complete==0
    xxperm vs10, vs8, permute_mask
    xxperm vs14, vs12, permute_mask
.endif
    xvmaddasp vs38, vs6, vs9
    xvmaddasp vs39, vs7, vs9
    xvmaddasp vs54, vs6, vs13
    xvmaddasp vs55, vs7, vs13
.if \Complete==0
    xxpermdi vs9, vs8, vs8, 2
    xxpermdi vs13, vs12, vs12, 2
.endif
    xvmaddasp vs46, vs6, vs11
    xvmaddasp vs47, vs7, vs11
    xvmaddasp vs62, vs6, vs15
    xvmaddasp vs63, vs7, vs15
.if \Complete==0
    xxpermdi vs11, vs10, vs10, 2
    xxpermdi vs15, vs14, vs14, 2
.endif

.if \Complete==0
    lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG)
    lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG)
.endif

    xvmaddasp vs32, vs0, vs24
    xvmaddasp vs33, vs1, vs24
    xvmaddasp vs48, vs0, vs28
    xvmaddasp vs49, vs1, vs28
    xvmaddasp vs40, vs0, vs26
    xvmaddasp vs41, vs1, vs26
    xvmaddasp vs56, vs0, vs30
    xvmaddasp vs57, vs1, vs30
    xvmaddasp vs36, vs0, vs25
    xvmaddasp vs37, vs1, vs25
    xvmaddasp vs52, vs0, vs29
    xvmaddasp vs53, vs1, vs29
    xvmaddasp vs44, vs0, vs27
    xvmaddasp vs45, vs1, vs27
    xvmaddasp vs60, vs0, vs31
    xvmaddasp vs61, vs1, vs31
.if \Complete==0
    lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG)
    lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG)
.endif

    xvmaddasp vs34, vs2, vs24
    xvmaddasp vs35, vs3, vs24
    xvmaddasp vs50, vs2, vs28
    xvmaddasp vs51, vs3, vs28
.if \Complete==0
    lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG)
    lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG)
.endif
    xvmaddasp vs42, vs2, vs26
    xvmaddasp vs43, vs3, vs26
    xvmaddasp vs58, vs2, vs30
    xvmaddasp vs59, vs3, vs30
.if \Complete==0
    xxperm vs26, vs24, permute_mask
    xxperm vs30, vs28, permute_mask
.endif
    xvmaddasp vs38, vs2, vs25
    xvmaddasp vs39, vs3, vs25
    xvmaddasp vs54, vs2, vs29
    xvmaddasp vs55, vs3, vs29
.if \Complete==0
    xxpermdi vs25, vs24, vs24, 2
    xxpermdi vs29, vs28, vs28, 2
.endif
    xvmaddasp vs46, vs2, vs27
    xvmaddasp vs47, vs3, vs27
    xvmaddasp vs62, vs2, vs31
    xvmaddasp vs63, vs3, vs31
.if \Complete==0
    xxpermdi vs27, vs26, vs26, 2
    xxpermdi vs31, vs30, vs30, 2
.endif
.if \Complete==0
    lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG)
    lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG)
.endif

.if \IsLast==1
.if \Complete==1
    addi \BREG, \BREG, DISP16(\Index,\OffsetB)
    addi \AREG, \AREG, DISP32(\Index,\OffsetA)
.else
    addi \BREG, \BREG, DISP16(\Index,64)
    addi \AREG, \AREG, DISP32(\Index,128)
.endif
.endif

.endm
.macro SAVE8x16
    slwi T10, LDC, 1
    add T1, CO, LDC

    add T2, CO, T10
    add T3, T1, T10

    add T4, T2, T10
    add T5, T3, T10

    add T6, T4, T10
    add T7, T5, T10

    /* permute to restore the butterfly rank-1 update to the normal promoted layout */
    /* permute 16: vs8  -> MEM(CO)     vs9  -> MEM(CO+LDC)     vs10 -> MEM(CO+2*LDC)     vs11 -> MEM(CO+3*LDC) */
    /* permute 16: vs12 -> MEM(16+CO)  vs13 -> MEM(16+CO+LDC)  vs14 -> MEM(16+CO+2*LDC)  vs15 -> MEM(16+CO+3*LDC) */
    /* permute 16: vs16 -> MEM(32+CO)  vs17 -> MEM(32+CO+LDC)  vs18 -> MEM(32+CO+2*LDC)  vs19 -> MEM(32+CO+3*LDC) */
    /* permute 16: vs24 -> MEM(48+CO)  vs25 -> MEM(48+CO+LDC)  vs26 -> MEM(48+CO+2*LDC)  vs27 -> MEM(48+CO+3*LDC) */
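    /*
     * Annotation: the xxmrglw/xxmrghw pairs interleave words from four
     * accumulators, and the save_permute_1/save_permute_2 shuffles
     * then yield complete rows of C; the lxv loads of the old C values
     * (skipped for TRMMKERNEL) are interleaved here, apparently so
     * their latency is hidden before the alpha multiply-add and the
     * stxv stores.
     */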
    xxmrglw vs8, vs32, vs44
    xxmrglw vs10, vs36, vs40

    xxmrghw vs1, vs32, vs44
    xxmrghw vs0, vs36, vs40

    xxmrglw vs12, vs33, vs45
    xxmrglw vs14, vs37, vs41

    xxmrghw vs2, vs37, vs41
    xxmrghw vs3, vs33, vs45
#ifndef TRMMKERNEL
    lxv vs32, 0(CO)
    lxv vs33, 16(CO)
#endif
    xxmrglw vs16, vs34, vs46
    xxmrglw vs18, vs38, vs42

    xxlor vs9, vs8, vs8
    xxlor vs11, vs10, vs10

    xxmrghw vs4, vs38, vs42
    xxmrghw vs5, vs34, vs46

    xxlor vs13, vs12, vs12
    xxlor vs15, vs14, vs14

    xxmrglw vs24, vs35, vs47
    xxmrglw vs26, vs39, vs43

    xxlor vs17, vs16, vs16
    xxlor vs19, vs18, vs18

    xxmrghw vs30, vs39, vs43
    xxmrghw vs31, vs35, vs47
#ifndef TRMMKERNEL
    lxv vs34, 32(CO)
    lxv vs35, 48(CO)
#endif
    xxperm vs8, vs0, save_permute_1
    xxperm vs10, vs1, save_permute_1
#ifndef TRMMKERNEL
    lxv vs36, 0(T1)
    lxv vs37, 16(T1)
#endif
    xxperm vs9, vs0, save_permute_2
    xxperm vs11, vs1, save_permute_2

#ifndef TRMMKERNEL
    lxv vs38, 32(T1)
    lxv vs39, 48(T1)
#endif

    xxlor vs25, vs24, vs24
    xxlor vs27, vs26, vs26

#ifndef TRMMKERNEL
    lxv vs40, 0(T2)
    lxv vs41, 16(T2)
#endif

    xxperm vs12, vs2, save_permute_1
    xxperm vs14, vs3, save_permute_1
#ifndef TRMMKERNEL
    lxv vs42, 32(T2)
    lxv vs43, 48(T2)
#endif

    xxperm vs13, vs2, save_permute_2
    xxperm vs15, vs3, save_permute_2
#ifndef TRMMKERNEL
    lxv vs44, 0(T3)
    lxv vs45, 16(T3)
#endif
    xxperm vs16, vs4, save_permute_1
    xxperm vs18, vs5, save_permute_1
#ifndef TRMMKERNEL
    lxv vs46, 32(T3)
    lxv vs47, 48(T3)
#endif

    xxperm vs17, vs4, save_permute_2
    xxperm vs19, vs5, save_permute_2
#ifdef TRMMKERNEL
    xvmulsp vs32, vs8, alpha_r
    xvmulsp vs33, vs12, alpha_r
#else
    xvmaddasp vs32, vs8, alpha_r
    xvmaddasp vs33, vs12, alpha_r
#endif
    xxperm vs24, vs30, save_permute_1
    xxperm vs26, vs31, save_permute_1

    stxv vs32, 0(CO)
    stxv vs33, 16(CO)
#ifdef TRMMKERNEL
    xvmulsp vs34, vs16, alpha_r
    xvmulsp vs35, vs24, alpha_r
#else
    xvmaddasp vs34, vs16, alpha_r
    xvmaddasp vs35, vs24, alpha_r
#endif

    xxperm vs25, vs30, save_permute_2
    xxperm vs27, vs31, save_permute_2

    stxv vs34, 32(CO)
    stxv vs35, 48(CO)
#ifdef TRMMKERNEL
    xvmulsp vs36, vs9, alpha_r
    xvmulsp vs37, vs13, alpha_r
#else
    xvmaddasp vs36, vs9, alpha_r
    xvmaddasp vs37, vs13, alpha_r
#endif
    stxv vs36, 0(T1)
    stxv vs37, 16(T1)
#ifdef TRMMKERNEL
    xvmulsp vs38, vs17, alpha_r
    xvmulsp vs39, vs25, alpha_r
#else
    xvmaddasp vs38, vs17, alpha_r
    xvmaddasp vs39, vs25, alpha_r
#endif
    stxv vs38, 32(T1)
    stxv vs39, 48(T1)

#ifdef TRMMKERNEL
    xvmulsp vs40, vs10, alpha_r
    xvmulsp vs41, vs14, alpha_r
#else
    xvmaddasp vs40, vs10, alpha_r
    xvmaddasp vs41, vs14, alpha_r
#endif

    stxv vs40, 0(T2)
    stxv vs41, 16(T2)
#ifdef TRMMKERNEL
    xvmulsp vs42, vs18, alpha_r
    xvmulsp vs43, vs26, alpha_r
#else
    xvmaddasp vs42, vs18, alpha_r
    xvmaddasp vs43, vs26, alpha_r
#endif
    stxv vs42, 32(T2)
    stxv vs43, 48(T2)
#ifdef TRMMKERNEL
    xvmulsp vs44, vs11, alpha_r
    xvmulsp vs45, vs15, alpha_r
#else
    xvmaddasp vs44, vs11, alpha_r
    xvmaddasp vs45, vs15, alpha_r
#endif
    stxv vs44, 0(T3)
    stxv vs45, 16(T3)
#ifdef TRMMKERNEL
    xvmulsp vs46, vs19, alpha_r
    xvmulsp vs47, vs27, alpha_r
#else
    xvmaddasp vs46, vs19, alpha_r
    xvmaddasp vs47, vs27, alpha_r
#endif
    stxv vs46, 32(T3)
    stxv vs47, 48(T3)

    /* the same sequence for the second half (rows T4-T7, accumulators vs48-vs63) */
#ifndef TRMMKERNEL
    lxv vs32, 0(T4)
    lxv vs33, 16(T4)
#endif
    xxmrglw vs8, vs48, vs60
    xxmrglw vs10, vs52, vs56
#ifndef TRMMKERNEL
    lxv vs34, 32(T4)
    lxv vs35, 48(T4)
#endif
    xxmrghw vs1, vs48, vs60
    xxmrghw vs0, vs52, vs56
#ifndef TRMMKERNEL
    lxv vs36, 0(T5)
    lxv vs37, 16(T5)
#endif
    xxmrglw vs12, vs49, vs61
    xxmrglw vs14, vs53, vs57
#ifndef TRMMKERNEL
    lxv vs38, 32(T5)
    lxv vs39, 48(T5)
#endif

    xxmrghw vs2, vs53, vs57
    xxmrghw vs3, vs49, vs61
#ifndef TRMMKERNEL
    lxv vs40, 0(T6)
    lxv vs41, 16(T6)
#endif
    xxmrglw vs16, vs50, vs62
    xxmrglw vs18, vs54, vs58
#ifndef TRMMKERNEL
    lxv vs42, 32(T6)
    lxv vs43, 48(T6)
#endif
    xxlor vs9, vs8, vs8
    xxlor vs11, vs10, vs10
    xxmrghw vs4, vs54, vs58
    xxmrghw vs5, vs50, vs62
#ifndef TRMMKERNEL
    lxv vs44, 0(T7)
    lxv vs45, 16(T7)
#endif
    xxlor vs13, vs12, vs12
    xxlor vs15, vs14, vs14

    xxmrglw vs24, vs51, vs63
    xxmrglw vs26, vs55, vs59
#ifndef TRMMKERNEL
    lxv vs46, 32(T7)
    lxv vs47, 48(T7)
#endif
    xxlor vs17, vs16, vs16
    xxlor vs19, vs18, vs18
    xxmrghw vs30, vs55, vs59
    xxmrghw vs31, vs51, vs63

    xxperm vs8, vs0, save_permute_1
    xxperm vs10, vs1, save_permute_1

    xxperm vs9, vs0, save_permute_2
    xxperm vs11, vs1, save_permute_2

    xxlor vs25, vs24, vs24
    xxlor vs27, vs26, vs26
    xxperm vs12, vs2, save_permute_1
    xxperm vs14, vs3, save_permute_1

    xxperm vs13, vs2, save_permute_2
    xxperm vs15, vs3, save_permute_2
#ifdef TRMMKERNEL
    xvmulsp vs32, vs8, alpha_r
    xvmulsp vs33, vs12, alpha_r
#else
    xvmaddasp vs32, vs8, alpha_r
    xvmaddasp vs33, vs12, alpha_r
#endif
    xxperm vs16, vs4, save_permute_1
    xxperm vs18, vs5, save_permute_1
    stxv vs32, 0(T4)
    stxv vs33, 16(T4)
    xxperm vs17, vs4, save_permute_2
    xxperm vs19, vs5, save_permute_2
    xxperm vs24, vs30, save_permute_1
    xxperm vs26, vs31, save_permute_1
    xxperm vs25, vs30, save_permute_2
    xxperm vs27, vs31, save_permute_2

#ifdef TRMMKERNEL
    xvmulsp vs34, vs16, alpha_r
    xvmulsp vs35, vs24, alpha_r
#else
    xvmaddasp vs34, vs16, alpha_r
    xvmaddasp vs35, vs24, alpha_r
#endif
    stxv vs34, 32(T4)
    stxv vs35, 48(T4)

#ifdef TRMMKERNEL
    xvmulsp vs36, vs9, alpha_r
    xvmulsp vs37, vs13, alpha_r
#else
    xvmaddasp vs36, vs9, alpha_r
    xvmaddasp vs37, vs13, alpha_r
#endif
    stxv vs36, 0(T5)
    stxv vs37, 16(T5)

#ifdef TRMMKERNEL
    xvmulsp vs38, vs17, alpha_r
    xvmulsp vs39, vs25, alpha_r
#else
    xvmaddasp vs38, vs17, alpha_r
    xvmaddasp vs39, vs25, alpha_r
#endif

    stxv vs38, 32(T5)
    stxv vs39, 48(T5)

#ifdef TRMMKERNEL
    xvmulsp vs40, vs10, alpha_r
    xvmulsp vs41, vs14, alpha_r
#else
    xvmaddasp vs40, vs10, alpha_r
    xvmaddasp vs41, vs14, alpha_r
#endif
    stxv vs40, 0(T6)
    stxv vs41, 16(T6)
#ifdef TRMMKERNEL
    xvmulsp vs42, vs18, alpha_r
    xvmulsp vs43, vs26, alpha_r
#else
    xvmaddasp vs42, vs18, alpha_r
    xvmaddasp vs43, vs26, alpha_r
#endif
    stxv vs42, 32(T6)
    stxv vs43, 48(T6)
#ifdef TRMMKERNEL
    xvmulsp vs44, vs11, alpha_r
    xvmulsp vs45, vs15, alpha_r
#else
    xvmaddasp vs44, vs11, alpha_r
    xvmaddasp vs45, vs15, alpha_r
#endif

    stxv vs44, 0(T7)
    stxv vs45, 16(T7)
#ifdef TRMMKERNEL
    xvmulsp vs46, vs19, alpha_r
    xvmulsp vs47, vs27, alpha_r
#else
    xvmaddasp vs46, vs19, alpha_r
    xvmaddasp vs47, vs27, alpha_r
#endif

    stxv vs46, 32(T7)
    stxv vs47, 48(T7)

    addi CO, CO, 64
.endm
/**********************************************************************************************
* Macros for N=8 and M=8
**********************************************************************************************/
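/*
 * Annotation: the 8x8 variants mirror the 8x16 flow with half as many
 * A vectors (vs0,vs1 hold 8 floats) and accumulators (the vs32..vs61
 * pairs), so the save step needs only two stxv per row of C.
 */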
.macro LOAD8x8_1
    LOAD8x8 1
.endm

.macro LOAD8x8_0
    LOAD8x8 0
.endm

.macro KERNEL8x8_L1_L4  Index,IsLast
    KERNEL8x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
.endm

.macro KERNEL8x8_I1_L4  OffsetA,OffsetB, Index,IsLast
    KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
.endm

.macro KERNEL8x8_I1_L4_2  OffsetA,OffsetB, Index,IsLast
    KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
.endm

.macro KERNEL8x8_I1_L4_3  OffsetA,OffsetB, Index,IsLast
    KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
.endm

.macro KERNEL8x8_I1_L2_3  OffsetA,OffsetB, Index,IsLast
    KERNEL8x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1
.endm

.macro KERNEL8x8_I2_L4_2  AREG,BREG,OffsetA,OffsetB, Index,IsLast
    KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
.endm

.macro KERNEL8x8_I2_L4_3  AREG,BREG,OffsetA,OffsetB, Index,IsLast
    KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1
.endm

.macro END8x8_NORMAL
    END8x8 0, AO, BO, 32,32
.endm

.macro Zero8X8
    xxlxor vs32, vs32, vs32
    xxlxor vs33, vs33, vs33

    xxlxor vs36, vs36, vs36
    xxlxor vs37, vs37, vs37

    xxlxor vs40, vs40, vs40
    xxlxor vs41, vs41, vs41

    xxlxor vs44, vs44, vs44
    xxlxor vs45, vs45, vs45

    xxlxor vs48, vs48, vs48
    xxlxor vs49, vs49, vs49

    xxlxor vs52, vs52, vs52
    xxlxor vs53, vs53, vs53

    xxlxor vs56, vs56, vs56
    xxlxor vs57, vs57, vs57

    xxlxor vs60, vs60, vs60
    xxlxor vs61, vs61, vs61
.endm

.macro LOAD8x8  Zero
    lxv vs24, 0(BO)
    lxv vs28, 16(BO)
    lxv vs0, 0(AO)
    lxv vs1, 16(AO)

    xxperm vs26, vs24, permute_mask
    xxperm vs30, vs28, permute_mask
    xxpermdi vs25, vs24, vs24, 2
    xxpermdi vs29, vs28, vs28, 2

    xxpermdi vs27, vs26, vs26, 2
    xxpermdi vs31, vs30, vs30, 2

.if \Zero==1
    xxlxor vs32, vs32, vs32
    xxlxor vs33, vs33, vs33
    xxlxor vs36, vs36, vs36
    xxlxor vs37, vs37, vs37
    xxlxor vs40, vs40, vs40
    xxlxor vs41, vs41, vs41
    xxlxor vs44, vs44, vs44
    xxlxor vs45, vs45, vs45
    xxlxor vs48, vs48, vs48
    xxlxor vs49, vs49, vs49
    xxlxor vs52, vs52, vs52
    xxlxor vs53, vs53, vs53
    xxlxor vs56, vs56, vs56
    xxlxor vs57, vs57, vs57
    xxlxor vs60, vs60, vs60
    xxlxor vs61, vs61, vs61
.endif
.endm

.macro END8x8 First, AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
    addi \BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
    addi \AREG, \AREG, \OffsetA
.endif

.if \First==1
    xvmulsp vs32, vs0, vs24
    xvmulsp vs33, vs1, vs24

    xvmulsp vs36, vs0, vs25
    xvmulsp vs37, vs1, vs25

    xvmulsp vs40, vs0, vs26
    xvmulsp vs41, vs1, vs26

    xvmulsp vs44, vs0, vs27
    xvmulsp vs45, vs1, vs27

    xvmulsp vs48, vs0, vs28
    xvmulsp vs49, vs1, vs28

    xvmulsp vs52, vs0, vs29
    xvmulsp vs53, vs1, vs29

    xvmulsp vs56, vs0, vs30
    xvmulsp vs57, vs1, vs30

    xvmulsp vs60, vs0, vs31
    xvmulsp vs61, vs1, vs31
.else
    xvmaddasp vs32, vs0, vs24
    xvmaddasp vs33, vs1, vs24

    xvmaddasp vs36, vs0, vs25
    xvmaddasp vs37, vs1, vs25

    xvmaddasp vs40, vs0, vs26
    xvmaddasp vs41, vs1, vs26

    xvmaddasp vs44, vs0, vs27
    xvmaddasp vs45, vs1, vs27

    xvmaddasp vs48, vs0, vs28
    xvmaddasp vs49, vs1, vs28

    xvmaddasp vs52, vs0, vs29
    xvmaddasp vs53, vs1, vs29

    xvmaddasp vs56, vs0, vs30
    xvmaddasp vs57, vs1, vs30

    xvmaddasp vs60, vs0, vs31
    xvmaddasp vs61, vs1, vs31
.endif
.endm

.macro KERNEL8x8_L1_L4_I  AREG,BREG, OffsetA,OffsetB, Index,IsLast, Complete
    lxv vs8, DISP32(\Index,0+\OffsetB)(\BREG)
    lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG)

    lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG)
    lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)

    xxperm vs10, vs8, permute_mask
    xxperm vs14, vs12, permute_mask

    xvmaddasp vs32, vs0, vs24
    xvmaddasp vs33, vs1, vs24

    xxpermdi vs9, vs8, vs8, 2
    xxpermdi vs13, vs12, vs12, 2

    xvmaddasp vs36, vs0, vs25
    xvmaddasp vs37, vs1, vs25

    xxpermdi vs11, vs10, vs10, 2
    xxpermdi vs15, vs14, vs14, 2

    xvmaddasp vs40, vs0, vs26
    xvmaddasp vs41, vs1, vs26

    xvmaddasp vs44, vs0, vs27
    xvmaddasp vs45, vs1, vs27

    xvmaddasp vs48, vs0, vs28
    xvmaddasp vs49, vs1, vs28

    xvmaddasp vs52, vs0, vs29
    xvmaddasp vs53, vs1, vs29
    lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG)
    lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG)
    xvmaddasp vs56, vs0, vs30
    xvmaddasp vs57, vs1, vs30

    xvmaddasp vs60, vs0, vs31
    xvmaddasp vs61, vs1, vs31

    xxperm vs26, vs24, permute_mask
    xxperm vs30, vs28, permute_mask

    lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG)
    lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG)

    xxpermdi vs25, vs24, vs24, 2
    xxpermdi vs29, vs28, vs28, 2

    xvmaddasp vs32, vs4, vs8
    xvmaddasp vs33, vs5, vs8

    xvmaddasp vs36, vs4, vs9
    xvmaddasp vs37, vs5, vs9

    xxpermdi vs27, vs26, vs26, 2
    xxpermdi vs31, vs30, vs30, 2

    xvmaddasp vs40, vs4, vs10
    xvmaddasp vs41, vs5, vs10

    xvmaddasp vs44, vs4, vs11
    xvmaddasp vs45, vs5, vs11

    xvmaddasp vs48, vs4, vs12
    xvmaddasp vs49, vs5, vs12

    xvmaddasp vs52, vs4, vs13
    xvmaddasp vs53, vs5, vs13
    lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG)
    lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG)
    xvmaddasp vs56, vs4, vs14
    xvmaddasp vs57, vs5, vs14

    xvmaddasp vs60, vs4, vs15
    xvmaddasp vs61, vs5, vs15

    xxperm vs10, vs8, permute_mask
    xxperm vs14, vs12, permute_mask

    lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG)
    lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG)

    xxpermdi vs9, vs8, vs8, 2
    xxpermdi vs13, vs12, vs12, 2

    xvmaddasp vs32, vs0, vs24
    xvmaddasp vs33, vs1, vs24

    xvmaddasp vs36, vs0, vs25
    xvmaddasp vs37, vs1, vs25

    xxpermdi vs11, vs10, vs10, 2
    xxpermdi vs15, vs14, vs14, 2

    xvmaddasp vs40, vs0, vs26
    xvmaddasp vs41, vs1, vs26

    xvmaddasp vs44, vs0, vs27
    xvmaddasp vs45, vs1, vs27

    xvmaddasp vs48, vs0, vs28
    xvmaddasp vs49, vs1, vs28

    xvmaddasp vs52, vs0, vs29
    xvmaddasp vs53, vs1, vs29
.if \Complete==0
    lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG)
    lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG)
.endif
    xvmaddasp vs56, vs0, vs30
    xvmaddasp vs57, vs1, vs30
.if \Complete==0
    xxperm vs26, vs24, permute_mask
    xxperm vs30, vs28, permute_mask
.endif
    xvmaddasp vs60, vs0, vs31
    xvmaddasp vs61, vs1, vs31

.if \Complete==0
    lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG)
    lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG)
.endif

.if \Complete==0
    xxpermdi vs25, vs24, vs24, 2
    xxpermdi vs29, vs28, vs28, 2
.endif
.if \IsLast==1
.if \Complete==1
    addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB)
    addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA)
.else
    addi \BREG, \BREG, DISP32(\Index,128)
    addi \AREG, \AREG, DISP32(\Index,128)
.endif
.endif

    xvmaddasp vs32, vs4, vs8
    xvmaddasp vs33, vs5, vs8

    xvmaddasp vs36, vs4, vs9
    xvmaddasp vs37, vs5, vs9

.if \Complete==0
    xxpermdi vs27, vs26, vs26, 2
    xxpermdi vs31, vs30, vs30, 2
.endif

    xvmaddasp vs40, vs4, vs10
    xvmaddasp vs41, vs5, vs10

    xvmaddasp vs44, vs4, vs11
    xvmaddasp vs45, vs5, vs11

    xvmaddasp vs48, vs4, vs12
    xvmaddasp vs49, vs5, vs12

    xvmaddasp vs52, vs4, vs13
    xvmaddasp vs53, vs5, vs13

    xvmaddasp vs56, vs4, vs14
    xvmaddasp vs57, vs5, vs14

    xvmaddasp vs60, vs4, vs15
    xvmaddasp vs61, vs5, vs15
.endm

.macro KERNEL8x8 First
    LOAD8x8 0
    END8x8 \First, AO, BO, 32,32
.endm

.macro KERNEL8x8_L1_L2_I  AREG,BREG,First,OffsetA,OffsetB, Index,IsLast, Complete
    lxv vs8, DISP16(\Index,0+\OffsetB)(\BREG)
    lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG)

    lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG)
    lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG)

    xxperm vs10, vs8, permute_mask
    xxperm vs14, vs12, permute_mask
    xxpermdi vs9, vs8, vs8, 2
    xxpermdi vs13, vs12, vs12, 2
.if \First==1
    xvmulsp vs32, vs0, vs24
    xvmulsp vs33, vs1, vs24

    xvmulsp vs36, vs0, vs25
    xvmulsp vs37, vs1, vs25
.else
    xvmaddasp vs32, vs0, vs24
    xvmaddasp vs33, vs1, vs24

    xvmaddasp vs36, vs0, vs25
    xvmaddasp vs37, vs1, vs25
.endif

    xxpermdi vs11, vs10, vs10, 2
    xxpermdi vs15, vs14, vs14, 2

.if \First==1
    xvmulsp vs40, vs0, vs26
    xvmulsp vs41, vs1, vs26

    xvmulsp vs44, vs0, vs27
    xvmulsp vs45, vs1, vs27

    xvmulsp vs48, vs0, vs28
    xvmulsp vs49, vs1, vs28

    xvmulsp vs52, vs0, vs29
    xvmulsp vs53, vs1, vs29

    xvmulsp vs56, vs0, vs30
    xvmulsp vs57, vs1, vs30

    xvmulsp vs60, vs0, vs31
    xvmulsp vs61, vs1, vs31
.else
    xvmaddasp vs40, vs0, vs26
    xvmaddasp vs41, vs1, vs26

    xvmaddasp vs44, vs0, vs27
    xvmaddasp vs45, vs1, vs27

    xvmaddasp vs48, vs0, vs28
    xvmaddasp vs49, vs1, vs28

    xvmaddasp vs52, vs0, vs29
    xvmaddasp vs53, vs1, vs29

    xvmaddasp vs56, vs0, vs30
    xvmaddasp vs57, vs1, vs30

    xvmaddasp vs60, vs0, vs31
    xvmaddasp vs61, vs1, vs31
.endif
.if \Complete==0
    lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG)
    lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG)

    lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG)
    lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG)

    xxperm vs26, vs24, permute_mask
    xxperm vs30, vs28, permute_mask
    xxpermdi vs25, vs24, vs24, 2
    xxpermdi vs29, vs28, vs28, 2
.endif
.if \IsLast==1
.if \Complete==1
    addi \BREG, \BREG, DISP16(\Index,32+\OffsetB)
    addi \AREG, \AREG, DISP16(\Index,32+\OffsetA)
.else
    addi \BREG, \BREG, DISP16(\Index,64)
    addi \AREG, \AREG, DISP16(\Index,64)
.endif
.endif

.if \First==1
    xvmulsp vs32, vs4, vs8
    xvmulsp vs33, vs5, vs8

    xvmulsp vs36, vs4, vs9
    xvmulsp vs37, vs5, vs9
.else
    xvmaddasp vs32, vs4, vs8
    xvmaddasp vs33, vs5, vs8

    xvmaddasp vs36, vs4, vs9
    xvmaddasp vs37, vs5, vs9
.endif

.if \Complete==0
    xxpermdi vs27, vs26, vs26, 2
    xxpermdi vs31, vs30, vs30, 2
.endif
.if \First==1
    xvmulsp vs40, vs4, vs10
    xvmulsp vs41, vs5, vs10

    xvmulsp vs44, vs4, vs11
    xvmulsp vs45, vs5, vs11

    xvmulsp vs48, vs4, vs12
    xvmulsp vs49, vs5, vs12

    xvmulsp vs52, vs4, vs13
    xvmulsp vs53, vs5, vs13

    xvmulsp vs56, vs4, vs14
    xvmulsp vs57, vs5, vs14

    xvmulsp vs60, vs4, vs15
    xvmulsp vs61, vs5, vs15
.else
    xvmaddasp vs40, vs4, vs10
    xvmaddasp vs41, vs5, vs10

    xvmaddasp vs44, vs4, vs11
    xvmaddasp vs45, vs5, vs11

    xvmaddasp vs48, vs4, vs12
    xvmaddasp vs49, vs5, vs12

    xvmaddasp vs52, vs4, vs13
    xvmaddasp vs53, vs5, vs13

    xvmaddasp vs56, vs4, vs14
    xvmaddasp vs57, vs5, vs14

    xvmaddasp vs60, vs4, vs15
    xvmaddasp vs61, vs5, vs15
.endif
.endm

.macro SAVE8x8
    slwi T10, LDC, 1
    add T1, CO, LDC

    add T2, CO, T10
    add T3, T1, T10

    add T4, T2, T10
    add T5, T3, T10

    add T6, T4, T10
    add T7, T5, T10

#ifndef TRMMKERNEL
    lxv vs34, 0(CO)
    lxv vs35, 16(CO)
    lxv vs38, 0(T1)
    lxv vs39, 16(T1)
    lxv vs42, 0(T2)
    lxv vs43, 16(T2)
    lxv vs46, 0(T3)
    lxv vs47, 16(T3)

    lxv vs50, 0(T4)
    lxv vs51, 16(T4)
    lxv vs54, 0(T5)
    lxv vs55, 16(T5)
    lxv vs58, 0(T6)
    lxv vs59, 16(T6)
    lxv vs62, 0(T7)
    lxv vs63, 16(T7)
#endif

    xxmrglw vs8, vs32, vs44
    xxmrglw vs10, vs36, vs40

    xxmrghw vs1, vs32, vs44
    xxmrghw vs0, vs36, vs40

    xxmrglw vs12, vs33, vs45
    xxmrglw vs14, vs37, vs41

    xxmrghw vs2, vs37, vs41
    xxmrghw vs3, vs33, vs45

    xxlor vs9, vs8, vs8
    xxlor vs11, vs10, vs10

    xxlor vs13, vs12, vs12
    xxlor vs15, vs14, vs14

    xxperm vs8, vs0, save_permute_1
    xxperm vs10, vs1, save_permute_1
    xxperm vs9, vs0, save_permute_2
    xxperm vs11, vs1, save_permute_2

    xxperm vs12, vs2, save_permute_1
    xxperm vs14, vs3, save_permute_1

    xxperm vs13, vs2, save_permute_2
    xxperm vs15, vs3, save_permute_2

    /* multiply add the normal way */
#ifdef TRMMKERNEL
    xvmulsp vs34, vs8, alpha_r
    xvmulsp vs35, vs12, alpha_r
    xvmulsp vs38, vs9, alpha_r
    xvmulsp vs39, vs13, alpha_r
    xvmulsp vs42, vs10, alpha_r
    xvmulsp vs43, vs14, alpha_r
    xvmulsp vs46, vs11, alpha_r
    xvmulsp vs47, vs15, alpha_r
#else
    xvmaddasp vs34, vs8, alpha_r
    xvmaddasp vs35, vs12, alpha_r
    xvmaddasp vs38, vs9, alpha_r
    xvmaddasp vs39, vs13, alpha_r
    xvmaddasp vs42, vs10, alpha_r
    xvmaddasp vs43, vs14, alpha_r
    xvmaddasp vs46, vs11, alpha_r
    xvmaddasp vs47, vs15, alpha_r
#endif

    xxmrglw vs8, vs48, vs60
    xxmrglw vs10, vs52, vs56

    xxmrghw vs1, vs48, vs60
    xxmrghw vs0, vs52, vs56
    stxv vs34, 0(CO)
    stxv vs35, 16(CO)
    xxmrglw vs12, vs49, vs61
    xxmrglw vs14, vs53, vs57
    stxv vs38, 0(T1)
    stxv vs39, 16(T1)
    xxmrghw vs2, vs53, vs57
    xxmrghw vs3, vs49, vs61
    stxv vs42, 0(T2)
    stxv vs43, 16(T2)
    xxlor vs9, vs8, vs8
    xxlor vs11, vs10, vs10
    stxv vs46, 0(T3)
    stxv vs47, 16(T3)
    xxlor vs13, vs12, vs12
    xxlor vs15, vs14, vs14

    xxperm vs8, vs0, save_permute_1
    xxperm vs10, vs1, save_permute_1

    xxperm vs9, vs0, save_permute_2
    xxperm vs11, vs1, save_permute_2

    xxperm vs12, vs2, save_permute_1
    xxperm vs14, vs3, save_permute_1
    xxperm vs13, vs2, save_permute_2
    xxperm vs15, vs3, save_permute_2

#ifdef TRMMKERNEL
    xvmulsp vs50, vs8, alpha_r
    xvmulsp vs51, vs12, alpha_r
    xvmulsp vs54, vs9, alpha_r
    xvmulsp vs55, vs13, alpha_r
    xvmulsp vs58, vs10, alpha_r
    xvmulsp vs59, vs14, alpha_r
    xvmulsp vs62, vs11, alpha_r
    xvmulsp vs63, vs15, alpha_r
#else
    xvmaddasp vs50, vs8, alpha_r
    xvmaddasp vs51, vs12, alpha_r
    xvmaddasp vs54, vs9, alpha_r
    xvmaddasp vs55, vs13, alpha_r
    xvmaddasp vs58, vs10, alpha_r
    xvmaddasp vs59, vs14, alpha_r
    xvmaddasp vs62, vs11, alpha_r
    xvmaddasp vs63, vs15, alpha_r
#endif

    stxv vs50, 0(T4)
    stxv vs51, 16(T4)
    stxv vs54, 0(T5)
    stxv vs55, 16(T5)
    stxv vs58, 0(T6)
    stxv vs59, 16(T6)
    stxv vs62, 0(T7)
    stxv vs63, 16(T7)

    addi CO, CO, 32
.endm
/**********************************************************************************************
* Macros for N=8 and M=4
**********************************************************************************************/
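/*
 * Annotation: for M=4 the roles flip - the single A vector (vs0)
 * receives the butterfly replicas (vs1-vs3) while B supplies two plain
 * vectors per iteration (vs24,vs25), accumulating into vs32-vs35 and
 * vs48-vs51.
 */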
| .macro LOAD8x4_1
 | |
|    LOAD8x4 1
 | |
| .endm
 | |
| 
 | |
| .macro LOAD8x4_0
 | |
|    LOAD8x4 0
 | |
| .endm
 | |
| 
 | |
| .macro KERNEL8x4_L1_L4  Index,IsLast
 | |
|   KERNEL8x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
 | |
| .endm
 | |
| 
 | |
| .macro KERNEL8x4_I1_L4  OffsetA,OffsetB, Index,IsLast
 | |
|   KERNEL8x4_L1_L4_I  AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
 | |
| .endm
 | |
| 
 | |
| .macro KERNEL8x4_I1_L4_2  OffsetA,OffsetB, Index,IsLast
 | |
|   KERNEL8x4_L1_L4_I  AO,BO,  \OffsetA,\OffsetB,\Index,\IsLast,0
 | |
| .endm
 | |
| 
 | |
| .macro KERNEL8x4_I1_L4_3  OffsetA,OffsetB, Index,IsLast
 | |
|   KERNEL8x4_L1_L4_I  AO,BO,  \OffsetA,\OffsetB,\Index,\IsLast,1
 | |
| .endm
 | |
| .macro KERNEL8x4_I1_L2_3  OffsetA,OffsetB, Index,IsLast
 | |
|    KERNEL8x4_L1_L2_I AO,BO,0,  \OffsetA,\OffsetB,\Index,\IsLast,1
 | |
| .endm
 | |
| 
 | |
| .macro KERNEL8x4_I2_L4_2  AREG,BREG,OffsetA,OffsetB, Index,IsLast
 | |
|   KERNEL8x4_L1_L4_I  \AREG,\BREG,  \OffsetA,\OffsetB,\Index,\IsLast,0
 | |
| .endm
 | |
| 
 | |
| .macro KERNEL8x4_I2_L4_3  AREG,BREG,OffsetA,OffsetB, Index,IsLast
 | |
|   KERNEL8x4_L1_L4_I \AREG,\BREG,  \OffsetA,\OffsetB,\Index,\IsLast,1
 | |
| .endm
 | |
| 
 | |
| .macro Zero8X4
 | |
|     xxlxor      vs32,   vs32,   vs32
 | |
|     xxlxor      vs33,   vs33,   vs33 
 | |
|     xxlxor      vs34,   vs34,   vs34
 | |
|     xxlxor      vs35,   vs35,   vs35
 | |
|     
 | |
|     xxlxor      vs48,   vs48,   vs48
 | |
|     xxlxor      vs49,   vs49,   vs49
 | |
|     xxlxor      vs50,   vs50,   vs50
 | |
|     xxlxor      vs51,   vs51,   vs51  
 | |
|     
 | |
| .endm
 | |
| 
 | |
| .macro LOAD8x4  Zero
 | |
| 
 | |
|     lxv vs0,     0(AO)
 | |
|     lxv vs24,   0(BO)
 | |
|     lxv vs25,   16(BO)
 | |
| 
 | |
| 
 | |
| 
 | |
|     xxperm      vs2,   vs0,       permute_mask  
 | |
|     xxpermdi    vs1,   vs0,   vs0,2      
 | |
|     xxpermdi    vs3,   vs2,   vs2,2        
 | |
| 
 | |
| .if \Zero==1 
 | |
|     xxlxor      vs32,   vs32,   vs32
 | |
|     xxlxor      vs33,   vs33,   vs33 
 | |
|     xxlxor      vs34,   vs34,   vs34
 | |
|     xxlxor      vs35,   vs35,   vs35
 | |
| 
 | |
|     xxlxor      vs48,   vs48,   vs48
 | |
|     xxlxor      vs49,   vs49,   vs49
 | |
|     xxlxor      vs50,   vs50,   vs50
 | |
|     xxlxor      vs51,   vs51,   vs51  
 | |
| .endif
 | |
| .endm
 | |
| 
 | |
| .macro END8x4_NORMAL
 | |
|   END8x4 0, AO, BO, 16,32 
 | |
| .endm
 | |
| 
 | |
| .macro END8x4 First, AREG, BREG, OffsetA, OffsetB
 | |
| 
 | |
| .if \OffsetB != 0 
 | |
|     addi        \BREG, \BREG, \OffsetB 
 | |
| .endif
 | |
| .if \OffsetA != 0 
 | |
|     addi        \AREG, \AREG, \OffsetA 
 | |
| .endif  
 | |
| 
 | |
| .if \First==1
 | |
|     xvmulsp      vs32,   vs24,   vs0
 | |
|     xvmulsp      vs33,   vs24,   vs1 
 | |
|     xvmulsp      vs34,   vs24,   vs2
 | |
|     xvmulsp      vs35,   vs24,   vs3
 | |
| 
 | |
|     xvmulsp      vs48,   vs25,   vs0
 | |
|     xvmulsp      vs49,   vs25,   vs1
 | |
|     xvmulsp      vs50,   vs25,   vs2
 | |
|     xvmulsp      vs51,   vs25,   vs3  
 | |
| .else
 | |
|     xvmaddasp      vs32,   vs24,   vs0
 | |
|     xvmaddasp      vs33,   vs24,   vs1 
 | |
|     xvmaddasp      vs34,   vs24,   vs2
 | |
|     xvmaddasp      vs35,   vs24,   vs3
 | |
| 
 | |
|     xvmaddasp      vs48,   vs25,   vs0
 | |
|     xvmaddasp      vs49,   vs25,   vs1
 | |
|     xvmaddasp      vs50,   vs25,   vs2
 | |
|     xvmaddasp      vs51,   vs25,   vs3 
 | |
| 
 | |
| .endif
 | |
| .endm  
 | |
| 
 | |
| .macro KERNEL8x4_L1_L4_I  AREG,BREG,   OffsetA,OffsetB, Index,IsLast ,Complete
 | |
| 
 | |
|     lxv vs4,    DISP16(\Index, 0+\OffsetA)(\AREG)
 | |
|     lxv vs26,   DISP32(\Index, 0+\OffsetB)(\BREG)
 | |
|     lxv vs27,   DISP32(\Index,16+\OffsetB)(\BREG)
 | |
| 
 | |
|     xxperm      vs6,   vs4,       permute_mask  
 | |
|     xxpermdi    vs5,   vs4,   vs4,2      
 | |
|     xxpermdi    vs7,   vs6,   vs6,2 
 | |
|  
 | |
|     xvmaddasp      vs32,   vs24,   vs0
 | |
|     xvmaddasp      vs33,   vs24,   vs1 
 | |
|     xvmaddasp      vs34,   vs24,   vs2
 | |
|     xvmaddasp      vs35,   vs24,   vs3
 | |
| 
 | |
|     xvmaddasp      vs48,   vs25,   vs0
 | |
|     xvmaddasp      vs49,   vs25,   vs1
 | |
|     xvmaddasp      vs50,   vs25,   vs2
 | |
|     xvmaddasp      vs51,   vs25,   vs3 
 | |
| 
 | |
|     lxv vs0,    DISP16(\Index, 16+\OffsetA)(\AREG)
 | |
|     lxv vs24,   DISP32(\Index, 32+\OffsetB)(\BREG)
 | |
|     lxv vs25,   DISP32(\Index, 48+\OffsetB)(\BREG) 
 | |
| 
 | |
|     xxperm      vs2,   vs0,       permute_mask  
 | |
|     xxpermdi    vs1,   vs0,   vs0,2      
 | |
|     xxpermdi    vs3,   vs2,   vs2,2   
 | |
| 
 | |
|     xvmaddasp      vs32,   vs26,   vs4
 | |
|     xvmaddasp      vs33,   vs26,   vs5 
 | |
|     xvmaddasp      vs34,   vs26,   vs6
 | |
|     xvmaddasp      vs35,   vs26,   vs7
 | |
| 
 | |
|     xvmaddasp      vs48,   vs27,   vs4
 | |
|     xvmaddasp      vs49,   vs27,   vs5
 | |
|     xvmaddasp      vs50,   vs27,   vs6
 | |
|     xvmaddasp      vs51,   vs27,   vs7
 | |
|  
 | |
| 
 | |
|     lxv vs4,    DISP16(\Index, 32+\OffsetA)(\AREG)
 | |
|     lxv vs26,   DISP32(\Index, 64+\OffsetB)(\BREG)
 | |
|     lxv vs27,   DISP32(\Index, 80+\OffsetB)(\BREG)
 | |
| 
 | |
|     xxperm      vs6,   vs4,       permute_mask  
 | |
|     xxpermdi    vs5,   vs4,   vs4,2      
 | |
|     xxpermdi    vs7,   vs6,   vs6,2 
 | |
|  
 | |
|     xvmaddasp      vs32,   vs24,   vs0
 | |
|     xvmaddasp      vs33,   vs24,   vs1 
 | |
|     xvmaddasp      vs34,   vs24,   vs2
 | |
|     xvmaddasp      vs35,   vs24,   vs3
 | |
| 
 | |
|     xvmaddasp      vs48,   vs25,   vs0
 | |
|     xvmaddasp      vs49,   vs25,   vs1
 | |
|     xvmaddasp      vs50,   vs25,   vs2
 | |
|     xvmaddasp      vs51,   vs25,   vs3 
 | |
| 
 | |
| .if \Complete==0 
 | |
| 
 | |
|     lxv vs0,    DISP16(\Index, 48+\OffsetA)(\AREG)
 | |
|     lxv vs24,   DISP32(\Index, 96+\OffsetB)(\BREG)
 | |
|     lxv vs25,   DISP32(\Index, 96+16+\OffsetB)(\BREG) 
 | |
| 
 | |
|     xxperm      vs2,   vs0,       permute_mask  
 | |
|     xxpermdi    vs1,   vs0,   vs0,2      
 | |
|     xxpermdi    vs3,   vs2,   vs2,2   
 | |
| .endif
 | |
|     xvmaddasp      vs32,   vs26,   vs4
 | |
|     xvmaddasp      vs33,   vs26,   vs5 
 | |
|     xvmaddasp      vs34,   vs26,   vs6
 | |
|     xvmaddasp      vs35,   vs26,   vs7
 | |
| 
 | |
|     xvmaddasp      vs48,   vs27,   vs4
 | |
|     xvmaddasp      vs49,   vs27,   vs5
 | |
|     xvmaddasp      vs50,   vs27,   vs6
 | |
|     xvmaddasp      vs51,   vs27,   vs7
 | |
| 
 | |
|  
 | |
|  
 | |
| .if \IsLast==1  
 | |
| .if \Complete==1
 | |
|     addi        \AREG, \AREG, DISP16(\Index,16*3+\OffsetA)  
 | |
|     addi        \BREG, \BREG,  DISP32(\Index,32*3+\OffsetB)
 | |
| 
 | |
| .else
 | |
|     addi        \AREG, \AREG, DISP16(\Index,64)  
 | |
|     addi        \BREG, \BREG,  DISP32(\Index,128)
 | |
| 
 | |
| .endif
 | |
| .endif   
 | |
|  
 | |
|  
 | |
| .endm

.macro KERNEL8x4 First
    LOAD8x4 0
    END8x4 \First, AO, BO, 16,32
.endm

.macro KERNEL8x4_L1_L2_I  AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete

    lxv vs4,    DISP8(\Index, 0+\OffsetA)(\AREG)
    lxv vs26,   DISP16(\Index, 0+\OffsetB)(\BREG)
    lxv vs27,   DISP16(\Index,16+\OffsetB)(\BREG)

    xxperm      vs6,   vs4,       permute_mask
    xxpermdi    vs5,   vs4,   vs4,2
    xxpermdi    vs7,   vs6,   vs6,2
.if \First==1
    xvmulsp      vs32,   vs24,   vs0
    xvmulsp      vs33,   vs24,   vs1
    xvmulsp      vs34,   vs24,   vs2
    xvmulsp      vs35,   vs24,   vs3

    xvmulsp      vs48,   vs25,   vs0
    xvmulsp      vs49,   vs25,   vs1
    xvmulsp      vs50,   vs25,   vs2
    xvmulsp      vs51,   vs25,   vs3
.else
    xvmaddasp      vs32,   vs24,   vs0
    xvmaddasp      vs33,   vs24,   vs1
    xvmaddasp      vs34,   vs24,   vs2
    xvmaddasp      vs35,   vs24,   vs3

    xvmaddasp      vs48,   vs25,   vs0
    xvmaddasp      vs49,   vs25,   vs1
    xvmaddasp      vs50,   vs25,   vs2
    xvmaddasp      vs51,   vs25,   vs3
.endif

.if \Complete==0
    lxv vs0,    DISP8(\Index, 16+\OffsetA)(\AREG)
    lxv vs24,   DISP16(\Index, 32+\OffsetB)(\BREG)
    lxv vs25,   DISP16(\Index, 48+\OffsetB)(\BREG)

    xxperm      vs2,   vs0,       permute_mask
    xxpermdi    vs1,   vs0,   vs0,2
    xxpermdi    vs3,   vs2,   vs2,2
.endif

/* the second unrolled k-iteration must accumulate even when \First==1;
   a second xvmulsp here would discard the products computed above */
    xvmaddasp      vs32,   vs26,   vs4
    xvmaddasp      vs33,   vs26,   vs5
    xvmaddasp      vs34,   vs26,   vs6
    xvmaddasp      vs35,   vs26,   vs7

    xvmaddasp      vs48,   vs27,   vs4
    xvmaddasp      vs49,   vs27,   vs5
    xvmaddasp      vs50,   vs27,   vs6
    xvmaddasp      vs51,   vs27,   vs7

.if \IsLast==1
.if \Complete==1
    addi        \AREG, \AREG, DISP8(\Index,16+\OffsetA)
    addi        \BREG, \BREG, DISP16(\Index,32+\OffsetB)
.else
    addi        \AREG, \AREG, DISP8(\Index,32)
    addi        \BREG, \BREG, DISP16(\Index,64)
.endif
.endif
.endm
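
/* SAVE8x4 below first restores natural row order with merge-word and
   merge-double shuffles, then applies alpha. Illustrative C model (trmm,
   acc, ldc are stand-ins, not symbols from this file):

       for (int n = 0; n < 8; n++)        // eight B columns: CO, T1..T7
           for (int m = 0; m < 4; m++)    // four A rows in one vector
               C[n*ldc + m] = trmm ? alpha * acc[n][m]
                                   : C[n*ldc + m] + alpha * acc[n][m];
*/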

.macro SAVE8x4
  slwi    T10, LDC ,   1
  add     T1, CO, LDC
#if !defined(TRMMKERNEL)
  lxv        vs36, 0(CO)
  lxv        vs37, 0(T1)
#endif
  add     T2, CO, T10
  add     T3, T1, T10
#if !defined(TRMMKERNEL)
  lxv        vs38, 0(T2)
  lxv        vs39, 0(T3)
#endif
  add     T4, T2, T10
  add     T5, T3, T10
#if !defined(TRMMKERNEL)
  lxv        vs40, 0(T4)
  lxv        vs41, 0(T5)
#endif
  add     T6, T4, T10
  add     T7, T5, T10
#if !defined(TRMMKERNEL)
  lxv        vs42, 0(T6)
  lxv        vs43, 0(T7)
#endif
  xxmrglw  vs0, vs35,vs32
  xxmrglw  vs1, vs34,vs33
  xxmrglw  vs4, vs32,vs35
  xxmrglw  vs5, vs33,vs34

  xxmrghw  vs2, vs35,vs32
  xxmrghw  vs3, vs34,vs33
  xxmrghw  vs6, vs32,vs35
  xxmrghw  vs7, vs33,vs34

  xxmrgld  vs24, vs1, vs0
  xxmrghd  vs25, vs5, vs4

  xxmrgld  vs26, vs2, vs3
  xxmrghd  vs27, vs6, vs7

  xxmrglw  vs0, vs51,vs48
  xxmrglw  vs1, vs50,vs49
  xxmrglw  vs4, vs48,vs51
  xxmrglw  vs5, vs49,vs50

  xxmrghw  vs2, vs51,vs48
  xxmrghw  vs3, vs50,vs49
  xxmrghw  vs6, vs48,vs51
  xxmrghw  vs7, vs49,vs50

  xxmrgld  vs28, vs1, vs0
  xxmrghd  vs29, vs5, vs4

  xxmrgld  vs30, vs2, vs3
  xxmrghd  vs31, vs6, vs7
#if defined(TRMMKERNEL)
  xvmulsp        vs36, vs24, alpha_r
  xvmulsp        vs37, vs25, alpha_r
  xvmulsp        vs38, vs26, alpha_r
  xvmulsp        vs39, vs27, alpha_r
  xvmulsp        vs40, vs28, alpha_r
  xvmulsp        vs41, vs29, alpha_r
  xvmulsp        vs42, vs30, alpha_r
  xvmulsp        vs43, vs31, alpha_r
#else
  xvmaddasp        vs36, vs24, alpha_r
  xvmaddasp        vs37, vs25, alpha_r
  xvmaddasp        vs38, vs26, alpha_r
  xvmaddasp        vs39, vs27, alpha_r
  xvmaddasp        vs40, vs28, alpha_r
  xvmaddasp        vs41, vs29, alpha_r
  xvmaddasp        vs42, vs30, alpha_r
  xvmaddasp        vs43, vs31, alpha_r
#endif

  stxv        vs36, 0(CO)
  stxv        vs37, 0(T1)
  stxv        vs38, 0(T2)
  stxv        vs39, 0(T3)
  stxv        vs40, 0(T4)
  stxv        vs41, 0(T5)
  stxv        vs42, 0(T6)
  stxv        vs43, 0(T7)

  addi CO,CO,16
.endm

/**********************************************************************************************
* Macros for N=8 and M=2
**********************************************************************************************/

.macro KERNEL8x2_2   OffsetA,OffsetB, Index,IsLast
  KERNEL8x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast
.endm

.macro Zero8x2
    xxlxor      vs0,   vs0,   vs0
    xxlxor      vs1,   vs1,   vs1
    xxlxor      vs2,   vs2,   vs2
    xxlxor      vs3,   vs3,   vs3
.endm

.macro KERNEL8x2
  KERNEL8x2_1 AO,BO, 0, 0,0,0
.endm

.macro KERNEL8x2_1 AREG,BREG,First,OffsetA,OffsetB,Index

    lxsd v4,    DISP2(\Index, 0+\OffsetA)(\AREG)
    lxv vs26,   DISP8(\Index, 0+\OffsetB)(\BREG)
    lxv vs27,   DISP8(\Index,16+\OffsetB)(\BREG)
    xxspltw   vs8,  vs36, 0
    xxspltw   vs9,  vs36, 1

.if \First==1
    xvmulsp      vs0,   vs26,   vs8
    xvmulsp      vs1,   vs27,   vs8
    xvmulsp      vs2,   vs26,   vs9
    xvmulsp      vs3,   vs27,   vs9
.else
    xvmaddasp      vs0,   vs26,   vs8
    xvmaddasp      vs1,   vs27,   vs8
    xvmaddasp      vs2,   vs26,   vs9
    xvmaddasp      vs3,   vs27,   vs9
.endif

    addi        \AREG, \AREG, DISP2(\Index,8)
    addi        \BREG, \BREG, DISP8(\Index,32)
.endm
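
/* For M=2 the two packed A values are loaded as one doubleword (lxsd into
   v4, i.e. vs36) and xxspltw broadcasts each float across a full vector.
   Illustrative C model of one k-iteration (acc0/acc1 are stand-ins):

       for (int n = 0; n < 8; n++) {        // one B row of 8 floats
           acc0[n] += B[k][n] * A[k][0];    // vs0/vs1 += vs26/vs27 * vs8
           acc1[n] += B[k][n] * A[k][1];    // vs2/vs3 += vs26/vs27 * vs9
       }
*/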

.macro KERNEL8x2_I_2  AREG,BREG,First,OffsetA,OffsetB, Index,IsLast

    lxv vs4,    DISP4(\Index, 0+\OffsetA)(\AREG)
    lxv vs26,   DISP16(\Index, 0+\OffsetB)(\BREG)
    lxv vs27,   DISP16(\Index,16+\OffsetB)(\BREG)
    lxv vs28,   DISP16(\Index,32+\OffsetB)(\BREG)
    lxv vs29,   DISP16(\Index,48+\OffsetB)(\BREG)
    xxspltw   vs8,  vs4, 2
    xxspltw   vs9,  vs4, 3
    xxspltw   vs10, vs4, 0
    xxspltw   vs11, vs4, 1

.if \First==1
    xvmulsp      vs0,   vs26,   vs8
    xvmulsp      vs1,   vs27,   vs8
    xvmulsp      vs2,   vs26,   vs9
    xvmulsp      vs3,   vs27,   vs9

    /* the second unrolled k-iteration must accumulate, otherwise the
       products above would be discarded */
    xvmaddasp    vs0,   vs28,   vs10
    xvmaddasp    vs1,   vs29,   vs10
    xvmaddasp    vs2,   vs28,   vs11
    xvmaddasp    vs3,   vs29,   vs11
.else
    xvmaddasp      vs0,   vs26,   vs8
    xvmaddasp      vs1,   vs27,   vs8
    xvmaddasp      vs2,   vs26,   vs9
    xvmaddasp      vs3,   vs27,   vs9

    xvmaddasp      vs0,   vs28,   vs10
    xvmaddasp      vs1,   vs29,   vs10
    xvmaddasp      vs2,   vs28,   vs11
    xvmaddasp      vs3,   vs29,   vs11
.endif

.if \IsLast==1
    addi        \AREG, \AREG, DISP4(\Index,16)
    addi        \BREG, \BREG, DISP16(\Index,64)
.endif
.endm

.macro SAVE8x2
  slwi    T10, LDC ,   1
  add     T1, CO, LDC
  add     T2, CO, T10
  add     T3, T1, T10
  add     T4, T2, T10
  add     T5, T3, T10
  add     T6, T4, T10
  add     T7, T5, T10
  /* convert alpha_r for multiply */
  xscvspdp  vs4,alpha_r
/* v0 corresponds to vs32, do not forget */
#if !defined(TRMMKERNEL)
  lxssp  v0,0(CO)
  lxssp  v1,4(CO)

  lxssp  v2,0(T1)
  lxssp  v3,4(T1)

  lxssp  v4,0(T2)
  lxssp  v5,4(T2)

  lxssp  v6,0(T3)
  lxssp  v7,4(T3)

  lxssp  v8,0(T4)
  lxssp  v9,4(T4)

  lxssp  v10,0(T5)
  lxssp  v11,4(T5)

  lxssp  v12,0(T6)
  lxssp  v13,4(T6)

  lxssp  v14,0(T7)
  lxssp  v15,4(T7)
#endif
  xscvspdp  vs5, vs2
  xxspltw   vs6, vs2, 1
  xxspltw   vs7, vs2, 2
  xxspltw   vs8, vs2, 3
  xscvspdp  vs6,vs6
  xscvspdp  vs7,vs7
  xscvspdp  vs8,vs8

  xscvspdp  vs24, vs0
  xxspltw   vs25, vs0, 1
  xxspltw   vs26, vs0, 2
  xxspltw   vs27, vs0, 3
  xscvspdp  vs25,vs25
  xscvspdp  vs26,vs26
  xscvspdp  vs27,vs27

  xscvspdp  vs9, vs3
  xxspltw   vs10, vs3, 1
  xxspltw   vs11, vs3, 2
  xxspltw   vs12, vs3, 3
  xscvspdp  vs10,vs10
  xscvspdp  vs11,vs11
  xscvspdp  vs12,vs12

  xscvspdp  vs28, vs1
  xxspltw   vs29, vs1, 1
  xxspltw   vs30, vs1, 2
  xxspltw   vs31, vs1, 3
  xscvspdp  vs29,vs29
  xscvspdp  vs30,vs30
  xscvspdp  vs31,vs31

#if defined(TRMMKERNEL)
  xsmuldp  vs32,vs8, vs4
  xsmuldp  vs33,vs27, vs4

  xsmuldp  vs34,vs7, vs4
  xsmuldp  vs35,vs26, vs4

  xsmuldp  vs36,vs6, vs4
  xsmuldp  vs37,vs25, vs4

  xsmuldp  vs38,vs5, vs4
  xsmuldp  vs39,vs24, vs4

  xsmuldp  vs40,vs12, vs4
  xsmuldp  vs41,vs31, vs4

  xsmuldp  vs42,vs11, vs4
  xsmuldp  vs43,vs30, vs4

  xsmuldp  vs44,vs10, vs4
  xsmuldp  vs45,vs29, vs4

  xsmuldp  vs46,vs9, vs4
  xsmuldp  vs47,vs28, vs4
#else
  xsmaddadp  vs32,vs8, vs4
  xsmaddadp  vs33,vs27, vs4

  xsmaddadp  vs34,vs7, vs4
  xsmaddadp  vs35,vs26, vs4

  xsmaddadp  vs36,vs6, vs4
  xsmaddadp  vs37,vs25, vs4

  xsmaddadp  vs38,vs5, vs4
  xsmaddadp  vs39,vs24, vs4

  xsmaddadp  vs40,vs12, vs4
  xsmaddadp  vs41,vs31, vs4

  xsmaddadp  vs42,vs11, vs4
  xsmaddadp  vs43,vs30, vs4

  xsmaddadp  vs44,vs10, vs4
  xsmaddadp  vs45,vs29, vs4

  xsmaddadp  vs46,vs9, vs4
  xsmaddadp  vs47,vs28, vs4
#endif

  stxssp  v0,0(CO)
  stxssp  v1,4(CO)

  stxssp  v2,0(T1)
  stxssp  v3,4(T1)

  stxssp  v4,0(T2)
  stxssp  v5,4(T2)

  stxssp  v6,0(T3)
  stxssp  v7,4(T3)

  stxssp  v8,0(T4)
  stxssp  v9,4(T4)

  stxssp  v10,0(T5)
  stxssp  v11,4(T5)

  stxssp  v12,0(T6)
  stxssp  v13,4(T6)

  stxssp  v14,0(T7)
  stxssp  v15,4(T7)

  addi CO,CO,8
.endm

/**********************************************************************************************
* Macros for N=8 and M=1
**********************************************************************************************/
.macro KERNEL8x1_4   OffsetA,OffsetB, Index,IsLast
  KERNEL8x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast
.endm

.macro Zero8x1
    xxlxor      vs0,   vs0,   vs0
    xxlxor      vs1,   vs1,   vs1
.endm

.macro KERNEL8x1
  KERNEL8x1_1 AO,BO, 0
.endm

.macro KERNEL8x1_2
  KERNEL8x1_2_1 AO,BO, 0
.endm

.macro KERNEL8x1_1 AREG,BREG,First
    lxvwsx vs8,  0, \AREG
    lxv vs26,   0(\BREG)
    lxv vs27,   16(\BREG)
.if \First==1
    xvmulsp      vs0,   vs26,   vs8
    xvmulsp      vs1,   vs27,   vs8
.else
    xvmaddasp      vs0,   vs26,   vs8
    xvmaddasp      vs1,   vs27,   vs8
.endif
    addi        \AREG, \AREG,  4
    addi        \BREG, \BREG,  32
.endm

.macro KERNEL8x1_2_1 AREG,BREG,First
    lxsd v4,    0(\AREG)
    lxv vs26,   0(\BREG)
    lxv vs27,  16(\BREG)
    lxv vs28,  32(\BREG)
    lxv vs29,  48(\BREG)
    xxspltw   vs8,  vs36, 1
    xxspltw   vs9,  vs36, 0
.if \First==1
    xvmulsp      vs0,   vs26,   vs8
    xvmulsp      vs1,   vs27,   vs8
    /* the second k step must accumulate or the first products are lost */
    xvmaddasp    vs0,   vs28,   vs9
    xvmaddasp    vs1,   vs29,   vs9
.else
    xvmaddasp      vs0,   vs26,   vs8
    xvmaddasp      vs1,   vs27,   vs8
    xvmaddasp      vs0,   vs28,   vs9
    xvmaddasp      vs1,   vs29,   vs9
.endif
    addi        \AREG, \AREG,  8
    addi        \BREG, \BREG,  64
.endm

.macro KERNEL8x1_I_4  AREG,BREG,First,OffsetA,OffsetB, Index,IsLast
    lxv vs4,    DISP4(\Index, 0+\OffsetA)(\AREG)
    xxspltw   vs8,  vs4, 3
    xxspltw   vs9,  vs4, 2
    xxspltw   vs10, vs4, 1
    xxspltw   vs11, vs4, 0
    lxv vs26,   DISP32(\Index, 0+\OffsetB)(\BREG)
    lxv vs27,   DISP32(\Index,16+\OffsetB)(\BREG)
    lxv vs28,   DISP32(\Index,32+\OffsetB)(\BREG)
    lxv vs29,   DISP32(\Index,48+\OffsetB)(\BREG)
    lxv vs30,   DISP32(\Index,64+ 0+\OffsetB)(\BREG)
    lxv vs31,   DISP32(\Index,64+16+\OffsetB)(\BREG)
    lxv vs32,   DISP32(\Index,64+32+\OffsetB)(\BREG)
    lxv vs33,   DISP32(\Index,64+48+\OffsetB)(\BREG)
.if \First==1
    xvmulsp      vs0,   vs26,   vs8
    xvmulsp      vs1,   vs27,   vs8
    /* the remaining k steps must accumulate or the first products are lost */
    xvmaddasp    vs0,   vs28,   vs9
    xvmaddasp    vs1,   vs29,   vs9
    xvmaddasp    vs0,   vs30,   vs10
    xvmaddasp    vs1,   vs31,   vs10
    xvmaddasp    vs0,   vs32,   vs11
    xvmaddasp    vs1,   vs33,   vs11
.else
    xvmaddasp      vs0,   vs26,   vs8
    xvmaddasp      vs1,   vs27,   vs8
    xvmaddasp      vs0,   vs28,   vs9
    xvmaddasp      vs1,   vs29,   vs9
    xvmaddasp      vs0,   vs30,   vs10
    xvmaddasp      vs1,   vs31,   vs10
    xvmaddasp      vs0,   vs32,   vs11
    xvmaddasp      vs1,   vs33,   vs11
.endif
.if \IsLast==1
    addi        \AREG, \AREG, DISP4(\Index,16)
    addi        \BREG, \BREG, DISP32(\Index,128)
.endif
.endm
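
/* KERNEL8x1_I_4 above freely uses vs32/vs33 as scratch for B: the M=1
   kernels keep their accumulators in vs0/vs1 only, so the vs32+ range
   that the wider kernels reserve for results is available here. */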

.macro SAVE8x1
  slwi    T10, LDC ,   1
  add     T1, CO, LDC
  add     T2, CO, T10
  add     T3, T1, T10
  add     T4, T2, T10
  add     T5, T3, T10
  add     T6, T4, T10
  add     T7, T5, T10
  /* convert alpha_r for multiply */
  xscvspdp  vs4,alpha_r
/* v0 corresponds to vs32, do not forget */
#if !defined(TRMMKERNEL)
  lxssp  v0,0(CO)
  lxssp  v2,0(T1)
  lxssp  v4,0(T2)
  lxssp  v6,0(T3)
  lxssp  v8,0(T4)
  lxssp  v10,0(T5)
  lxssp  v12,0(T6)
  lxssp  v14,0(T7)
#endif
  xscvspdp  vs24, vs0
  xxspltw   vs25, vs0, 1
  xxspltw   vs26, vs0, 2
  xxspltw   vs27, vs0, 3
  xscvspdp  vs25,vs25
  xscvspdp  vs26,vs26
  xscvspdp  vs27,vs27
  xscvspdp  vs28, vs1
  xxspltw   vs29, vs1, 1
  xxspltw   vs30, vs1, 2
  xxspltw   vs31, vs1, 3
  xscvspdp  vs29,vs29
  xscvspdp  vs30,vs30
  xscvspdp  vs31,vs31
#if defined(TRMMKERNEL)
  xsmuldp  vs32,vs27, vs4
  xsmuldp  vs34,vs26, vs4
  xsmuldp  vs36,vs25, vs4
  xsmuldp  vs38,vs24, vs4
  xsmuldp  vs40,vs31, vs4
  xsmuldp  vs42,vs30, vs4
  xsmuldp  vs44,vs29, vs4
  xsmuldp  vs46,vs28, vs4
#else
  xsmaddadp  vs32,vs27, vs4
  xsmaddadp  vs34,vs26, vs4
  xsmaddadp  vs36,vs25, vs4
  xsmaddadp  vs38,vs24, vs4
  xsmaddadp  vs40,vs31, vs4
  xsmaddadp  vs42,vs30, vs4
  xsmaddadp  vs44,vs29, vs4
  xsmaddadp  vs46,vs28, vs4
#endif
  stxssp  v0,0(CO)
  stxssp  v2,0(T1)
  stxssp  v4,0(T2)
  stxssp  v6,0(T3)
  stxssp  v8,0(T4)
  stxssp  v10,0(T5)
  stxssp  v12,0(T6)
  stxssp  v14,0(T7)
  addi CO,CO,4
.endm


/**********************************************************************************************
* Macros for N=4 and M=16
**********************************************************************************************/

.macro LOAD4x16_1
   LOAD4x16 1
.endm

.macro LOAD4x16_0
   LOAD4x16 0
.endm

.macro KERNEL4x16_L1_L4  Index,IsLast
  KERNEL4x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
.endm

.macro KERNEL4x16_I1_L4  OffsetA,OffsetB, Index,IsLast
  KERNEL4x16_L1_L4_I  AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
.endm

.macro KERNEL4x16_I1_L4_2  OffsetA,OffsetB, Index,IsLast
  KERNEL4x16_L1_L4_I  AO,BO,  \OffsetA,\OffsetB,\Index,\IsLast,0
.endm

.macro KERNEL4x16_I1_L4_3  OffsetA,OffsetB, Index,IsLast
  KERNEL4x16_L1_L4_I  AO,BO,  \OffsetA,\OffsetB,\Index,\IsLast,1
.endm

.macro KERNEL4x16_I1_L2_3  OffsetA,OffsetB, Index,IsLast
   KERNEL4x16_L1_L2_I AO,BO,0,  \OffsetA,\OffsetB,\Index,\IsLast,1
.endm

.macro KERNEL4x16_I2_L4_2  AREG,BREG,OffsetA,OffsetB, Index,IsLast
  KERNEL4x16_L1_L4_I  \AREG,\BREG,  \OffsetA,\OffsetB,\Index,\IsLast,0
.endm

.macro KERNEL4x16_I2_L4_3  AREG,BREG,OffsetA,OffsetB, Index,IsLast
  KERNEL4x16_L1_L4_I \AREG,\BREG,  \OffsetA,\OffsetB,\Index,\IsLast,1
.endm

.macro Zero4X16
    xxlxor      vs32,   vs32,   vs32
    xxlxor      vs33,   vs33,   vs33
    xxlxor      vs34,   vs34,   vs34
    xxlxor      vs35,   vs35,   vs35
    xxlxor      vs36,   vs36,   vs36
    xxlxor      vs37,   vs37,   vs37
    xxlxor      vs38,   vs38,   vs38
    xxlxor      vs39,   vs39,   vs39
    xxlxor      vs40,   vs40,   vs40
    xxlxor      vs41,   vs41,   vs41
    xxlxor      vs42,   vs42,   vs42
    xxlxor      vs43,   vs43,   vs43
    xxlxor      vs44,   vs44,   vs44
    xxlxor      vs45,   vs45,   vs45
    xxlxor      vs46,   vs46,   vs46
    xxlxor      vs47,   vs47,   vs47
.endm

.macro LOAD4x16  Zero

    lxv vs24,    0(BO)
    lxv vs0,     0(AO)
    lxv vs1,    16(AO)
    lxv vs2,    32(AO)
    lxv vs3,    48(AO)
    xxperm      vs26,   vs24,   permute_mask
    xxpermdi    vs25,   vs24,   vs24,2
    xxpermdi    vs27,   vs26,   vs26,2

.if \Zero==1
    xxlxor      vs32,   vs32,   vs32
    xxlxor      vs33,   vs33,   vs33
    xxlxor      vs34,   vs34,   vs34
    xxlxor      vs35,   vs35,   vs35
    xxlxor      vs36,   vs36,   vs36
    xxlxor      vs37,   vs37,   vs37
    xxlxor      vs38,   vs38,   vs38
    xxlxor      vs39,   vs39,   vs39
    xxlxor      vs40,   vs40,   vs40
    xxlxor      vs41,   vs41,   vs41
    xxlxor      vs42,   vs42,   vs42
    xxlxor      vs43,   vs43,   vs43
    xxlxor      vs44,   vs44,   vs44
    xxlxor      vs45,   vs45,   vs45
    xxlxor      vs46,   vs46,   vs46
    xxlxor      vs47,   vs47,   vs47
.endif
.endm

.macro END4x16_NORMAL
  END4x16 0, AO, BO, 64,16
.endm

.macro END4x16 First, AREG, BREG, OffsetA, OffsetB

.if \OffsetB != 0
    addi        \BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
    addi        \AREG, \AREG, \OffsetA
.endif

.if \First==1
    xvmulsp     vs32, vs0,vs24
    xvmulsp     vs33, vs1,vs24
    xvmulsp     vs34, vs2,vs24
    xvmulsp     vs35, vs3,vs24

    xvmulsp     vs36, vs0,vs25
    xvmulsp     vs37, vs1,vs25
    xvmulsp     vs38, vs2,vs25
    xvmulsp     vs39, vs3,vs25

    xvmulsp     vs40, vs0,vs26
    xvmulsp     vs41, vs1,vs26
    xvmulsp     vs42, vs2,vs26
    xvmulsp     vs43, vs3,vs26

    xvmulsp     vs44, vs0,vs27
    xvmulsp     vs45, vs1,vs27
    xvmulsp     vs46, vs2,vs27
    xvmulsp     vs47, vs3,vs27
.else
    xvmaddasp       vs32, vs0,vs24
    xvmaddasp       vs33, vs1,vs24
    xvmaddasp       vs34, vs2,vs24
    xvmaddasp       vs35, vs3,vs24

    xvmaddasp       vs36, vs0,vs25
    xvmaddasp       vs37, vs1,vs25
    xvmaddasp       vs38, vs2,vs25
    xvmaddasp       vs39, vs3,vs25

    xvmaddasp       vs40, vs0,vs26
    xvmaddasp       vs41, vs1,vs26
    xvmaddasp       vs42, vs2,vs26
    xvmaddasp       vs43, vs3,vs26

    xvmaddasp       vs44, vs0,vs27
    xvmaddasp       vs45, vs1,vs27
    xvmaddasp       vs46, vs2,vs27
    xvmaddasp       vs47, vs3,vs27
.endif
.endm
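
/* The *_L1_L4_I bodies that follow are software pipelined across four
   k-iterations: loads for the next step are interleaved with the FMA
   groups of the current one, Complete==1 suppresses the trailing loads on
   a loop tail, and IsLast==1 advances AO/BO once at the end. */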

.macro KERNEL4x16_L1_L4_I  AREG,BREG,   OffsetA,OffsetB, Index,IsLast ,Complete

    lxv vs8,    DISP16(\Index, 0+\OffsetB)(\BREG)

    lxv vs4,    DISP64(\Index, 0+\OffsetA)(\AREG)
    lxv vs5,    DISP64(\Index,16+\OffsetA)(\AREG)
    lxv vs6,    DISP64(\Index,32+\OffsetA)(\AREG)
    lxv vs7,    DISP64(\Index,48+\OffsetA)(\AREG)

    xxperm      vs10,   vs8,    permute_mask
    xxpermdi    vs9,    vs8,    vs8,2

    xvmaddasp       vs32, vs0,vs24
    xvmaddasp       vs33, vs1,vs24
    xvmaddasp       vs34, vs2,vs24
    xvmaddasp       vs35, vs3,vs24

    xvmaddasp       vs36, vs0,vs25
    xvmaddasp       vs37, vs1,vs25
    xvmaddasp       vs38, vs2,vs25
    xvmaddasp       vs39, vs3,vs25

    xxpermdi    vs11,   vs10,   vs10,2

    xvmaddasp       vs40, vs0,vs26
    xvmaddasp       vs41, vs1,vs26
    xvmaddasp       vs42, vs2,vs26
    xvmaddasp       vs43, vs3,vs26

    xvmaddasp       vs44, vs0,vs27
    xvmaddasp       vs45, vs1,vs27
    xvmaddasp       vs46, vs2,vs27
    xvmaddasp       vs47, vs3,vs27

    lxv vs24,   DISP16(\Index,16+\OffsetB)(\BREG)

    lxv vs0,    DISP64(\Index,64+\OffsetA)(\AREG)
    lxv vs1,    DISP64(\Index,64+16+\OffsetA)(\AREG)
    lxv vs2,    DISP64(\Index,64+32+\OffsetA)(\AREG)
    lxv vs3,    DISP64(\Index,64+48+\OffsetA)(\AREG)

    xxperm      vs26,   vs24,   permute_mask
    xxpermdi    vs25,   vs24,   vs24,2

    xvmaddasp       vs32, vs4,vs8
    xvmaddasp       vs33, vs5,vs8
    xvmaddasp       vs34, vs6,vs8
    xvmaddasp       vs35, vs7,vs8

    xvmaddasp       vs36, vs4,vs9
    xvmaddasp       vs37, vs5,vs9
    xvmaddasp       vs38, vs6,vs9
    xvmaddasp       vs39, vs7,vs9

    xxpermdi    vs27,   vs26,   vs26,2

    xvmaddasp       vs40, vs4,vs10
    xvmaddasp       vs41, vs5,vs10
    xvmaddasp       vs42, vs6,vs10
    xvmaddasp       vs43, vs7,vs10

    xvmaddasp       vs44, vs4,vs11
    xvmaddasp       vs45, vs5,vs11
    xvmaddasp       vs46, vs6,vs11
    xvmaddasp       vs47, vs7,vs11

    lxv vs8,    DISP16(\Index,32+\OffsetB)(\BREG)

    lxv vs4,    DISP64(\Index,128+0+\OffsetA)(\AREG)
    lxv vs5,    DISP64(\Index,128+16+\OffsetA)(\AREG)
    lxv vs6,    DISP64(\Index,128+32+\OffsetA)(\AREG)
    lxv vs7,    DISP64(\Index,128+48+\OffsetA)(\AREG)

    xxperm      vs10,   vs8,    permute_mask
    xxpermdi    vs9,    vs8,    vs8,2

    xvmaddasp       vs32, vs0,vs24
    xvmaddasp       vs33, vs1,vs24
    xvmaddasp       vs34, vs2,vs24
    xvmaddasp       vs35, vs3,vs24

    xvmaddasp       vs36, vs0,vs25
    xvmaddasp       vs37, vs1,vs25
    xvmaddasp       vs38, vs2,vs25
    xvmaddasp       vs39, vs3,vs25

    xxpermdi    vs11,   vs10,   vs10,2

    xvmaddasp       vs40, vs0,vs26
    xvmaddasp       vs41, vs1,vs26
    xvmaddasp       vs42, vs2,vs26
    xvmaddasp       vs43, vs3,vs26

    xvmaddasp       vs44, vs0,vs27
    xvmaddasp       vs45, vs1,vs27
    xvmaddasp       vs46, vs2,vs27
    xvmaddasp       vs47, vs3,vs27

.if \Complete==0
    lxv vs24,   DISP16(\Index,48+\OffsetB)(\BREG)

    lxv vs0,    DISP64(\Index,192+\OffsetA)(\AREG)
    lxv vs1,    DISP64(\Index,192+16+\OffsetA)(\AREG)
    lxv vs2,    DISP64(\Index,192+32+\OffsetA)(\AREG)
    lxv vs3,    DISP64(\Index,192+48+\OffsetA)(\AREG)

    xxperm      vs26,   vs24,   permute_mask
    xxpermdi    vs25,   vs24,   vs24,2
.endif
.if \IsLast==1
.if \Complete==1
    addi        \BREG, \BREG, DISP16(\Index,16*3+\OffsetB)
    addi        \AREG, \AREG, DISP64(\Index,64*3+\OffsetA)
.else
    addi        \BREG, \BREG, DISP16(\Index,64)
    addi        \AREG, \AREG, DISP64(\Index,256)
.endif
.endif

    xvmaddasp       vs32, vs4,vs8
    xvmaddasp       vs33, vs5,vs8
    xvmaddasp       vs34, vs6,vs8
    xvmaddasp       vs35, vs7,vs8

    xvmaddasp       vs36, vs4,vs9
    xvmaddasp       vs37, vs5,vs9
    xvmaddasp       vs38, vs6,vs9
    xvmaddasp       vs39, vs7,vs9

.if \Complete==0
    xxpermdi    vs27,   vs26,   vs26,2
.endif

    xvmaddasp       vs40, vs4,vs10
    xvmaddasp       vs41, vs5,vs10
    xvmaddasp       vs42, vs6,vs10
    xvmaddasp       vs43, vs7,vs10

    xvmaddasp       vs44, vs4,vs11
    xvmaddasp       vs45, vs5,vs11
    xvmaddasp       vs46, vs6,vs11
    xvmaddasp       vs47, vs7,vs11

.endm

.macro KERNEL4x16 First

  LOAD4x16 0
  END4x16 \First, AO, BO, 64,16
.endm

.macro KERNEL4x16_L1_L2_I  AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete

    lxv vs8,     DISP8(\Index, 0+\OffsetB)(\BREG)
    lxv vs4,    DISP32(\Index, 0+\OffsetA)(\AREG)
    lxv vs5,    DISP32(\Index,16+\OffsetA)(\AREG)
    lxv vs6,    DISP32(\Index,32+\OffsetA)(\AREG)
    lxv vs7,    DISP32(\Index,48+\OffsetA)(\AREG)

    xxperm      vs10,   vs8,    permute_mask
    xxpermdi    vs9,    vs8,    vs8,2
.if \First==1
    xvmulsp     vs32, vs0,vs24
    xvmulsp     vs33, vs1,vs24
    xvmulsp     vs34, vs2,vs24
    xvmulsp     vs35, vs3,vs24

    xvmulsp     vs36, vs0,vs25
    xvmulsp     vs37, vs1,vs25
    xvmulsp     vs38, vs2,vs25
    xvmulsp     vs39, vs3,vs25
.else
    xvmaddasp       vs32, vs0,vs24
    xvmaddasp       vs33, vs1,vs24
    xvmaddasp       vs34, vs2,vs24
    xvmaddasp       vs35, vs3,vs24

    xvmaddasp       vs36, vs0,vs25
    xvmaddasp       vs37, vs1,vs25
    xvmaddasp       vs38, vs2,vs25
    xvmaddasp       vs39, vs3,vs25
.endif

    xxpermdi    vs11,   vs10,   vs10,2

.if \First==1
    xvmulsp     vs40, vs0,vs26
    xvmulsp     vs41, vs1,vs26
    xvmulsp     vs42, vs2,vs26
    xvmulsp     vs43, vs3,vs26

    xvmulsp     vs44, vs0,vs27
    xvmulsp     vs45, vs1,vs27
    xvmulsp     vs46, vs2,vs27
    xvmulsp     vs47, vs3,vs27
.else
    xvmaddasp       vs40, vs0,vs26
    xvmaddasp       vs41, vs1,vs26
    xvmaddasp       vs42, vs2,vs26
    xvmaddasp       vs43, vs3,vs26

    xvmaddasp       vs44, vs0,vs27
    xvmaddasp       vs45, vs1,vs27
    xvmaddasp       vs46, vs2,vs27
    xvmaddasp       vs47, vs3,vs27
.endif
.if \Complete==0
    lxv vs24,    DISP8(\Index,16+\OffsetB)(\BREG)
    lxv vs0,    DISP32(\Index,64+\OffsetA)(\AREG)
    lxv vs1,    DISP32(\Index,64+16+\OffsetA)(\AREG)
    lxv vs2,    DISP32(\Index,64+32+\OffsetA)(\AREG)
    lxv vs3,    DISP32(\Index,64+48+\OffsetA)(\AREG)

    xxperm      vs26,   vs24,   permute_mask
    xxpermdi    vs25,   vs24,   vs24,2
.endif
.if \IsLast==1
.if \Complete==1
    addi        \BREG, \BREG,  DISP8(\Index,16+\OffsetB)
    addi        \AREG, \AREG, DISP32(\Index,64+\OffsetA)
.else
    addi        \BREG, \BREG,  DISP8(\Index,32)
    addi        \AREG, \AREG, DISP32(\Index,128)
.endif
.endif

/* the second unrolled k-iteration must accumulate even when \First==1,
   otherwise the products of the first iteration would be overwritten */
    xvmaddasp       vs32, vs4,vs8
    xvmaddasp       vs33, vs5,vs8
    xvmaddasp       vs34, vs6,vs8
    xvmaddasp       vs35, vs7,vs8

    xvmaddasp       vs36, vs4,vs9
    xvmaddasp       vs37, vs5,vs9
    xvmaddasp       vs38, vs6,vs9
    xvmaddasp       vs39, vs7,vs9

.if \Complete==0
    xxpermdi    vs27,   vs26,   vs26,2
.endif

    xvmaddasp       vs40, vs4,vs10
    xvmaddasp       vs41, vs5,vs10
    xvmaddasp       vs42, vs6,vs10
    xvmaddasp       vs43, vs7,vs10

    xvmaddasp       vs44, vs4,vs11
    xvmaddasp       vs45, vs5,vs11
    xvmaddasp       vs46, vs6,vs11
    xvmaddasp       vs47, vs7,vs11

.endm
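
/* SAVE4x16 interleaves the word-transpose network (xxmrglw/xxmrghw plus
   the save_permute_1/2 masks) with the loads of C, apparently so shuffle
   work overlaps load latency before the single alpha pass at the end. */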

.macro SAVE4x16

  slwi    T10, LDC ,   1
  add     T1, CO, LDC

  add     T2, CO, T10
  add     T3, T1, T10

    xxmrglw     vs8,    vs32,   vs44
    xxmrglw     vs10,   vs36,   vs40

    xxmrghw     vs1,    vs32,   vs44
    xxmrghw     vs0,    vs36,   vs40

    xxmrglw     vs12,   vs33,   vs45
    xxmrglw     vs14,   vs37,   vs41

    xxmrghw     vs2,    vs37,   vs41
    xxmrghw     vs3,    vs33,   vs45

    xxmrglw     vs16,   vs34,   vs46
    xxmrglw     vs18,   vs38,   vs42

    xxlor      vs9,     vs8,    vs8
    xxlor      vs11,    vs10,   vs10

    xxmrghw     vs4,    vs38,   vs42
    xxmrghw     vs5,    vs34,   vs46

    xxlor      vs13,    vs12,   vs12
    xxlor      vs15,    vs14,   vs14

    xxmrglw     vs24,   vs35,   vs47
    xxmrglw     vs26,   vs39,   vs43

    xxlor      vs17,    vs16,   vs16
    xxlor      vs19,    vs18,   vs18

    xxmrghw     vs30,   vs39,   vs43
    xxmrghw     vs31,   vs35,   vs47

    xxperm      vs8,    vs0,    save_permute_1
    xxperm      vs10,   vs1,    save_permute_1
    xxperm      vs9,    vs0,    save_permute_2
    xxperm      vs11,   vs1,    save_permute_2

#ifndef TRMMKERNEL
    lxv        vs32, 0(CO)
    lxv        vs33, 16(CO)
    lxv        vs34, 32(CO)
    lxv        vs35, 48(CO)
#endif
    xxlor      vs25,    vs24,   vs24
    xxlor      vs27,    vs26,   vs26

#ifndef TRMMKERNEL
    lxv        vs36, 0(T1)
    lxv        vs37, 16(T1)
    lxv        vs38, 32(T1)
    lxv        vs39, 48(T1)
#endif
#ifndef TRMMKERNEL
    lxv        vs40, 0(T2)
    lxv        vs41, 16(T2)
    lxv        vs42, 32(T2)
    lxv        vs43, 48(T2)
#endif
#ifndef TRMMKERNEL
    lxv        vs44, 0(T3)
    lxv        vs45, 16(T3)
    lxv        vs46, 32(T3)
    lxv        vs47, 48(T3)
#endif

    xxperm     vs12,    vs2,    save_permute_1
    xxperm     vs14,    vs3,    save_permute_1

    xxperm     vs13,    vs2,    save_permute_2
    xxperm     vs15,    vs3,    save_permute_2

    xxperm     vs16,    vs4,    save_permute_1
    xxperm     vs18,    vs5,    save_permute_1

    xxperm     vs17,    vs4,    save_permute_2
    xxperm     vs19,    vs5,    save_permute_2

    xxperm     vs24,    vs30,   save_permute_1
    xxperm     vs26,    vs31,   save_permute_1

    xxperm     vs25,    vs30,   save_permute_2
    xxperm     vs27,    vs31,   save_permute_2

    /* multiply add normal way */

#ifdef TRMMKERNEL
    xvmulsp     vs32,   vs8,    alpha_r
    xvmulsp     vs33,   vs12,   alpha_r
    xvmulsp     vs34,   vs16,   alpha_r
    xvmulsp     vs35,   vs24,   alpha_r
    xvmulsp     vs36,   vs9,    alpha_r
    xvmulsp     vs37,   vs13,   alpha_r
    xvmulsp     vs38,   vs17,   alpha_r
    xvmulsp     vs39,   vs25,   alpha_r
#else
    xvmaddasp   vs32,   vs8,    alpha_r
    xvmaddasp   vs33,   vs12,   alpha_r
    xvmaddasp   vs34,   vs16,   alpha_r
    xvmaddasp   vs35,   vs24,   alpha_r
    xvmaddasp   vs36,   vs9,    alpha_r
    xvmaddasp   vs37,   vs13,   alpha_r
    xvmaddasp   vs38,   vs17,   alpha_r
    xvmaddasp   vs39,   vs25,   alpha_r
#endif

#ifdef TRMMKERNEL
    xvmulsp     vs40,   vs10,   alpha_r
    xvmulsp     vs41,   vs14,   alpha_r
    xvmulsp     vs42,   vs18,   alpha_r
    xvmulsp     vs43,   vs26,   alpha_r
    xvmulsp     vs44,   vs11,   alpha_r
    xvmulsp     vs45,   vs15,   alpha_r
    xvmulsp     vs46,   vs19,   alpha_r
    xvmulsp     vs47,   vs27,   alpha_r
#else
    xvmaddasp   vs40,   vs10,   alpha_r
    xvmaddasp   vs41,   vs14,   alpha_r
    xvmaddasp   vs42,   vs18,   alpha_r
    xvmaddasp   vs43,   vs26,   alpha_r
    xvmaddasp   vs44,   vs11,   alpha_r
    xvmaddasp   vs45,   vs15,   alpha_r
    xvmaddasp   vs46,   vs19,   alpha_r
    xvmaddasp   vs47,   vs27,   alpha_r
#endif

    stxv        vs32, 0(CO)
    stxv        vs33, 16(CO)
    stxv        vs34, 32(CO)
    stxv        vs35, 48(CO)

    stxv        vs36, 0(T1)
    stxv        vs37, 16(T1)
    stxv        vs38, 32(T1)
    stxv        vs39, 48(T1)

    stxv        vs40, 0(T2)
    stxv        vs41, 16(T2)
    stxv        vs42, 32(T2)
    stxv        vs43, 48(T2)
    stxv        vs44, 0(T3)
    stxv        vs45, 16(T3)
    stxv        vs46, 32(T3)
    stxv        vs47, 48(T3)

    addi CO,CO,64

.endm


/**********************************************************************************************
* Macros for N=4 and M=8
**********************************************************************************************/

.macro LOAD4x8_1
   LOAD4x8 1
.endm

.macro LOAD4x8_0
   LOAD4x8 0
.endm

.macro KERNEL4x8_L1_L4  Index,IsLast
  KERNEL4x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
.endm

.macro KERNEL4x8_I1_L4  OffsetA,OffsetB, Index,IsLast
  KERNEL4x8_L1_L4_I  AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
.endm

.macro KERNEL4x8_I1_L4_2  OffsetA,OffsetB, Index,IsLast
  KERNEL4x8_L1_L4_I  AO,BO,  \OffsetA,\OffsetB,\Index,\IsLast,0
.endm

.macro KERNEL4x8_I1_L4_3  OffsetA,OffsetB, Index,IsLast
  KERNEL4x8_L1_L4_I  AO,BO,  \OffsetA,\OffsetB,\Index,\IsLast,1
.endm

.macro KERNEL4x8_I1_L2_3  OffsetA,OffsetB, Index,IsLast
   KERNEL4x8_L1_L2_I AO,BO,0,  \OffsetA,\OffsetB,\Index,\IsLast,1
.endm

.macro KERNEL4x8_I2_L4_2  AREG,BREG,OffsetA,OffsetB, Index,IsLast
  KERNEL4x8_L1_L4_I  \AREG,\BREG,  \OffsetA,\OffsetB,\Index,\IsLast,0
.endm

.macro KERNEL4x8_I2_L4_3  AREG,BREG,OffsetA,OffsetB, Index,IsLast
  KERNEL4x8_L1_L4_I \AREG,\BREG,  \OffsetA,\OffsetB,\Index,\IsLast,1
.endm

.macro END4x8_NORMAL
  END4x8 0, AO, BO, 32,16
.endm

.macro Zero4X8
    xxlxor      vs32,   vs32,   vs32
    xxlxor      vs33,   vs33,   vs33

    xxlxor      vs36,   vs36,   vs36
    xxlxor      vs37,   vs37,   vs37

    xxlxor      vs40,   vs40,   vs40
    xxlxor      vs41,   vs41,   vs41

    xxlxor      vs44,   vs44,   vs44
    xxlxor      vs45,   vs45,   vs45
.endm

.macro LOAD4x8  Zero

    lxv vs24,    0(BO)
    lxv vs0,     0(AO)
    lxv vs1,    16(AO)

    xxperm      vs26,   vs24,   permute_mask
    xxpermdi    vs25,   vs24,   vs24,2

    xxpermdi    vs27,   vs26,   vs26,2

.if \Zero==1
    xxlxor      vs32,   vs32,   vs32
    xxlxor      vs33,   vs33,   vs33
    xxlxor      vs36,   vs36,   vs36
    xxlxor      vs37,   vs37,   vs37
    xxlxor      vs40,   vs40,   vs40
    xxlxor      vs41,   vs41,   vs41
    xxlxor      vs44,   vs44,   vs44
    xxlxor      vs45,   vs45,   vs45
.endif
.endm

.macro END4x8 First, AREG, BREG, OffsetA, OffsetB

.if \OffsetB != 0
    addi        \BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
    addi        \AREG, \AREG, \OffsetA
.endif

.if \First==1
    xvmulsp     vs32, vs0,vs24
    xvmulsp     vs33, vs1,vs24

    xvmulsp     vs36, vs0,vs25
    xvmulsp     vs37, vs1,vs25

    xvmulsp     vs40, vs0,vs26
    xvmulsp     vs41, vs1,vs26

    xvmulsp     vs44, vs0,vs27
    xvmulsp     vs45, vs1,vs27
.else
    xvmaddasp       vs32, vs0,vs24
    xvmaddasp       vs33, vs1,vs24

    xvmaddasp       vs36, vs0,vs25
    xvmaddasp       vs37, vs1,vs25

    xvmaddasp       vs40, vs0,vs26
    xvmaddasp       vs41, vs1,vs26

    xvmaddasp       vs44, vs0,vs27
    xvmaddasp       vs45, vs1,vs27
.endif
.endm

.macro KERNEL4x8_L1_L4_I  AREG,BREG,   OffsetA,OffsetB, Index,IsLast ,Complete

    lxv vs8,    DISP16(\Index, 0+\OffsetB)(\BREG)

    lxv vs4,    DISP32(\Index, 0+\OffsetA)(\AREG)
    lxv vs5,    DISP32(\Index,16+\OffsetA)(\AREG)

    xxperm      vs10,   vs8,    permute_mask
    xxpermdi    vs9,    vs8,    vs8,2

    xvmaddasp       vs32, vs0,vs24
    xvmaddasp       vs33, vs1,vs24

    xvmaddasp       vs36, vs0,vs25
    xvmaddasp       vs37, vs1,vs25

    xxpermdi    vs11,   vs10,   vs10,2

    xvmaddasp       vs40, vs0,vs26
    xvmaddasp       vs41, vs1,vs26

    xvmaddasp       vs44, vs0,vs27
    xvmaddasp       vs45, vs1,vs27

    lxv vs24,   DISP16(\Index,16+\OffsetB)(\BREG)

    lxv vs0,    DISP32(\Index,32+\OffsetA)(\AREG)
    lxv vs1,    DISP32(\Index,32+16+\OffsetA)(\AREG)

    xxperm      vs26,   vs24,   permute_mask
    xxpermdi    vs25,   vs24,   vs24,2

    xvmaddasp       vs32, vs4,vs8
    xvmaddasp       vs33, vs5,vs8

    xvmaddasp       vs36, vs4,vs9
    xvmaddasp       vs37, vs5,vs9

    xxpermdi    vs27,   vs26,   vs26,2

    xvmaddasp       vs40, vs4,vs10
    xvmaddasp       vs41, vs5,vs10

    xvmaddasp       vs44, vs4,vs11
    xvmaddasp       vs45, vs5,vs11

    lxv vs8,    DISP16(\Index,32+\OffsetB)(\BREG)

    lxv vs4,    DISP32(\Index,64+0+\OffsetA)(\AREG)
    lxv vs5,    DISP32(\Index,64+16+\OffsetA)(\AREG)

    xxperm      vs10,   vs8,    permute_mask
    xxpermdi    vs9,    vs8,    vs8,2

    xvmaddasp       vs32, vs0,vs24
    xvmaddasp       vs33, vs1,vs24

    xvmaddasp       vs36, vs0,vs25
    xvmaddasp       vs37, vs1,vs25

    xxpermdi    vs11,   vs10,   vs10,2

    xvmaddasp       vs40, vs0,vs26
    xvmaddasp       vs41, vs1,vs26

    xvmaddasp       vs44, vs0,vs27
    xvmaddasp       vs45, vs1,vs27

.if \Complete==0
    lxv vs24,   DISP16(\Index,48+\OffsetB)(\BREG)

    lxv vs0,    DISP32(\Index,96+\OffsetA)(\AREG)
    lxv vs1,    DISP32(\Index,96+16+\OffsetA)(\AREG)

    xxperm      vs26,   vs24,   permute_mask
    xxpermdi    vs25,   vs24,   vs24,2
.endif
.if \IsLast==1
.if \Complete==1
    addi        \BREG, \BREG, DISP16(\Index,16*3+\OffsetB)
    addi        \AREG, \AREG, DISP32(\Index,32*3+\OffsetA)
.else
    addi        \BREG, \BREG, DISP16(\Index,64)
    addi        \AREG, \AREG, DISP32(\Index,128)
.endif
.endif

    xvmaddasp       vs32, vs4,vs8
    xvmaddasp       vs33, vs5,vs8

    xvmaddasp       vs36, vs4,vs9
    xvmaddasp       vs37, vs5,vs9

.if \Complete==0
    xxpermdi    vs27,   vs26,   vs26,2
.endif

    xvmaddasp       vs40, vs4,vs10
    xvmaddasp       vs41, vs5,vs10

    xvmaddasp       vs44, vs4,vs11
    xvmaddasp       vs45, vs5,vs11

.endm

.macro KERNEL4x8 First

  LOAD4x8 0
  END4x8 \First, AO, BO, 32,16
.endm

.macro KERNEL4x8_L1_L2_I  AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete

    lxv vs8,     DISP8(\Index, 0+\OffsetB)(\BREG)
    lxv vs4,    DISP16(\Index, 0+\OffsetA)(\AREG)
    lxv vs5,    DISP16(\Index,16+\OffsetA)(\AREG)

    xxperm      vs10,   vs8,    permute_mask
    xxpermdi    vs9,    vs8,    vs8,2
.if \First==1
    xvmulsp     vs32, vs0,vs24
    xvmulsp     vs33, vs1,vs24

    xvmulsp     vs36, vs0,vs25
    xvmulsp     vs37, vs1,vs25
.else
    xvmaddasp       vs32, vs0,vs24
    xvmaddasp       vs33, vs1,vs24

    xvmaddasp       vs36, vs0,vs25
    xvmaddasp       vs37, vs1,vs25
.endif

    xxpermdi    vs11,   vs10,   vs10,2

.if \First==1
    xvmulsp     vs40, vs0,vs26
    xvmulsp     vs41, vs1,vs26

    xvmulsp     vs44, vs0,vs27
    xvmulsp     vs45, vs1,vs27
.else
    xvmaddasp       vs40, vs0,vs26
    xvmaddasp       vs41, vs1,vs26

    xvmaddasp       vs44, vs0,vs27
    xvmaddasp       vs45, vs1,vs27
.endif
.if \Complete==0
    lxv vs24,    DISP8(\Index,16+\OffsetB)(\BREG)

    lxv vs0,    DISP16(\Index,32+\OffsetA)(\AREG)
    lxv vs1,    DISP16(\Index,32+16+\OffsetA)(\AREG)

    xxperm      vs26,   vs24,   permute_mask
    xxpermdi    vs25,   vs24,   vs24,2
.endif
.if \IsLast==1
.if \Complete==1
    addi        \BREG, \BREG,   DISP8(\Index,16+\OffsetB)
    addi        \AREG, \AREG,  DISP16(\Index,32+\OffsetA)
.else
    addi        \BREG, \BREG,   DISP8(\Index,32)
    addi        \AREG, \AREG,  DISP16(\Index,64)
.endif
.endif

/* the second unrolled k-iteration must accumulate even when \First==1,
   otherwise the products of the first iteration would be overwritten */
    xvmaddasp       vs32, vs4,vs8
    xvmaddasp       vs33, vs5,vs8

    xvmaddasp       vs36, vs4,vs9
    xvmaddasp       vs37, vs5,vs9

.if \Complete==0
    xxpermdi    vs27,   vs26,   vs26,2
.endif

    xvmaddasp       vs40, vs4,vs10
    xvmaddasp       vs41, vs5,vs10

    xvmaddasp       vs44, vs4,vs11
    xvmaddasp       vs45, vs5,vs11

.endm

.macro SAVE4x8

  slwi    T10, LDC ,   1
  add     T1, CO, LDC

  add     T2, CO, T10
  add     T3, T1, T10

#ifndef TRMMKERNEL
    lxv        vs34, 0(CO)
    lxv        vs35, 16(CO)
    lxv        vs38, 0(T1)
    lxv        vs39, 16(T1)
    lxv        vs42, 0(T2)
    lxv        vs43, 16(T2)
    lxv        vs46, 0(T3)
    lxv        vs47, 16(T3)
#endif

    xxmrglw     vs8,    vs32,   vs44
    xxmrglw     vs10,   vs36,   vs40

    xxmrghw     vs1,    vs32,   vs44
    xxmrghw     vs0,    vs36,   vs40

    xxmrglw     vs12,   vs33,   vs45
    xxmrglw     vs14,   vs37,   vs41

    xxmrghw     vs2,    vs37,   vs41
    xxmrghw     vs3,    vs33,   vs45

    xxlor      vs9,     vs8,    vs8
    xxlor      vs11,    vs10,   vs10

    xxlor      vs13,    vs12,   vs12
    xxlor      vs15,    vs14,   vs14

    xxperm      vs8,    vs0,    save_permute_1
    xxperm      vs10,   vs1,    save_permute_1
    xxperm      vs9,    vs0,    save_permute_2
    xxperm      vs11,   vs1,    save_permute_2

    xxperm     vs12,    vs2,    save_permute_1
    xxperm     vs14,    vs3,    save_permute_1

    xxperm     vs13,    vs2,    save_permute_2
    xxperm     vs15,    vs3,    save_permute_2

    /* multiply add normal way */

#ifdef TRMMKERNEL
    xvmulsp     vs34,   vs8,    alpha_r
    xvmulsp     vs35,   vs12,   alpha_r
    xvmulsp     vs38,   vs9,    alpha_r
    xvmulsp     vs39,   vs13,   alpha_r
    xvmulsp     vs42,   vs10,   alpha_r
    xvmulsp     vs43,   vs14,   alpha_r
    xvmulsp     vs46,   vs11,   alpha_r
    xvmulsp     vs47,   vs15,   alpha_r
#else
    xvmaddasp   vs34,   vs8,    alpha_r
    xvmaddasp   vs35,   vs12,   alpha_r
    xvmaddasp   vs38,   vs9,    alpha_r
    xvmaddasp   vs39,   vs13,   alpha_r
    xvmaddasp   vs42,   vs10,   alpha_r
    xvmaddasp   vs43,   vs14,   alpha_r
    xvmaddasp   vs46,   vs11,   alpha_r
    xvmaddasp   vs47,   vs15,   alpha_r
#endif

    stxv        vs34, 0(CO)
    stxv        vs35, 16(CO)
    stxv        vs38, 0(T1)
    stxv        vs39, 16(T1)
    stxv        vs42, 0(T2)
    stxv        vs43, 16(T2)
    stxv        vs46, 0(T3)
    stxv        vs47, 16(T3)

    addi CO,CO,32

.endm


/**********************************************************************************************
* Macros for N=4 and M=4
**********************************************************************************************/

.macro LOAD4x4_1
   LOAD4x4 1
.endm

.macro LOAD4x4_0
   LOAD4x4 0
.endm

.macro KERNEL4x4_L1_L4  Index,IsLast
  KERNEL4x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
.endm

.macro KERNEL4x4_I1_L4  OffsetA,OffsetB, Index,IsLast
  KERNEL4x4_L1_L4_I  AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
.endm

.macro KERNEL4x4_I1_L4_2  OffsetA,OffsetB, Index,IsLast
  KERNEL4x4_L1_L4_I  AO,BO,  \OffsetA,\OffsetB,\Index,\IsLast,0
.endm

.macro KERNEL4x4_I1_L4_3  OffsetA,OffsetB, Index,IsLast
  KERNEL4x4_L1_L4_I  AO,BO,  \OffsetA,\OffsetB,\Index,\IsLast,1
.endm

.macro KERNEL4x4_I1_L2_3  OffsetA,OffsetB, Index,IsLast
   KERNEL4x4_L1_L2_I AO,BO,0,  \OffsetA,\OffsetB,\Index,\IsLast,1
.endm

.macro KERNEL4x4_I2_L4_2  AREG,BREG,OffsetA,OffsetB, Index,IsLast
  KERNEL4x4_L1_L4_I  \AREG,\BREG,  \OffsetA,\OffsetB,\Index,\IsLast,0
.endm

.macro KERNEL4x4_I2_L4_3  AREG,BREG,OffsetA,OffsetB, Index,IsLast
  KERNEL4x4_L1_L4_I \AREG,\BREG,  \OffsetA,\OffsetB,\Index,\IsLast,1
.endm

.macro Zero4X4
    xxlxor      vs32,   vs32,   vs32
    xxlxor      vs33,   vs33,   vs33
    xxlxor      vs34,   vs34,   vs34
    xxlxor      vs35,   vs35,   vs35
.endm

.macro LOAD4x4  Zero

    lxv vs0,     0(AO)
    lxv vs24,    0(BO)

    xxperm      vs2,   vs0,       permute_mask
    xxpermdi    vs1,   vs0,   vs0,2
    xxpermdi    vs3,   vs2,   vs2,2

.if \Zero==1
    xxlxor      vs32,   vs32,   vs32
    xxlxor      vs33,   vs33,   vs33
    xxlxor      vs34,   vs34,   vs34
    xxlxor      vs35,   vs35,   vs35
.endif
.endm

.macro END4x4_NORMAL
  END4x4 0, AO, BO, 16,16
.endm

.macro END4x4 First, AREG, BREG, OffsetA, OffsetB

.if \OffsetB != 0
    addi        \BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
    addi        \AREG, \AREG, \OffsetA
.endif

.if \First==1
    xvmulsp      vs32,   vs24,   vs0
    xvmulsp      vs33,   vs24,   vs1
    xvmulsp      vs34,   vs24,   vs2
    xvmulsp      vs35,   vs24,   vs3
.else
    xvmaddasp      vs32,   vs24,   vs0
    xvmaddasp      vs33,   vs24,   vs1
    xvmaddasp      vs34,   vs24,   vs2
    xvmaddasp      vs35,   vs24,   vs3
.endif
.endm

.macro KERNEL4x4_L1_L4_I  AREG,BREG,   OffsetA,OffsetB, Index,IsLast ,Complete

    lxv vs4,    DISP16(\Index, 0+\OffsetA)(\AREG)
    lxv vs26,   DISP16(\Index, 0+\OffsetB)(\BREG)

    xxperm      vs6,   vs4,       permute_mask
    xxpermdi    vs5,   vs4,   vs4,2
    xxpermdi    vs7,   vs6,   vs6,2

    xvmaddasp      vs32,   vs24,   vs0
    xvmaddasp      vs33,   vs24,   vs1
    xvmaddasp      vs34,   vs24,   vs2
    xvmaddasp      vs35,   vs24,   vs3

    lxv vs0,    DISP16(\Index, 16+\OffsetA)(\AREG)
    lxv vs24,   DISP16(\Index, 16+\OffsetB)(\BREG)

    xxperm      vs2,   vs0,       permute_mask
    xxpermdi    vs1,   vs0,   vs0,2
    xxpermdi    vs3,   vs2,   vs2,2

    xvmaddasp      vs32,   vs26,   vs4
    xvmaddasp      vs33,   vs26,   vs5
    xvmaddasp      vs34,   vs26,   vs6
    xvmaddasp      vs35,   vs26,   vs7

    lxv vs4,    DISP16(\Index, 32+\OffsetA)(\AREG)
    lxv vs26,   DISP16(\Index, 32+\OffsetB)(\BREG)

    xxperm      vs6,   vs4,       permute_mask
    xxpermdi    vs5,   vs4,   vs4,2
    xxpermdi    vs7,   vs6,   vs6,2

    xvmaddasp      vs32,   vs24,   vs0
    xvmaddasp      vs33,   vs24,   vs1
    xvmaddasp      vs34,   vs24,   vs2
    xvmaddasp      vs35,   vs24,   vs3

.if \Complete==0
    lxv vs0,    DISP16(\Index, 48+\OffsetA)(\AREG)
    lxv vs24,   DISP16(\Index, 48+\OffsetB)(\BREG)

    xxperm      vs2,   vs0,       permute_mask
    xxpermdi    vs1,   vs0,   vs0,2
    xxpermdi    vs3,   vs2,   vs2,2
.endif
    xvmaddasp      vs32,   vs26,   vs4
    xvmaddasp      vs33,   vs26,   vs5
    xvmaddasp      vs34,   vs26,   vs6
    xvmaddasp      vs35,   vs26,   vs7

.if \IsLast==1
.if \Complete==1
    addi        \AREG, \AREG, DISP16(\Index,16*3+\OffsetA)
    addi        \BREG, \BREG, DISP16(\Index,16*3+\OffsetB)
.else
    addi        \AREG, \AREG, DISP16(\Index,64)
    addi        \BREG, \BREG, DISP16(\Index,64)
.endif
.endif

.endm

.macro KERNEL4x4 First
    LOAD4x4 0
    END4x4 \First, AO, BO, 16,16
.endm

.macro KERNEL4x4_L1_L2_I  AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete

    lxv vs4,    DISP8(\Index, 0+\OffsetA)(\AREG)
    lxv vs26,   DISP8(\Index, 0+\OffsetB)(\BREG)

    xxperm      vs6,   vs4,       permute_mask
    xxpermdi    vs5,   vs4,   vs4,2
    xxpermdi    vs7,   vs6,   vs6,2
.if \First==1
    xvmulsp      vs32,   vs24,   vs0
    xvmulsp      vs33,   vs24,   vs1
    xvmulsp      vs34,   vs24,   vs2
    xvmulsp      vs35,   vs24,   vs3
.else
    xvmaddasp      vs32,   vs24,   vs0
    xvmaddasp      vs33,   vs24,   vs1
    xvmaddasp      vs34,   vs24,   vs2
    xvmaddasp      vs35,   vs24,   vs3
.endif

.if \Complete==0
    lxv vs0,    DISP8(\Index, 16+\OffsetA)(\AREG)
    lxv vs24,   DISP8(\Index, 16+\OffsetB)(\BREG)

    xxperm      vs2,   vs0,       permute_mask
    xxpermdi    vs1,   vs0,   vs0,2
    xxpermdi    vs3,   vs2,   vs2,2
.endif

/* the second unrolled k-iteration must accumulate even when \First==1,
   otherwise the products of the first iteration would be overwritten */
    xvmaddasp      vs32,   vs26,   vs4
    xvmaddasp      vs33,   vs26,   vs5
    xvmaddasp      vs34,   vs26,   vs6
    xvmaddasp      vs35,   vs26,   vs7

.if \IsLast==1
.if \Complete==1
    addi        \AREG, \AREG, DISP8(\Index,16+\OffsetA)
    addi        \BREG, \BREG, DISP8(\Index,16+\OffsetB)
.else
    addi        \AREG, \AREG, DISP8(\Index,32)
    addi        \BREG, \BREG, DISP8(\Index,32)
.endif
.endif

.endm
 | |
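/* SAVE4x4: vs32..vs35 hold the rotated 4x4 tile; the xxmrglw/xxmrghw and
   xxmrgld/xxmrghd merges below rearrange it into four row vectors
   (vs24..vs27), which are scaled by alpha and, unless TRMMKERNEL is
   defined, accumulated into the four rows of C. */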

.macro SAVE4x4
  slwi    T10, LDC ,   1
  add     T1, CO, LDC
#if !defined(TRMMKERNEL)
  lxv        vs36, 0(CO)
  lxv        vs37, 0(T1)
#endif
  add     T2, CO, T10
  add     T3, T1, T10
#if !defined(TRMMKERNEL)
  lxv        vs38, 0(T2)
  lxv        vs39, 0(T3)
#endif

  xxmrglw  vs0, vs35,vs32
  xxmrglw  vs1, vs34,vs33
  xxmrglw  vs4, vs32,vs35
  xxmrglw  vs5, vs33,vs34

  xxmrghw  vs2, vs35,vs32
  xxmrghw  vs3, vs34,vs33
  xxmrghw  vs6, vs32,vs35
  xxmrghw  vs7, vs33,vs34

  xxmrgld  vs24, vs1, vs0
  xxmrghd  vs25,vs5,vs4

  xxmrgld  vs26, vs2, vs3
  xxmrghd  vs27,vs6,vs7

#if defined(TRMMKERNEL)
  xvmulsp        vs36, vs24, alpha_r
  xvmulsp        vs37, vs25, alpha_r
  xvmulsp        vs38, vs26, alpha_r
  xvmulsp        vs39, vs27, alpha_r
#else
  xvmaddasp        vs36, vs24, alpha_r
  xvmaddasp        vs37, vs25, alpha_r
  xvmaddasp        vs38, vs26, alpha_r
  xvmaddasp        vs39, vs27, alpha_r
#endif
  stxv        vs36, 0(CO)
  stxv        vs37, 0(T1)
  stxv        vs38, 0(T2)
  stxv        vs39, 0(T3)

  addi CO,CO,16
.endm

/**********************************************************************************************
* Macros for N=4 and M=2
**********************************************************************************************/

.macro KERNEL4x2_2   OffsetA,OffsetB, Index,IsLast
  KERNEL4x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast
.endm

.macro Zero4x2
    xxlxor      vs0,   vs0,   vs0
    xxlxor      vs2,   vs2,   vs2
.endm

.macro KERNEL4x2
  KERNEL4x2_1 AO,BO, 0, 0,0,0
.endm
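/* M=2: each step loads one A pair (lxsd) and a 4-wide B vector; the two A
   scalars are splatted across a vector (xxspltw) so vs0 and vs2 each
   accumulate the four N-results of one output row. */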
.macro KERNEL4x2_1 AREG,BREG,First,OffsetA,OffsetB,Index

    lxsd v4,   DISP2(\Index, 0+\OffsetA)(\AREG)
    lxv vs26,   DISP8(\Index, 0+\OffsetB)(\BREG)
    xxspltw   vs8,  vs36, 0
    xxspltw   vs9,  vs36, 1

.if \First==1
    xvmulsp      vs0,   vs26,   vs8
    xvmulsp      vs2,   vs26,   vs9
.else
    xvmaddasp      vs0,   vs26,   vs8
    xvmaddasp      vs2,   vs26,   vs9
.endif

    addi        \AREG, \AREG, DISP2(\Index,8)
    addi        \BREG, \BREG, DISP4(\Index,16)
.endm

.macro KERNEL4x2_I_2  AREG,BREG,First,OffsetA,OffsetB, Index,IsLast

    lxv vs4,    DISP4(\Index, 0+\OffsetA)(\AREG)
    lxv vs26,   DISP8(\Index, 0+\OffsetB)(\BREG)
    lxv vs28,   DISP8(\Index,16+\OffsetB)(\BREG)
    xxspltw   vs8,  vs4, 2
    xxspltw   vs9,  vs4, 3
    xxspltw   vs10, vs4, 0
    xxspltw   vs11, vs4, 1

.if \First==1
    xvmulsp      vs0,   vs26,   vs8
    xvmulsp      vs2,   vs26,   vs9

    xvmulsp      vs0,   vs28,   vs10
    xvmulsp      vs2,   vs28,   vs11
.else
    xvmaddasp      vs0,   vs26,   vs8
    xvmaddasp      vs2,   vs26,   vs9

    xvmaddasp      vs0,   vs28,   vs10
    xvmaddasp      vs2,   vs28,   vs11
.endif

.if \IsLast==1
    addi        \AREG, \AREG, DISP4(\Index,16)
    addi        \BREG, \BREG, DISP8(\Index,32)
.endif

.endm
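/* SAVE4x2: each result row holds one float per word, so every word is split
   out (xxspltw), converted to double precision (xscvspdp) and scaled with
   double-precision scalar FMAs before being stored back as single floats. */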
.macro SAVE4x2
  slwi    T10, LDC ,   1
  add     T1, CO, LDC
  add     T2, CO, T10
  add     T3, T1, T10
  /* convert alpha_r to double precision for the scalar multiplies */
  xscvspdp  vs4,alpha_r
/* remember that v0 aliases vs32 */
#if !defined(TRMMKERNEL)
  lxssp  v0,0(CO)
  lxssp  v1,4(CO)

  lxssp  v2,0(T1)
  lxssp  v3,4(T1)

  lxssp  v4,0(T2)
  lxssp  v5,4(T2)

  lxssp  v6,0(T3)
  lxssp  v7,4(T3)
#endif
  xscvspdp  vs5, vs2
  xxspltw   vs6, vs2, 1
  xxspltw   vs7, vs2, 2
  xxspltw   vs8, vs2, 3
  xscvspdp  vs6,vs6
  xscvspdp  vs7,vs7
  xscvspdp  vs8,vs8

  xscvspdp  vs24, vs0
  xxspltw   vs25, vs0, 1
  xxspltw   vs26, vs0, 2
  xxspltw   vs27, vs0, 3
  xscvspdp  vs25,vs25
  xscvspdp  vs26,vs26
  xscvspdp  vs27,vs27

#if defined(TRMMKERNEL)
  xsmuldp  vs32,vs8, vs4
  xsmuldp  vs33,vs27, vs4

  xsmuldp  vs34,vs7, vs4
  xsmuldp  vs35,vs26, vs4

  xsmuldp  vs36,vs6, vs4
  xsmuldp  vs37,vs25, vs4

  xsmuldp  vs38,vs5, vs4
  xsmuldp  vs39,vs24, vs4
#else
  xsmaddadp  vs32,vs8, vs4
  xsmaddadp  vs33,vs27, vs4

  xsmaddadp  vs34,vs7, vs4
  xsmaddadp  vs35,vs26, vs4

  xsmaddadp  vs36,vs6, vs4
  xsmaddadp  vs37,vs25, vs4

  xsmaddadp  vs38,vs5, vs4
  xsmaddadp  vs39,vs24, vs4
#endif

  stxssp  v0,0(CO)
  stxssp  v1,4(CO)

  stxssp  v2,0(T1)
  stxssp  v3,4(T1)

  stxssp  v4,0(T2)
  stxssp  v5,4(T2)

  stxssp  v6,0(T3)
  stxssp  v7,4(T3)

  addi CO,CO,8
.endm
/**********************************************************************************************
* Macros for N=4 and M=1
**********************************************************************************************/
.macro KERNEL4x1_4   OffsetA,OffsetB, Index,IsLast
  KERNEL4x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast
.endm

.macro Zero4x1
    xxlxor      vs0,   vs0,   vs0
.endm

.macro KERNEL4x1
  KERNEL4x1_1 AO,BO, 0
.endm

.macro KERNEL4x1_2
  KERNEL4x1_2_1 AO,BO, 0
.endm
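/* M=1: lxvwsx broadcasts the single A float to all four lanes in one load,
   so one xvmaddasp per step yields the four N-results in vs0. */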
.macro KERNEL4x1_1 AREG,BREG,First
    lxvwsx vs8,  0, \AREG
    lxv vs26,   0(\BREG)
.if \First==1
    xvmulsp      vs0,   vs26,   vs8
.else
    xvmaddasp      vs0,   vs26,   vs8
.endif
    addi        \AREG, \AREG,  4
    addi        \BREG, \BREG,  16
.endm

.macro KERNEL4x1_2_1 AREG,BREG,First
    lxsd v4,    0(\AREG)
    lxv vs26,   0(\BREG)
    lxv vs28,  16(\BREG)
    xxspltw   vs8,  vs36, 1
    xxspltw   vs9,  vs36, 0
.if \First==1
    xvmulsp      vs0,   vs26,   vs8
    xvmulsp      vs0,   vs28,   vs9
.else
    xvmaddasp      vs0,   vs26,   vs8
    xvmaddasp      vs0,   vs28,   vs9
.endif
    addi        \AREG, \AREG,  8
    addi        \BREG, \BREG,  32
.endm

.macro KERNEL4x1_I_4  AREG,BREG,First,OffsetA,OffsetB, Index,IsLast
    lxv vs4,    DISP4(\Index, 0+\OffsetA)(\AREG)
    xxspltw   vs8,  vs4, 3
    xxspltw   vs9,  vs4, 2
    xxspltw   vs10, vs4, 1
    xxspltw   vs11, vs4, 0
    lxv vs26,   DISP16(\Index, 0+\OffsetB)(\BREG)
    lxv vs28,   DISP16(\Index,16+\OffsetB)(\BREG)
    lxv vs30,   DISP16(\Index,32+\OffsetB)(\BREG)
    lxv vs32,   DISP16(\Index,48+\OffsetB)(\BREG)
.if \First==1
    xvmulsp      vs0,   vs26,   vs8
    xvmulsp      vs0,   vs28,   vs9
    xvmulsp      vs0,   vs30,   vs10
    xvmulsp      vs0,   vs32,   vs11
.else
    xvmaddasp      vs0,   vs26,   vs8
    xvmaddasp      vs0,   vs28,   vs9
    xvmaddasp      vs0,   vs30,   vs10
    xvmaddasp      vs0,   vs32,   vs11
.endif
.if \IsLast==1
    addi        \AREG, \AREG, DISP4(\Index,16)
    addi        \BREG, \BREG, DISP16(\Index,64)
.endif
.endm

.macro SAVE4x1
  slwi    T10, LDC ,   1
  add     T1, CO, LDC
  add     T2, CO, T10
  add     T3, T1, T10
  /* convert alpha_r to double precision for the scalar multiplies */
  xscvspdp  vs4,alpha_r
/* remember that v0 aliases vs32 */
#if !defined(TRMMKERNEL)
  lxssp  v0,0(CO)
  lxssp  v2,0(T1)
  lxssp  v4,0(T2)
  lxssp  v6,0(T3)
#endif
  xscvspdp  vs24, vs0
  xxspltw   vs25, vs0, 1
  xxspltw   vs26, vs0, 2
  xxspltw   vs27, vs0, 3
  xscvspdp  vs25,vs25
  xscvspdp  vs26,vs26
  xscvspdp  vs27,vs27

#if defined(TRMMKERNEL)
  xsmuldp  vs32,vs27, vs4
  xsmuldp  vs34,vs26, vs4
  xsmuldp  vs36,vs25, vs4
  xsmuldp  vs38,vs24, vs4
#else
  xsmaddadp  vs32,vs27, vs4
  xsmaddadp  vs34,vs26, vs4
  xsmaddadp  vs36,vs25, vs4
  xsmaddadp  vs38,vs24, vs4
#endif
  stxssp  v0,0(CO)
  stxssp  v2,0(T1)
  stxssp  v4,0(T2)
  stxssp  v6,0(T3)
  addi CO,CO,4
.endm
/****************************N=2 section*****************/

.macro KERNEL2x16_2   OffsetA,OffsetB, Index,IsLast
  KERNEL2x16_I_2 AO,BO,  \OffsetA,\OffsetB,\Index,\IsLast
.endm

.macro Zero2x16
    xxlxor      vs0,   vs0,   vs0
    xxlxor      vs1,   vs1,   vs1
    xxlxor      vs2,   vs2,   vs2
    xxlxor      vs3,   vs3,   vs3
    xxlxor      vs4,   vs4,   vs4
    xxlxor      vs5,   vs5,   vs5
    xxlxor      vs6,   vs6,   vs6
    xxlxor      vs7,   vs7,   vs7
.endm

.macro KERNEL2x16
  KERNEL2x16_1 AO,BO, 0, 0,0,0
.endm
.macro KERNEL2x16_4 OffsetA,OffsetB, Index,IsLast
  KERNEL2x16_I_4 AO,BO,  \OffsetA,\OffsetB,\Index,\IsLast
.endm
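/* N=2 M=16: the two B floats of a k-step are splatted (vs8/vs9) and
   multiplied against four 4-wide A vectors, so vs0..vs3 accumulate row 0
   and vs4..vs7 accumulate row 1 of the 2x16 tile. */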
.macro KERNEL2x16_1 AREG,BREG,First,OffsetA,OffsetB,Index

    lxsd v4,   DISP2(\Index, 0+\OffsetB)(\BREG)
    xxspltw   vs8,  vs36, 1
    xxspltw   vs9,  vs36, 0
    lxv vs26,   DISP16(\Index, 0+\OffsetA)(\AREG)
    lxv vs27,   DISP16(\Index,16+\OffsetA)(\AREG)
    lxv vs28,   DISP16(\Index,32+\OffsetA)(\AREG)
    lxv vs29,   DISP16(\Index,48+\OffsetA)(\AREG)

.if \First==1
    xvmulsp      vs0,   vs26,   vs8
    xvmulsp      vs1,   vs27,   vs8
    xvmulsp      vs2,   vs28,   vs8
    xvmulsp      vs3,   vs29,   vs8

    xvmulsp      vs4,   vs26,   vs9
    xvmulsp      vs5,   vs27,   vs9
    xvmulsp      vs6,   vs28,   vs9
    xvmulsp      vs7,   vs29,   vs9
.else
    xvmaddasp      vs0,   vs26,   vs8
    xvmaddasp      vs1,   vs27,   vs8
    xvmaddasp      vs2,   vs28,   vs8
    xvmaddasp      vs3,   vs29,   vs8

    xvmaddasp      vs4,   vs26,   vs9
    xvmaddasp      vs5,   vs27,   vs9
    xvmaddasp      vs6,   vs28,   vs9
    xvmaddasp      vs7,   vs29,   vs9
.endif

    addi        \BREG, \BREG, DISP2(\Index,8)
    addi        \AREG, \AREG, DISP16(\Index,64)
.endm

.macro KERNEL2x16_I_4  AREG,BREG, OffsetA,OffsetB, Index,IsLast

    lxv vs38,    DISP8(\Index, 0+\OffsetB)(\BREG)
    lxv vs39,    DISP8(\Index, 16+\OffsetB)(\BREG)

    lxv vs26,   DISP64(\Index, 0+\OffsetA)(\AREG)
    lxv vs27,   DISP64(\Index,16+\OffsetA)(\AREG)
    lxv vs28,   DISP64(\Index,32+\OffsetA)(\AREG)
    lxv vs29,   DISP64(\Index,48+\OffsetA)(\AREG)

    lxv vs16,   DISP64(\Index,64+ 0+\OffsetA)(\AREG)
    lxv vs17,   DISP64(\Index,64+ 16+\OffsetA)(\AREG)
    lxv vs18,   DISP64(\Index,64+ 32+\OffsetA)(\AREG)
    lxv vs19,   DISP64(\Index,64+ 48+\OffsetA)(\AREG)

    lxv vs30,   DISP64(\Index,128+ 0+\OffsetA)(\AREG)
    lxv vs31,   DISP64(\Index,128+ 16+\OffsetA)(\AREG)
    lxv vs32,   DISP64(\Index,128+ 32+\OffsetA)(\AREG)
    lxv vs33,   DISP64(\Index,128+ 48+\OffsetA)(\AREG)

    lxv vs34,   DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG)
    lxv vs35,   DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG)
    lxv vs36,   DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG)
    lxv vs37,   DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG)

    xxspltw   vs8,  vs38, 3
    xxspltw   vs9,  vs38, 2
    xxspltw   vs10, vs38, 1
    xxspltw   vs11, vs38, 0

    xxspltw   vs12, vs39, 3
    xxspltw   vs13, vs39, 2
    xxspltw   vs14, vs39, 1
    xxspltw   vs15, vs39, 0

    xvmaddasp      vs0,   vs26,   vs8
    xvmaddasp      vs1,   vs27,   vs8
    xvmaddasp      vs2,   vs28,   vs8
    xvmaddasp      vs3,   vs29,   vs8

    xvmaddasp      vs4,   vs26,   vs9
    xvmaddasp      vs5,   vs27,   vs9
    xvmaddasp      vs6,   vs28,   vs9
    xvmaddasp      vs7,   vs29,   vs9

    xvmaddasp      vs0,   vs16,   vs10
    xvmaddasp      vs1,   vs17,   vs10
    xvmaddasp      vs2,   vs18,   vs10
    xvmaddasp      vs3,   vs19,   vs10

    xvmaddasp      vs4,   vs16,   vs11
    xvmaddasp      vs5,   vs17,   vs11
    xvmaddasp      vs6,   vs18,   vs11
    xvmaddasp      vs7,   vs19,   vs11

    xvmaddasp      vs0,   vs30,   vs12
    xvmaddasp      vs1,   vs31,   vs12
    xvmaddasp      vs2,   vs32,   vs12
    xvmaddasp      vs3,   vs33,   vs12

    xvmaddasp      vs4,   vs30,   vs13
    xvmaddasp      vs5,   vs31,   vs13
    xvmaddasp      vs6,   vs32,   vs13
    xvmaddasp      vs7,   vs33,   vs13

    xvmaddasp      vs0,   vs34,   vs14
    xvmaddasp      vs1,   vs35,   vs14
    xvmaddasp      vs2,   vs36,   vs14
    xvmaddasp      vs3,   vs37,   vs14

    xvmaddasp      vs4,   vs34,   vs15
    xvmaddasp      vs5,   vs35,   vs15
    xvmaddasp      vs6,   vs36,   vs15
    xvmaddasp      vs7,   vs37,   vs15

.if \IsLast==1
    addi        \BREG, \BREG, DISP8(\Index,32)
    addi        \AREG, \AREG, DISP64(\Index,256)
.endif

.endm

.macro KERNEL2x16_I_2  AREG,BREG, OffsetA,OffsetB, Index,IsLast

    lxv vs36,    DISP4(\Index, 0+\OffsetB)(\BREG)
    xxspltw   vs8,  vs36, 3
    xxspltw   vs9,  vs36, 2
    xxspltw   vs10, vs36, 1
    xxspltw   vs11, vs36, 0
    lxv vs26,   DISP32(\Index, 0+\OffsetA)(\AREG)
    lxv vs27,   DISP32(\Index,16+\OffsetA)(\AREG)
    lxv vs28,   DISP32(\Index,32+\OffsetA)(\AREG)
    lxv vs29,   DISP32(\Index,48+\OffsetA)(\AREG)
    lxv vs16,   DISP32(\Index,64+ 0+\OffsetA)(\AREG)
    lxv vs17,   DISP32(\Index,64+ 16+\OffsetA)(\AREG)
    lxv vs18,   DISP32(\Index,64+ 32+\OffsetA)(\AREG)
    lxv vs19,   DISP32(\Index,64+ 48+\OffsetA)(\AREG)

    xvmaddasp      vs0,   vs26,   vs8
    xvmaddasp      vs1,   vs27,   vs8
    xvmaddasp      vs2,   vs28,   vs8
    xvmaddasp      vs3,   vs29,   vs8

    xvmaddasp      vs4,   vs26,   vs9
    xvmaddasp      vs5,   vs27,   vs9
    xvmaddasp      vs6,   vs28,   vs9
    xvmaddasp      vs7,   vs29,   vs9

    xvmaddasp      vs0,   vs16,   vs10
    xvmaddasp      vs1,   vs17,   vs10
    xvmaddasp      vs2,   vs18,   vs10
    xvmaddasp      vs3,   vs19,   vs10

    xvmaddasp      vs4,   vs16,   vs11
    xvmaddasp      vs5,   vs17,   vs11
    xvmaddasp      vs6,   vs18,   vs11
    xvmaddasp      vs7,   vs19,   vs11

.if \IsLast==1
    addi        \BREG, \BREG, DISP4(\Index,16)
    addi        \AREG, \AREG, DISP32(\Index,128)
.endif

.endm
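/* SAVE2x16: scale the eight accumulators by alpha (multiplying for TRMM,
   accumulating into the loaded C otherwise) and store the two 64-byte rows. */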
.macro SAVE2x16

#ifndef TRMMKERNEL
    lxv        vs16, 0(CO)
    lxv        vs17, 16(CO)
    lxv        vs18, 32(CO)
    lxv        vs19, 48(CO)
#endif
  add     T1, CO, LDC
#ifndef TRMMKERNEL
    lxv        vs26, 0(T1)
    lxv        vs27, 16(T1)
    lxv        vs28, 32(T1)
    lxv        vs29, 48(T1)
#endif

#if defined(TRMMKERNEL)
  xvmulsp        vs16, vs0, alpha_r
  xvmulsp        vs17, vs1, alpha_r
  xvmulsp        vs18, vs2, alpha_r
  xvmulsp        vs19, vs3, alpha_r
  xvmulsp        vs26, vs4, alpha_r
  xvmulsp        vs27, vs5, alpha_r
  xvmulsp        vs28, vs6, alpha_r
  xvmulsp        vs29, vs7, alpha_r
#else
  xvmaddasp        vs16, vs0, alpha_r
  xvmaddasp        vs17, vs1, alpha_r
  xvmaddasp        vs18, vs2, alpha_r
  xvmaddasp        vs19, vs3, alpha_r
  xvmaddasp        vs26, vs4, alpha_r
  xvmaddasp        vs27, vs5, alpha_r
  xvmaddasp        vs28, vs6, alpha_r
  xvmaddasp        vs29, vs7, alpha_r
#endif
    stxv        vs16, 0(CO)
    stxv        vs17, 16(CO)
    stxv        vs18, 32(CO)
    stxv        vs19, 48(CO)

    stxv        vs26, 0(T1)
    stxv        vs27, 16(T1)
    stxv        vs28, 32(T1)
    stxv        vs29, 48(T1)

  addi CO,CO,64

.endm
/*       M=8 N=2 */

.macro KERNEL2x8_2   OffsetA,OffsetB, Index,IsLast
  KERNEL2x8_I_2 AO,BO,  \OffsetA,\OffsetB,\Index,\IsLast
.endm

.macro Zero2x8
    xxlxor      vs0,   vs0,   vs0
    xxlxor      vs1,   vs1,   vs1

    xxlxor      vs4,   vs4,   vs4
    xxlxor      vs5,   vs5,   vs5
.endm

.macro KERNEL2x8
  KERNEL2x8_1 AO,BO, 0, 0,0,0
.endm
.macro KERNEL2x8_4 OffsetA,OffsetB, Index,IsLast
  KERNEL2x8_I_4 AO,BO,  \OffsetA,\OffsetB,\Index,\IsLast
.endm

.macro KERNEL2x8_1 AREG,BREG,First,OffsetA,OffsetB,Index

    lxsd v4,   DISP2(\Index, 0+\OffsetB)(\BREG)
    xxspltw   vs8,  vs36, 1
    xxspltw   vs9,  vs36, 0
    lxv vs26,   DISP8(\Index, 0+\OffsetA)(\AREG)
    lxv vs27,   DISP8(\Index,16+\OffsetA)(\AREG)

.if \First==1
    xvmulsp      vs0,   vs26,   vs8
    xvmulsp      vs1,   vs27,   vs8

    xvmulsp      vs4,   vs26,   vs9
    xvmulsp      vs5,   vs27,   vs9
.else
    xvmaddasp      vs0,   vs26,   vs8
    xvmaddasp      vs1,   vs27,   vs8

    xvmaddasp      vs4,   vs26,   vs9
    xvmaddasp      vs5,   vs27,   vs9
.endif

    addi        \BREG, \BREG, DISP2(\Index,8)
    addi        \AREG, \AREG, DISP8(\Index,32)
.endm

.macro KERNEL2x8_I_4  AREG,BREG, OffsetA,OffsetB, Index,IsLast

    lxv vs38,    DISP8(\Index, 0+\OffsetB)(\BREG)
    lxv vs39,    DISP8(\Index, 16+\OffsetB)(\BREG)

    lxv vs26,   DISP32(\Index, 0+\OffsetA)(\AREG)
    lxv vs27,   DISP32(\Index,16+\OffsetA)(\AREG)

    lxv vs16,   DISP32(\Index,32+ 0+\OffsetA)(\AREG)
    lxv vs17,   DISP32(\Index,32+ 16+\OffsetA)(\AREG)

    lxv vs30,   DISP32(\Index,64+ 0+\OffsetA)(\AREG)
    lxv vs31,   DISP32(\Index,64+ 16+\OffsetA)(\AREG)

    lxv vs34,   DISP32(\Index, 96+ 0+\OffsetA)(\AREG)
    lxv vs35,   DISP32(\Index, 96+ 16+\OffsetA)(\AREG)

    xxspltw   vs8,  vs38, 3
    xxspltw   vs9,  vs38, 2
    xxspltw   vs10, vs38, 1
    xxspltw   vs11, vs38, 0

    xxspltw   vs12, vs39, 3
    xxspltw   vs13, vs39, 2
    xxspltw   vs14, vs39, 1
    xxspltw   vs15, vs39, 0

    xvmaddasp      vs0,   vs26,   vs8
    xvmaddasp      vs1,   vs27,   vs8
    xvmaddasp      vs4,   vs26,   vs9
    xvmaddasp      vs5,   vs27,   vs9

    xvmaddasp      vs0,   vs16,   vs10
    xvmaddasp      vs1,   vs17,   vs10
    xvmaddasp      vs4,   vs16,   vs11
    xvmaddasp      vs5,   vs17,   vs11

    xvmaddasp      vs0,   vs30,   vs12
    xvmaddasp      vs1,   vs31,   vs12
    xvmaddasp      vs4,   vs30,   vs13
    xvmaddasp      vs5,   vs31,   vs13

    xvmaddasp      vs0,   vs34,   vs14
    xvmaddasp      vs1,   vs35,   vs14
    xvmaddasp      vs4,   vs34,   vs15
    xvmaddasp      vs5,   vs35,   vs15

.if \IsLast==1
    addi        \BREG, \BREG, DISP8(\Index,32)
    addi        \AREG, \AREG, DISP32(\Index,128)
.endif

.endm

.macro KERNEL2x8_I_2  AREG,BREG, OffsetA,OffsetB, Index,IsLast

    lxv vs36,    DISP4(\Index, 0+\OffsetB)(\BREG)
    xxspltw   vs8,  vs36, 3
    xxspltw   vs9,  vs36, 2
    xxspltw   vs10, vs36, 1
    xxspltw   vs11, vs36, 0
    lxv vs26,   DISP16(\Index, 0+\OffsetA)(\AREG)
    lxv vs27,   DISP16(\Index,16+\OffsetA)(\AREG)
    lxv vs16,   DISP16(\Index,32+\OffsetA)(\AREG)
    lxv vs17,   DISP16(\Index,48+\OffsetA)(\AREG)

    xvmaddasp      vs0,   vs26,   vs8
    xvmaddasp      vs1,   vs27,   vs8

    xvmaddasp      vs4,   vs26,   vs9
    xvmaddasp      vs5,   vs27,   vs9

    xvmaddasp      vs0,   vs16,   vs10
    xvmaddasp      vs1,   vs17,   vs10

    xvmaddasp      vs4,   vs16,   vs11
    xvmaddasp      vs5,   vs17,   vs11

.if \IsLast==1
    addi        \BREG, \BREG, DISP4(\Index,16)
    addi        \AREG, \AREG, DISP16(\Index,64)
.endif

.endm
.macro SAVE2x8

#ifndef TRMMKERNEL
    lxv        vs16, 0(CO)
    lxv        vs17, 16(CO)
#endif
  add     T1, CO, LDC
#ifndef TRMMKERNEL
    lxv        vs26, 0(T1)
    lxv        vs27, 16(T1)
#endif

#if defined(TRMMKERNEL)
  xvmulsp        vs16, vs0, alpha_r
  xvmulsp        vs17, vs1, alpha_r
  xvmulsp        vs26, vs4, alpha_r
  xvmulsp        vs27, vs5, alpha_r
#else
  xvmaddasp        vs16, vs0, alpha_r
  xvmaddasp        vs17, vs1, alpha_r
  xvmaddasp        vs26, vs4, alpha_r
  xvmaddasp        vs27, vs5, alpha_r
#endif

    stxv        vs16, 0(CO)
    stxv        vs17, 16(CO)

    stxv        vs26, 0(T1)
    stxv        vs27, 16(T1)

  addi CO,CO,32

.endm
/*M=4*/

.macro KERNEL2x4_2   OffsetA,OffsetB, Index,IsLast
  KERNEL2x4_I_2 AO,BO,  \OffsetA,\OffsetB,\Index,\IsLast
.endm

 /* we aggregate on save: vs0+vs4 and vs1+vs5 */
.macro Zero2x4
    xxlxor      vs0,   vs0,   vs0
    xxlxor      vs1,   vs1,   vs1

    xxlxor      vs4,   vs4,   vs4
    xxlxor      vs5,   vs5,   vs5
.endm

.macro KERNEL2x4
  KERNEL2x4_1 AO,BO, 0, 0,0,0
.endm
.macro KERNEL2x4_4 OffsetA,OffsetB, Index,IsLast
  KERNEL2x4_I_4 AO,BO,  \OffsetA,\OffsetB,\Index,\IsLast
.endm
.macro KERNEL2x4_1 AREG,BREG,First,OffsetA,OffsetB,Index

    lxsd v4,   DISP2(\Index, 0+\OffsetB)(\BREG)
    xxspltw   vs8,  vs36, 1
    xxspltw   vs9,  vs36, 0
    lxv vs26,   DISP4(\Index, 0+\OffsetA)(\AREG)

.if \First==1
    xvmulsp      vs0,   vs26,   vs8
    xvmulsp      vs1,   vs26,   vs9
.else
    xvmaddasp      vs0,   vs26,   vs8
    xvmaddasp      vs1,   vs26,   vs9
.endif

    addi        \BREG, \BREG, DISP2(\Index,8)
    addi        \AREG, \AREG, DISP4(\Index,16)
.endm

.macro KERNEL2x4_I_4  AREG,BREG, OffsetA,OffsetB, Index,IsLast

    lxv vs38,    DISP8(\Index, 0+\OffsetB)(\BREG)
    lxv vs39,    DISP8(\Index, 16+\OffsetB)(\BREG)

    lxv vs26,   DISP16(\Index, 0+\OffsetA)(\AREG)
    lxv vs16,   DISP16(\Index,16+\OffsetA)(\AREG)

    lxv vs30,   DISP16(\Index,32+ 0+\OffsetA)(\AREG)
    lxv vs34,   DISP16(\Index,32+ 16+\OffsetA)(\AREG)

    xxspltw   vs8,  vs38, 3
    xxspltw   vs9,  vs38, 2
    xxspltw   vs10, vs38, 1
    xxspltw   vs11, vs38, 0

    xxspltw   vs12, vs39, 3
    xxspltw   vs13, vs39, 2
    xxspltw   vs14, vs39, 1
    xxspltw   vs15, vs39, 0

    xvmaddasp      vs0,   vs26,   vs8
    xvmaddasp      vs1,   vs26,   vs9
    xvmaddasp      vs4,   vs16,   vs10
    xvmaddasp      vs5,   vs16,   vs11

    xvmaddasp      vs0,   vs30,   vs12
    xvmaddasp      vs1,   vs30,   vs13
    xvmaddasp      vs4,   vs34,   vs14
    xvmaddasp      vs5,   vs34,   vs15

.if \IsLast==1
    addi        \BREG, \BREG, DISP8(\Index,32)
    addi        \AREG, \AREG, DISP16(\Index,64)
.endif

.endm

.macro KERNEL2x4_I_2  AREG,BREG, OffsetA,OffsetB, Index,IsLast

    lxv vs36,    DISP4(\Index, 0+\OffsetB)(\BREG)
    xxspltw   vs8,  vs36, 3
    xxspltw   vs9,  vs36, 2
    xxspltw   vs10, vs36, 1
    xxspltw   vs11, vs36, 0
    lxv vs26,   DISP8(\Index, 0+\OffsetA)(\AREG)
    lxv vs16,   DISP8(\Index, 16+\OffsetA)(\AREG)

    xvmaddasp      vs0,   vs26,   vs8
    xvmaddasp      vs1,   vs26,   vs9
    xvmaddasp      vs4,   vs16,   vs10
    xvmaddasp      vs5,   vs16,   vs11

.if \IsLast==1
    addi        \BREG, \BREG, DISP4(\Index,16)
    addi        \AREG, \AREG, DISP8(\Index,32)
.endif

.endm
| 
 | |
| 
 | |
| .macro SAVE2x4
 | |
| 
 | |
| #ifndef TRMMKERNEL    
 | |
|     lxv        vs16, 0(CO)     
 | |
| #endif
 | |
|   add     T1, CO, LDC 
 | |
| #ifndef TRMMKERNEL    
 | |
|     lxv        vs26, 0(T1) 
 | |
|     
 | |
| #endif
 | |
|     /*aggregate vectors*/
 | |
|   xvaddsp         vs0,vs0,vs4
 | |
|   xvaddsp         vs1,vs1,vs5 
 | |
| #if defined(TRMMKERNEL)
 | |
|   xvmulsp        vs16, vs0, alpha_r 
 | |
|   xvmulsp        vs26, vs1, alpha_r 
 | |
| #else
 | |
|   xvmaddasp        vs16, vs0, alpha_r 
 | |
|   xvmaddasp        vs26, vs1, alpha_r 
 | |
| #endif
 | |
| 
 | |
|   stxv        vs16, 0(CO) 
 | |
|   stxv        vs26, 0(T1)  
 | |
| 
 | |
|   addi CO,CO,16
 | |
| 
 | |
| .endm
 | |
| 
 | |
| 
 | |
/* M=2 N=2: switch to an inner permute. permute_mask previously reversed the
   word order to 3,2,1,0; from here on it swaps within pairs, giving 1,0,3,2. */
.macro SWITCH_PERMUTE_INNER
    xxpermdi    permute_mask, permute_mask, permute_mask, 2
.endm

.macro Zero2x2
    xxlxor      vs0,   vs0,   vs0
    xxlxor      vs1,   vs1,   vs1
    SWITCH_PERMUTE_INNER
.endm

.macro KERNEL2x2
  KERNEL2x2_1 AO,BO, 0, 0,0,0
.endm
.macro KERNEL2x2_4 OffsetA,OffsetB, Index,IsLast
  KERNEL2x2_I_4 AO,BO,  \OffsetA,\OffsetB,\Index,\IsLast
.endm

.macro KERNEL2x2_2 OffsetA,OffsetB, Index,IsLast
  KERNEL2x2_I_2 AO,BO,  \OffsetA,\OffsetB,\Index,\IsLast
.endm
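/* KERNEL2x2_1: one A pair and one B pair per step; the pair-swapped copy of
   B (vs9) supplies the crossed products, so vs0 accumulates {a0b0,a1b1} and
   vs1 accumulates {a0b1,a1b0}; SAVE2x2 reorders them into rows. */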
.macro KERNEL2x2_1 AREG,BREG,First,OffsetA,OffsetB,Index

    lxsd v4,   DISP2(\Index, 0+\OffsetB)(\BREG)
    xxperm   vs9,  vs36, permute_mask
    lxsd v5,   DISP2(\Index, 0+\OffsetA)(\AREG)

.if \First==1
    xvmulsp      vs0,   vs37,   vs36
    xvmulsp      vs1,   vs37,   vs9
.else
    xvmaddasp      vs0,   vs37,   vs36
    xvmaddasp      vs1,   vs37,   vs9
.endif

    addi        \BREG, \BREG, DISP2(\Index,8)
    addi        \AREG, \AREG, DISP2(\Index,8)
.endm

.macro KERNEL2x2_I_4  AREG,BREG, OffsetA,OffsetB, Index,IsLast

    lxv vs8,    DISP8(\Index, 0+\OffsetB)(\BREG)
    lxv vs10,   DISP8(\Index, 16+\OffsetB)(\BREG)

    lxv vs26,   DISP8(\Index, 0+\OffsetA)(\AREG)
    lxv vs16,   DISP8(\Index,16+\OffsetA)(\AREG)

    xxperm   vs9,  vs8, permute_mask
    xxperm   vs11, vs10, permute_mask

    xvmaddasp      vs0,   vs26,   vs8
    xvmaddasp      vs1,   vs26,   vs9
    xvmaddasp      vs0,   vs16,   vs10
    xvmaddasp      vs1,   vs16,   vs11

.if \IsLast==1
    addi        \BREG, \BREG, DISP8(\Index,32)
    addi        \AREG, \AREG, DISP8(\Index,32)
.endif

.endm

.macro KERNEL2x2_I_2  AREG,BREG, OffsetA,OffsetB, Index,IsLast

    lxv vs8,    DISP4(\Index, 0+\OffsetB)(\BREG)
    lxv vs26,   DISP4(\Index, 0+\OffsetA)(\AREG)

    xxperm   vs9,  vs8, permute_mask

    xvmaddasp      vs0,   vs26,   vs8
    xvmaddasp      vs1,   vs26,   vs9

.if \IsLast==1
    addi        \BREG, \BREG, DISP4(\Index,16)
    addi        \AREG, \AREG, DISP4(\Index,16)
.endif
.endm
.macro SAVE2x2

#ifndef TRMMKERNEL
    lxsd v4   , 0(CO)
#endif
  add     T1, CO, LDC
#ifndef TRMMKERNEL
    lxsd v5   , 0(T1)
#endif
    /*aggregate vectors*/
  xxpermdi         vs4,vs0,vs0,2
  xxpermdi         vs5,vs1,vs1,2
  xvaddsp          vs0,vs0,vs4
  xvaddsp         vs1,vs1,vs5
  /* correct the order to {00,01} and {10,11} from {00,11} and {01,10} */
  xxperm    vs1,vs1, permute_mask

  xxmrghw   vs2 ,vs1,vs0
  xxpermdi         vs2,vs2,vs2,2
  xxmrghw   vs3 ,vs0,vs1
#if defined(TRMMKERNEL)
  xvmulsp        vs36, vs2, alpha_r
  xvmulsp        vs37, vs3, alpha_r
#else
  xvmaddasp        vs36, vs2, alpha_r
  xvmaddasp        vs37, vs3, alpha_r
#endif
  /* store the two result pairs */
  stxsd       v4, 0(CO)
  stxsd        v5, 0(T1)

  addi CO,CO,8

.endm
/*--------------------------- M=1 N=2 */
.macro Zero2x1
    xxlxor      vs0,   vs0,   vs0
    xxlxor      vs1,   vs1,   vs1
    xxlxor    vs2,vs2,vs2
    xxlxor    vs3,vs3,vs3
.endm

.macro KERNEL2x1
  KERNEL2x1_1 AO,BO, 0, 0,0,0
.endm
.macro KERNEL2x1_4 OffsetA,OffsetB, Index,IsLast
  KERNEL2x1_I_4 AO,BO,  \OffsetA,\OffsetB,\Index,\IsLast
.endm

.macro KERNEL2x1_2 OffsetA,OffsetB, Index,IsLast
  KERNEL2x1_I_2 AO,BO,  \OffsetA,\OffsetB,\Index,\IsLast
.endm
 /*
   calculate the single k-step alone, then add it to the batched results
 */
.macro KERNEL2x1_1 AREG,BREG,First,OffsetA,OffsetB,Index

    lxssp v3,   DISP2(\Index, 0+\OffsetB)(\BREG)
    lxssp v4,   DISP2(\Index, 4+\OffsetB)(\BREG)
    lxssp v5,   DISP1(\Index, 0+\OffsetA)(\AREG)

.if \First==1
    xvmulsp      vs2,   vs37,   vs35
    xvmulsp      vs3,   vs37,   vs36
.else
    xsmaddadp     vs2,   vs37,   vs35
    xsmaddadp      vs3,   vs37,   vs36
.endif

    addi        \BREG, \BREG, DISP2(\Index,8)
    addi        \AREG, \AREG, DISP1(\Index,4)
.endm

.macro KERNEL2x1_I_4  AREG,BREG, OffsetA,OffsetB, Index,IsLast

    lxv vs8,    DISP8(\Index, 0+\OffsetB)(\BREG)
    lxv vs10,   DISP8(\Index, 16+\OffsetB)(\BREG)

    lxv vs26,   DISP4(\Index, 0+\OffsetA)(\AREG)

    xxmrglw   vs5, vs26,vs26
    xxmrghw   vs6, vs26,vs26

    xvmaddasp      vs0,   vs8,   vs5
    xvmaddasp      vs1,   vs10,   vs6

.if \IsLast==1
    addi        \BREG, \BREG, DISP8(\Index,32)
    addi        \AREG, \AREG, DISP4(\Index,16)
.endif

.endm

.macro KERNEL2x1_I_2  AREG,BREG, OffsetA,OffsetB, Index,IsLast

    lxssp v3,   DISP4(\Index, 0+\OffsetB)(\BREG)
    lxssp v4,   DISP4(\Index, 4+\OffsetB)(\BREG)
    lxssp v7,   DISP4(\Index, 8+\OffsetB)(\BREG)
    lxssp v8,   DISP4(\Index, 12+\OffsetB)(\BREG)
    lxssp v5,   DISP2(\Index, 0+\OffsetA)(\AREG)
    lxssp v6,   DISP2(\Index, 4+\OffsetA)(\AREG)

    xsmaddadp      vs2,   vs37,   vs35
    xsmaddadp      vs3,   vs37,   vs36

    xsmaddadp      vs2,   vs38,   vs39
    xsmaddadp      vs3,   vs38,   vs40

    addi        \BREG, \BREG, DISP4(\Index,16)
    addi        \AREG, \AREG, DISP2(\Index,8)
.endm
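/* SAVE2x1: the vector accumulators from KERNEL2x1_I_4 are folded down,
   converted to double precision and added to the scalar accumulators vs2/vs3
   produced by the _1 and _I_2 paths, then scaled by the DP copy of alpha and
   stored as two single floats. */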
| 
 | |
| 
 | |
| .macro SAVE2x1
 | |
| 
 | |
| #ifndef TRMMKERNEL    
 | |
|     lxssp v4   , 0(CO)     
 | |
| #endif
 | |
|   add     T1, CO, LDC 
 | |
| #ifndef TRMMKERNEL    
 | |
|     lxssp v5   , 0(T1) 
 | |
|     
 | |
| #endif
 | |
| 
 | |
|   /*convert alpha_r for multiply*/
 | |
|   xscvspdp  vs16,alpha_r
 | |
| 
 | |
|  /*aggregate vectors 2x2_4   */ 
 | |
|       xxpermdi         vs4,vs0,vs0,2
 | |
|       xxpermdi         vs5,vs1,vs1,2  
 | |
|       xvaddsp          vs0,vs0,vs4
 | |
|       xvaddsp         vs1,vs1,vs5 
 | |
|       xvaddsp         vs0,vs0,vs1 
 | |
| /*aggregate vectors 2x1_2 and 2x1_1 into 2x2_4*/
 | |
|   xscvspdp  vs5, vs0
 | |
|   xxspltw   vs6, vs0, 1  
 | |
|   xscvspdp  vs6,vs6 
 | |
|   xsadddp  vs2,vs2,vs6
 | |
|   xsadddp  vs3,vs3,vs5  
 | |
| 
 | |
|   /**** store last two words*/
 | |
| #if defined(TRMMKERNEL) 
 | |
|   xsmuldp  vs36,vs2, vs16 
 | |
|   xsmuldp  vs37,vs3, vs16  
 | |
|  
 | |
| #else
 | |
|   xsmaddadp  vs36,vs2, vs16 
 | |
|   xsmaddadp  vs37,vs3, vs16 
 | |
| #endif  
 | |
| 
 | |
|   stxssp       v4, 0(CO) 
 | |
|   stxssp        v5, 0(T1)  
 | |
| 
 | |
|   addi CO,CO,4
 | |
| 
 | |
| .endm
 | |
| 
 | |
| 
 | |
| 
 | |
/****************************N=1 section*****************/

.macro KERNEL1x16_2   OffsetA,OffsetB, Index,IsLast
  KERNEL1x16_I_2 AO,BO,  \OffsetA,\OffsetB,\Index,\IsLast
.endm

.macro Zero1x16
    xxlxor      vs0,   vs0,   vs0
    xxlxor      vs1,   vs1,   vs1
    xxlxor      vs2,   vs2,   vs2
    xxlxor      vs3,   vs3,   vs3
.endm

.macro KERNEL1x16
  KERNEL1x16_1 AO,BO, 0, 0,0,0
.endm
.macro KERNEL1x16_4 OffsetA,OffsetB, Index,IsLast
  KERNEL1x16_I_4 AO,BO,  \OffsetA,\OffsetB,\Index,\IsLast
.endm
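/* N=1: a single B float is broadcast per k-step. lxssp loads it as a DP
   scalar, xscvdpspn converts it to SP in word 0, and xxspltw replicates it
   across the vector before the wide multiply-adds against A. */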
.macro KERNEL1x16_1 AREG,BREG,First,OffsetA,OffsetB,Index

    lxssp v4,   DISP1(\Index, 0+\OffsetB)(\BREG)
    xscvdpspn   vs36,vs36
    xxspltw     vs8,  vs36, 0
    lxv vs26,   DISP16(\Index, 0+\OffsetA)(\AREG)
    lxv vs27,   DISP16(\Index,16+\OffsetA)(\AREG)
    lxv vs28,   DISP16(\Index,32+\OffsetA)(\AREG)
    lxv vs29,   DISP16(\Index,48+\OffsetA)(\AREG)

.if \First==1
    xvmulsp      vs0,   vs26,   vs8
    xvmulsp      vs1,   vs27,   vs8
    xvmulsp      vs2,   vs28,   vs8
    xvmulsp      vs3,   vs29,   vs8
.else
    xvmaddasp      vs0,   vs26,   vs8
    xvmaddasp      vs1,   vs27,   vs8
    xvmaddasp      vs2,   vs28,   vs8
    xvmaddasp      vs3,   vs29,   vs8
.endif

    addi        \BREG, \BREG, DISP1(\Index,4)
    addi        \AREG, \AREG, DISP16(\Index,64)
.endm

.macro KERNEL1x16_I_4  AREG,BREG, OffsetA,OffsetB, Index,IsLast

    lxv vs38,    DISP4(\Index, 0+\OffsetB)(\BREG)

    lxv vs26,   DISP64(\Index, 0+\OffsetA)(\AREG)
    lxv vs27,   DISP64(\Index,16+\OffsetA)(\AREG)
    lxv vs28,   DISP64(\Index,32+\OffsetA)(\AREG)
    lxv vs29,   DISP64(\Index,48+\OffsetA)(\AREG)

    lxv vs16,   DISP64(\Index,64+ 0+\OffsetA)(\AREG)
    lxv vs17,   DISP64(\Index,64+ 16+\OffsetA)(\AREG)
    lxv vs18,   DISP64(\Index,64+ 32+\OffsetA)(\AREG)
    lxv vs19,   DISP64(\Index,64+ 48+\OffsetA)(\AREG)

    xxspltw   vs8,  vs38, 3
    xxspltw   vs9,  vs38, 2

    lxv vs30,   DISP64(\Index,128+ 0+\OffsetA)(\AREG)
    lxv vs31,   DISP64(\Index,128+ 16+\OffsetA)(\AREG)
    lxv vs32,   DISP64(\Index,128+ 32+\OffsetA)(\AREG)
    lxv vs33,   DISP64(\Index,128+ 48+\OffsetA)(\AREG)

    lxv vs34,   DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG)
    lxv vs35,   DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG)
    lxv vs36,   DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG)
    lxv vs37,   DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG)

    xxspltw   vs10, vs38, 1
    xxspltw   vs11, vs38, 0

    xvmaddasp      vs0,   vs26,   vs8
    xvmaddasp      vs1,   vs27,   vs8
    xvmaddasp      vs2,   vs28,   vs8
    xvmaddasp      vs3,   vs29,   vs8

    xvmaddasp      vs0,   vs16,   vs9
    xvmaddasp      vs1,   vs17,   vs9
    xvmaddasp      vs2,   vs18,   vs9
    xvmaddasp      vs3,   vs19,   vs9

    xvmaddasp      vs0,   vs30,   vs10
    xvmaddasp      vs1,   vs31,   vs10
    xvmaddasp      vs2,   vs32,   vs10
    xvmaddasp      vs3,   vs33,   vs10

    xvmaddasp      vs0,   vs34,   vs11
    xvmaddasp      vs1,   vs35,   vs11
    xvmaddasp      vs2,   vs36,   vs11
    xvmaddasp      vs3,   vs37,   vs11

.if \IsLast==1
    addi        \BREG, \BREG, DISP4(\Index,16)
    addi        \AREG, \AREG, DISP64(\Index,256)
.endif

.endm

.macro KERNEL1x16_I_2  AREG,BREG, OffsetA,OffsetB, Index,IsLast

    lxsd v4,    DISP2(\Index, 0+\OffsetB)(\BREG)
    xxspltw   vs8,  vs36, 1
    xxspltw   vs9,  vs36, 0
    lxv vs26,   DISP32(\Index, 0+\OffsetA)(\AREG)
    lxv vs27,   DISP32(\Index,16+\OffsetA)(\AREG)
    lxv vs28,   DISP32(\Index,32+\OffsetA)(\AREG)
    lxv vs29,   DISP32(\Index,48+\OffsetA)(\AREG)
    lxv vs16,   DISP32(\Index,64+ 0+\OffsetA)(\AREG)
    lxv vs17,   DISP32(\Index,64+ 16+\OffsetA)(\AREG)
    lxv vs18,   DISP32(\Index,64+ 32+\OffsetA)(\AREG)
    lxv vs19,   DISP32(\Index,64+ 48+\OffsetA)(\AREG)

    xvmaddasp      vs0,   vs26,   vs8
    xvmaddasp      vs1,   vs27,   vs8
    xvmaddasp      vs2,   vs28,   vs8
    xvmaddasp      vs3,   vs29,   vs8

    xvmaddasp      vs0,   vs16,   vs9
    xvmaddasp      vs1,   vs17,   vs9
    xvmaddasp      vs2,   vs18,   vs9
    xvmaddasp      vs3,   vs19,   vs9

.if \IsLast==1
    addi        \BREG, \BREG, DISP2(\Index,8)
    addi        \AREG, \AREG, DISP32(\Index,128)
.endif

.endm
.macro SAVE1x16

#ifndef TRMMKERNEL
    lxv        vs16, 0(CO)
    lxv        vs17, 16(CO)
    lxv        vs18, 32(CO)
    lxv        vs19, 48(CO)
#endif

#if defined(TRMMKERNEL)
  xvmulsp        vs16, vs0, alpha_r
  xvmulsp        vs17, vs1, alpha_r
  xvmulsp        vs18, vs2, alpha_r
  xvmulsp        vs19, vs3, alpha_r
#else
  xvmaddasp        vs16, vs0, alpha_r
  xvmaddasp        vs17, vs1, alpha_r
  xvmaddasp        vs18, vs2, alpha_r
  xvmaddasp        vs19, vs3, alpha_r
#endif
    stxv        vs16, 0(CO)
    stxv        vs17, 16(CO)
    stxv        vs18, 32(CO)
    stxv        vs19, 48(CO)

  addi CO,CO,64

.endm
/*       M=8 N=1 */

.macro KERNEL1x8_2   OffsetA,OffsetB, Index,IsLast
  KERNEL1x8_I_2 AO,BO,  \OffsetA,\OffsetB,\Index,\IsLast
.endm

.macro Zero1x8
    xxlxor      vs0,   vs0,   vs0
    xxlxor      vs1,   vs1,   vs1
    xxlxor      vs2,   vs2,   vs2
    xxlxor      vs3,   vs3,   vs3
.endm

.macro KERNEL1x8
  KERNEL1x8_1 AO,BO, 0, 0,0,0
.endm
.macro KERNEL1x8_4 OffsetA,OffsetB, Index,IsLast
  KERNEL1x8_I_4 AO,BO,  \OffsetA,\OffsetB,\Index,\IsLast
.endm

.macro KERNEL1x8_1 AREG,BREG,First,OffsetA,OffsetB,Index

    lxssp v4,   DISP1(\Index, 0+\OffsetB)(\BREG)
    xscvdpspn   vs36,vs36
    xxspltw     vs8,  vs36, 0
    lxv vs26,   DISP8(\Index, 0+\OffsetA)(\AREG)
    lxv vs27,   DISP8(\Index,16+\OffsetA)(\AREG)

.if \First==1
    xvmulsp      vs0,   vs26,   vs8
    xvmulsp      vs1,   vs27,   vs8
.else
    xvmaddasp      vs0,   vs26,   vs8
    xvmaddasp      vs1,   vs27,   vs8
.endif

    addi        \BREG, \BREG, DISP1(\Index,4)
    addi        \AREG, \AREG, DISP8(\Index,32)
.endm

.macro KERNEL1x8_I_4  AREG,BREG, OffsetA,OffsetB, Index,IsLast

    lxv vs38,    DISP4(\Index, 0+\OffsetB)(\BREG)

    lxv vs26,   DISP32(\Index, 0+\OffsetA)(\AREG)
    lxv vs27,   DISP32(\Index,16+\OffsetA)(\AREG)

    lxv vs16,   DISP32(\Index,32+ 0+\OffsetA)(\AREG)
    lxv vs17,   DISP32(\Index,32+ 16+\OffsetA)(\AREG)

    xxspltw   vs8,  vs38, 3
    xxspltw   vs9,  vs38, 2

    lxv vs30,   DISP32(\Index,64+ 0+\OffsetA)(\AREG)
    lxv vs31,   DISP32(\Index,64+ 16+\OffsetA)(\AREG)

    lxv vs34,   DISP32(\Index,64+ 32+ 0+\OffsetA)(\AREG)
    lxv vs35,   DISP32(\Index,64+ 32+ 16+\OffsetA)(\AREG)

    xxspltw   vs10, vs38, 1
    xxspltw   vs11, vs38, 0

    xvmaddasp      vs0,   vs26,   vs8
    xvmaddasp      vs1,   vs27,   vs8

    xvmaddasp      vs2,   vs16,   vs9
    xvmaddasp      vs3,   vs17,   vs9

    xvmaddasp      vs0,   vs30,   vs10
    xvmaddasp      vs1,   vs31,   vs10

    xvmaddasp      vs2,   vs34,   vs11
    xvmaddasp      vs3,   vs35,   vs11

.if \IsLast==1
    addi        \BREG, \BREG, DISP4(\Index,16)
    addi        \AREG, \AREG, DISP32(\Index,128)
.endif

.endm

.macro KERNEL1x8_I_2  AREG,BREG, OffsetA,OffsetB, Index,IsLast

    lxsd v4,    DISP2(\Index, 0+\OffsetB)(\BREG)
    xxspltw   vs8,  vs36, 1
    xxspltw   vs9,  vs36, 0
    lxv vs26,   DISP16(\Index, 0+\OffsetA)(\AREG)
    lxv vs27,   DISP16(\Index,16+\OffsetA)(\AREG)
    lxv vs16,   DISP16(\Index,32+ 0+\OffsetA)(\AREG)
    lxv vs17,   DISP16(\Index,32+ 16+\OffsetA)(\AREG)

    xvmaddasp      vs0,   vs26,   vs8
    xvmaddasp      vs1,   vs27,   vs8

    xvmaddasp      vs2,   vs16,   vs9
    xvmaddasp      vs3,   vs17,   vs9

.if \IsLast==1
    addi        \BREG, \BREG, DISP2(\Index,8)
    addi        \AREG, \AREG, DISP16(\Index,64)
.endif

.endm

.macro SAVE1x8

#ifndef TRMMKERNEL
    lxv        vs16, 0(CO)
    lxv        vs17, 16(CO)
#endif
   /* aggregate vs0 with vs2 and vs1 with vs3 */
  xvaddsp vs0,vs0,vs2
  xvaddsp  vs1,vs1,vs3
#if defined(TRMMKERNEL)
  xvmulsp        vs16, vs0, alpha_r
  xvmulsp        vs17, vs1, alpha_r
#else
  xvmaddasp        vs16, vs0, alpha_r
  xvmaddasp        vs17, vs1, alpha_r
#endif
    stxv        vs16, 0(CO)
    stxv        vs17, 16(CO)

  addi CO,CO,32

.endm
/*M=4*/

.macro KERNEL1x4_2   OffsetA,OffsetB, Index,IsLast
  KERNEL1x4_I_2 AO,BO,  \OffsetA,\OffsetB,\Index,\IsLast
.endm

.macro Zero1x4
    xxlxor      vs0,   vs0,   vs0
    xxlxor      vs1,   vs1,   vs1
    xxlxor      vs2,   vs2,   vs2
    xxlxor      vs3,   vs3,   vs3
.endm

.macro KERNEL1x4
  KERNEL1x4_1 AO,BO, 0, 0,0,0
.endm
.macro KERNEL1x4_4 OffsetA,OffsetB, Index,IsLast
  KERNEL1x4_I_4 AO,BO,  \OffsetA,\OffsetB,\Index,\IsLast
.endm

.macro KERNEL1x4_1 AREG,BREG,First,OffsetA,OffsetB,Index

    lxssp v4,   DISP1(\Index, 0+\OffsetB)(\BREG)
    xscvdpspn   vs36,vs36
    xxspltw     vs8,  vs36, 0
    lxv vs26,   DISP4(\Index, 0+\OffsetA)(\AREG)

.if \First==1
    xvmulsp      vs0,   vs26,   vs8
.else
    xvmaddasp      vs0,   vs26,   vs8
.endif

    addi        \BREG, \BREG, DISP1(\Index,4)
    addi        \AREG, \AREG, DISP4(\Index,16)
.endm

.macro KERNEL1x4_I_4  AREG,BREG, OffsetA,OffsetB, Index,IsLast

    lxv vs38,    DISP4(\Index, 0+\OffsetB)(\BREG)

    lxv vs26,   DISP16(\Index, 0+\OffsetA)(\AREG)
    lxv vs27,   DISP16(\Index,16+\OffsetA)(\AREG)

    xxspltw   vs8,  vs38, 3
    xxspltw   vs9,  vs38, 2

    lxv vs30,   DISP16(\Index,32+ 0+\OffsetA)(\AREG)
    lxv vs31,   DISP16(\Index,32+ 16+\OffsetA)(\AREG)

    xxspltw   vs10, vs38, 1
    xxspltw   vs11, vs38, 0

    xvmaddasp      vs0,   vs26,   vs8
    xvmaddasp      vs1,   vs27,   vs9
    xvmaddasp      vs2,   vs30,   vs10
    xvmaddasp      vs3,   vs31,   vs11

.if \IsLast==1
    addi        \BREG, \BREG, DISP4(\Index,16)
    addi        \AREG, \AREG, DISP16(\Index,64)
.endif

.endm

.macro KERNEL1x4_I_2  AREG,BREG, OffsetA,OffsetB, Index,IsLast

    lxsd v4,    DISP2(\Index, 0+\OffsetB)(\BREG)
    xxspltw   vs8,  vs36, 1
    xxspltw   vs9,  vs36, 0
    lxv vs26,   DISP8(\Index, 0+\OffsetA)(\AREG)
    lxv vs27,   DISP8(\Index,16+\OffsetA)(\AREG)

    xvmaddasp      vs0,   vs26,   vs8
    xvmaddasp      vs1,   vs27,   vs9

.if \IsLast==1
    addi        \BREG, \BREG, DISP2(\Index,8)
    addi        \AREG, \AREG, DISP8(\Index,32)
.endif

.endm

.macro SAVE1x4

#ifndef TRMMKERNEL
    lxv        vs16, 0(CO)
#endif
   /* aggregate the four partial accumulators */
  xvaddsp vs0,vs0,vs2
  xvaddsp  vs1,vs1,vs3
  xvaddsp  vs0,vs1,vs0
#if defined(TRMMKERNEL)
  xvmulsp        vs16, vs0, alpha_r
#else
  xvmaddasp        vs16, vs0, alpha_r
#endif
    stxv        vs16, 0(CO)

  addi CO,CO,16

.endm
/* M=2 N=1*/
.macro Zero1x2
    xxlxor      vs0,   vs0,   vs0
    xxlxor      vs1,   vs1,   vs1
    xxlxor    vs2,vs2,vs2
    xxlxor    vs3,vs3,vs3
.endm

.macro KERNEL1x2
  KERNEL1x2_1 AO,BO, 0, 0,0,0
.endm
.macro KERNEL1x2_4 OffsetA,OffsetB, Index,IsLast
  KERNEL1x2_I_4 AO,BO,  \OffsetA,\OffsetB,\Index,\IsLast
.endm

.macro KERNEL1x2_2 OffsetA,OffsetB, Index,IsLast
  KERNEL1x2_I_2 AO,BO,  \OffsetA,\OffsetB,\Index,\IsLast
.endm
 /*
   calculate the single k-step alone, then add it to the batched results
 */
.macro KERNEL1x2_1 AREG,BREG,First,OffsetA,OffsetB,Index

    lxssp v3,   DISP2(\Index, 0+\OffsetB)(\AREG)
    lxssp v4,   DISP2(\Index, 4+\OffsetB)(\AREG)
    lxssp v5,   DISP1(\Index, 0+\OffsetA)(\BREG)

.if \First==1
    xvmuldp      vs2,   vs37,   vs35
    xvmuldp      vs3,   vs37,   vs36
.else
    xsmaddadp     vs2,   vs37,   vs35
    xsmaddadp      vs3,   vs37,   vs36
.endif

    addi        \AREG, \AREG, DISP2(\Index,8)
    addi        \BREG, \BREG, DISP1(\Index,4)
.endm

.macro KERNEL1x2_I_4  AREG,BREG, OffsetA,OffsetB, Index,IsLast

    lxv vs8,    DISP8(\Index, 0+\OffsetB)(\AREG)
    lxv vs10,   DISP8(\Index, 16+\OffsetB)(\AREG)

    lxv vs26,   DISP4(\Index, 0+\OffsetA)(\BREG)

    xxmrglw   vs5, vs26,vs26
    xxmrghw   vs6, vs26,vs26

    xvmaddasp      vs0,   vs8,   vs5
    xvmaddasp      vs1,   vs10,   vs6

.if \IsLast==1
    addi        \AREG, \AREG, DISP8(\Index,32)
    addi        \BREG, \BREG, DISP4(\Index,16)
.endif

.endm

.macro KERNEL1x2_I_2  AREG,BREG, OffsetA,OffsetB, Index,IsLast

    lxssp v3,   DISP4(\Index, 0+\OffsetB)(\AREG)
    lxssp v4,   DISP4(\Index, 4+\OffsetB)(\AREG)
    lxssp v7,   DISP4(\Index, 8+\OffsetB)(\AREG)
    lxssp v8,   DISP4(\Index, 12+\OffsetB)(\AREG)
    lxssp v5,   DISP2(\Index, 0+\OffsetA)(\BREG)
    lxssp v6,   DISP2(\Index, 4+\OffsetA)(\BREG)

    xsmaddadp      vs2,   vs37,   vs35
    xsmaddadp      vs3,   vs37,   vs36

    xsmaddadp      vs2,   vs38,   vs39
    xsmaddadp      vs3,   vs38,   vs40

    addi        \AREG, \AREG, DISP4(\Index,16)
    addi        \BREG, \BREG, DISP2(\Index,8)
.endm
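/* SAVE1x2 mirrors SAVE2x1: fold the vector accumulators into the scalar
   pair vs2/vs3, scale with the DP copy of alpha, and store both floats in
   the same output row. */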

.macro SAVE1x2

#ifndef TRMMKERNEL
    lxssp v4   , 0(CO)
    lxssp v5   , 4(CO)
#endif

  /* convert alpha_r to double precision for the scalar multiplies */
  xscvspdp  vs16,alpha_r

 /* aggregate the 1x2_4 vector accumulators */
      xxpermdi         vs4,vs0,vs0,2
      xxpermdi         vs5,vs1,vs1,2
      xvaddsp          vs0,vs0,vs4
      xvaddsp         vs1,vs1,vs5
      xvaddsp         vs0,vs0,vs1
/* fold the 1x2_2 and 1x2_1 scalar results into the aggregate */
  xscvspdp  vs5, vs0
  xxspltw   vs6, vs0, 1
  xscvspdp  vs6,vs6
  xsadddp  vs2,vs2,vs6
  xsadddp  vs3,vs3,vs5

  /* scale and store the two result words */
#if defined(TRMMKERNEL)
  xsmuldp  vs36,vs2, vs16
  xsmuldp  vs37,vs3, vs16
#else
  xsmaddadp  vs36,vs2, vs16
  xsmaddadp  vs37,vs3, vs16
#endif

  stxssp       v4, 0(CO)
  stxssp        v5, 4(CO)

  addi CO,CO,8

.endm
/*///////////////// N=1 M=1 //////////////////*/
.macro Zero1x1
    xxlxor      vs0,   vs0,   vs0
    xxlxor      vs1,   vs1,   vs1
    xxlxor      vs2, vs2,vs2
    xxlxor      vs3,vs3,vs3
    xxlxor      vs4,vs4,vs4
.endm

.macro KERNEL1x1
  KERNEL1x1_1 AO,BO, 1, 0,0,0
.endm

.macro KERNEL1x1_16 OffsetA,OffsetB, Index,IsLast
  KERNEL1x1_I_16 AO,BO,  \OffsetA,\OffsetB,\Index,\IsLast
.endm

.macro KERNEL1x1_8 OffsetA,OffsetB, Index,IsLast
  KERNEL1x1_I_8 AO,BO,  \OffsetA,\OffsetB,\Index,\IsLast
.endm

.macro KERNEL1x1_4 OffsetA,OffsetB, Index,IsLast
  KERNEL1x1_I_4 AO,BO,  \OffsetA,\OffsetB,\Index,\IsLast
.endm

.macro KERNEL1x1_2 OffsetA,OffsetB, Index,IsLast
  KERNEL1x1_I_2 AO,BO,  \OffsetA,\OffsetB,\Index,\IsLast
.endm
 /*
   calculate the single k-step alone (First==1 initializes vs4 via the multiply)
 */
| .macro KERNEL1x1_1 AREG,BREG,First,OffsetA,OffsetB,Index
 | |
| 
 | |
| 
 | |
|     lxssp v3,   DISP1(\Index, 0+\OffsetB)(\AREG) 
 | |
|     lxssp v5,   DISP1(\Index, 0+\OffsetA)(\BREG)        
 | |
|  
 | |
|  
 | |
| .if \First==1
 | |
|     xvmuldp      vs4,   vs37,   vs35       
 | |
|      
 | |
| .else 
 | |
|     xsmaddadp     vs4,   vs37,   vs35 
 | |
|  .endif
 | |
|    
 | |
|     addi        \AREG, \AREG,  DISP1(\Index,4) 
 | |
|     addi        \BREG, \BREG, DISP1(\Index,4) 
 | |
|  
 | |
| .endm


.macro KERNEL1x1_I_16 AREG,BREG, OffsetA,OffsetB, Index,IsLast

    lxv vs8,    DISP16(\Index, 0+\OffsetB)(\AREG)
    lxv vs9,    DISP16(\Index, 16+\OffsetB)(\AREG)
    lxv vs10,   DISP16(\Index, 32+\OffsetB)(\AREG)
    lxv vs11,   DISP16(\Index, 48+\OffsetB)(\AREG)
    lxv vs26,   DISP16(\Index, 0+\OffsetA)(\BREG)
    lxv vs16,   DISP16(\Index, 16+\OffsetA)(\BREG)
    lxv vs17,   DISP16(\Index, 32+\OffsetA)(\BREG)
    lxv vs18,   DISP16(\Index, 48+\OffsetA)(\BREG)
    xvmaddasp      vs0,   vs8,   vs26
    xvmaddasp      vs1,   vs9,   vs16
    xvmaddasp      vs2,   vs10,  vs17
    xvmaddasp      vs3,   vs11,  vs18
.if \IsLast==1
    addi        \AREG, \AREG, DISP16(\Index,64)
    addi        \BREG, \BREG, DISP16(\Index,64)
.endif

.endm
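/*
   Hedged sketch: KERNEL1x1_I_16 retires 16 k-iterations per call using four
   independent 4-wide single-precision FMA accumulators (reduced later in
   SAVE1x1), roughly:
// for (i = 0; i < 16; i += 4)
//     acc[i/4] += ptrba[k+i .. k+i+3] * ptrbb[k+i .. k+i+3];
*/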

.macro KERNEL1x1_I_8  AREG,BREG, OffsetA,OffsetB, Index,IsLast

    lxv vs8,    DISP8(\Index, 0+\OffsetB)(\AREG)
    lxv vs9,    DISP8(\Index, 16+\OffsetB)(\AREG)
    lxv vs26,   DISP8(\Index, 0+\OffsetA)(\BREG)
    lxv vs16,   DISP8(\Index, 16+\OffsetA)(\BREG)
    xvmaddasp      vs0,   vs8,   vs26
    xvmaddasp      vs1,   vs9,   vs16

.if \IsLast==1
    addi        \AREG, \AREG, DISP8(\Index,32)
    addi        \BREG, \BREG, DISP8(\Index,32)
.endif

.endm


.macro KERNEL1x1_I_4  AREG,BREG, OffsetA,OffsetB, Index,IsLast

    lxv vs8,    DISP4(\Index, 0+\OffsetB)(\AREG)
    lxv vs26,   DISP4(\Index, 0+\OffsetA)(\BREG)

    xvmaddasp      vs0,   vs8,   vs26

.if \IsLast==1
    addi        \AREG, \AREG, DISP4(\Index,16)
    addi        \BREG, \BREG, DISP4(\Index,16)
.endif

.endm

.macro KERNEL1x1_I_2  AREG,BREG, OffsetA,OffsetB, Index,IsLast

    lxsd v4,    DISP2(\Index, 0+\OffsetB)(\AREG)
    lxsd v5,    DISP2(\Index, 0+\OffsetA)(\BREG)

    xvmaddasp      vs0,   vs36,   vs37

    addi        \AREG, \AREG, DISP2(\Index,8)
    addi        \BREG, \BREG, DISP2(\Index,8)
.endm


.macro SAVE1x1

#ifndef TRMMKERNEL
    lxssp v4,   0(CO)
#endif

  /* convert alpha_r for the scalar multiply */
  xscvspdp  vs16,alpha_r

  /* reduce the four vector accumulators to one */
    xvaddsp         vs0,vs0,vs1
    xvaddsp         vs2,vs2,vs3
    xvaddsp         vs0,vs0,vs2

    xxpermdi        vs7,vs0,vs0,2
    xvaddsp         vs0,vs0,vs7
  /* fold the KERNEL1x1_2/KERNEL1x1_1 scalar tail (vs4) into the sum */
  xscvspdp  vs5, vs0
  xxspltw   vs6, vs0, 1
  xscvspdp  vs6,vs6
  xsadddp   vs7,vs5,vs6
  xsadddp   vs4,vs4,vs7

  /* scale by alpha (and accumulate into C unless TRMM), then store one word */
#if defined(TRMMKERNEL)
  xsmuldp  vs36,vs4, vs16
#else
  xsmaddadp  vs36,vs4, vs16
#endif

  stxssp       v4, 0(CO)

  addi CO,CO,4

.endm
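/*
   Hedged sketch of the SAVE1x1 epilogue (TRMMKERNEL selects plain scaling
   since C is not read back in that case):
// res  = horizontal_sum(acc0..acc3) + scalar_tail;   // ends up in vs4
// C[0] = alpha * res + (TRMM ? 0 : C[0]);
*/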



/**************************** TRMM POINTER REFRESH MACROS ****************************/

.macro SHIFT_REG  REG1,REG2,SHIFT_VAL
    .if \SHIFT_VAL==16
        slwi    \REG1,  \REG2,  6
    .elseif \SHIFT_VAL==8
        slwi    \REG1,  \REG2,  5
    .elseif \SHIFT_VAL==4
        slwi    \REG1,  \REG2,  4
    .elseif \SHIFT_VAL==2
        slwi    \REG1,  \REG2,  3
    .elseif \SHIFT_VAL==1
        slwi    \REG1,  \REG2,  2
    .endif
.endm
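/*
   Why these shift counts: with unit_size = 4 bytes, SHIFT_VAL elements span
   SHIFT_VAL*4 bytes, i.e. a left shift by log2(SHIFT_VAL)+2.  Illustrative
   use (the OFF operand name here is hypothetical):
// SHIFT_REG T4, OFF, 16    ->   T4 = OFF << 6 = OFF * 16 elements * 4 bytes
*/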

/*
// #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
// 		ptrbb = bb;
// #else
// 		ptrba += off*16;
// 		ptrbb  = bb + off*2;
// #endif
*/
.macro REFRESH_POINTERS  PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
    /* ptrbb = bb; */
    mr      \PTR_B, \B_VAL              /* refresh BPOINT */
#else
    /*
    // ptrba = ptrba + off*C_A;
    // ptrbb = bb    + off*C_B;
    */
    SHIFT_REG   T4, \OFF_VAL, \C_B      /* T4 = off * C_B * unit_size (byte offset into B) */
    SHIFT_REG   T2, \OFF_VAL, \C_A      /* T2 = off * C_A * unit_size (byte offset into A) */
    add     \PTR_B, \B_VAL, T4          /* advance BO */
    add     \PTR_A, \PTR_A, T2          /* advance AO */
#endif
.endm
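/*
   Hedged usage example for the 1x1 tile (operand names are assumed; AO/BO
   are the A and B walk pointers used throughout this file):
// REFRESH_POINTERS AO, BO, OFF, B, 1, 1
   i.e. ptrba += off*1 and ptrbb = bb + off*1, scaled to bytes via SHIFT_REG.
*/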


/*
// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
// 		temp = bk - off;
// #elif defined(LEFT)
// 		temp = off + 16;	// number of values in A
// #else
// 		temp = off + 2;	// number of values in B
// #endif
*/
.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
    /* temp = bk - off; */
    sub     \TEMP_BK, \BK_VAL, \OFF_VAL
#elif defined(LEFT)
    /* temp = off + INCR_A;	// number of values in A */
    addi    \TEMP_BK, \OFF_VAL, \INCR_A
#else
    /* temp = off + INCR_B;	// number of values in B */
    addi    \TEMP_BK, \OFF_VAL, \INCR_B
#endif

.endm
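/*
   Hedged example (all operand names assumed): for the 1x1 tile,
// REFRESH_TEMP_BK TEMP, K, OFF, 1, 1
   yields temp = bk-off, off+1, or off+1 depending on LEFT/TRANSA, matching
   the reference C above.
*/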
/*
// #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
// 		temp = bk - off;
// #ifdef LEFT
// 		temp -= 16; // number of values in A
// #else
// 		temp -= 2;  // number of values in B
// #endif
// 		ptrba += temp*16;
// 		ptrbb += temp*2;
// #endif

// #ifdef LEFT
// 		off += 16; // number of values in A
// #endif
*/

.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
    /* temp = bk - off; */
    sub     \TEMP_BK, \BK_VAL, \OFF_VAL
#ifdef LEFT
    /* temp -= C_A; // number of values in A */
    addi    \TEMP_BK, \TEMP_BK, -\C_A
#else
    /* temp -= C_B; // number of values in B */
    addi    \TEMP_BK, \TEMP_BK, -\C_B
#endif
    /*
    // ptrba += temp*C_A;
    // ptrbb += temp*C_B;
    */
    SHIFT_REG   T4, \TEMP_BK, \C_A
    SHIFT_REG   T2, \TEMP_BK, \C_B
    add     \PTR_A, \PTR_A, T4          /* ptrba += temp*C_A */
    add     \PTR_B, \PTR_B, T2          /* ptrbb += temp*C_B */

#endif

#ifdef LEFT
    /* off += C_A; // number of values in A */
    addi    \OFF_VAL, \OFF_VAL, \C_A
#endif
.endm
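/*
   Hedged example (operand names assumed): after storing a 1x1 tile,
// REFRESH_AFTER_SAVE TEMP, K, OFF, BO, AO, 1, 1
   advances AO/BO past the remaining temp*1 elements (scaled to bytes by
   SHIFT_REG) and, for LEFT, bumps off by the tile height, as in the
   reference C above.
*/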