Replace two vector loads with one vector pair load and fix endianness of stores.
This commit is contained in:
		
							parent
							
								
									46440a0486
								
							
						
					
					
						commit
						4e738e561a
					
				| 
						 | 
				
			
			@ -108,6 +108,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
			
		|||
 | 
			
		||||
#define o0	0
 | 
			
		||||
 | 
			
		||||
#ifdef POWER10
 | 
			
		||||
#include "sgemm_tcopy_macros_16_power10.S"
 | 
			
		||||
#endif
 | 
			
		||||
#include "sgemm_tcopy_macros_16_power8.S"
 | 
			
		||||
 | 
			
		||||
#define STACKSIZE 144
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -0,0 +1,323 @@
 | 
			
		|||
/***************************************************************************
 | 
			
		||||
Copyright (c) 2013-2016, The OpenBLAS Project
 | 
			
		||||
All rights reserved.
 | 
			
		||||
Redistribution and use in source and binary forms, with or without
 | 
			
		||||
modification, are permitted provided that the following conditions are
 | 
			
		||||
met:
 | 
			
		||||
1. Redistributions of source code must retain the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer.
 | 
			
		||||
2. Redistributions in binary form must reproduce the above copyright
 | 
			
		||||
notice, this list of conditions and the following disclaimer in
 | 
			
		||||
the documentation and/or other materials provided with the
 | 
			
		||||
distribution.
 | 
			
		||||
3. Neither the name of the OpenBLAS project nor the names of
 | 
			
		||||
its contributors may be used to endorse or promote products
 | 
			
		||||
derived from this software without specific prior written permission.
 | 
			
		||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 | 
			
		||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | 
			
		||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 | 
			
		||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 | 
			
		||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 | 
			
		||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 | 
			
		||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | 
			
		||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 | 
			
		||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 | 
			
		||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
			
		||||
*****************************************************************************/
 | 
			
		||||
 | 
			
		||||
/**************************************************************************************
 | 
			
		||||
* 2016/04/21 Werner Saar (wernsaar@googlemail.com)
 | 
			
		||||
* 	 BLASTEST 		: OK
 | 
			
		||||
* 	 CTEST			: OK
 | 
			
		||||
* 	 TEST			: OK
 | 
			
		||||
*	 LAPACK-TEST		: OK
 | 
			
		||||
**************************************************************************************/
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/**********************************************************************************************
 | 
			
		||||
* Macros for N=4 and M=16
 | 
			
		||||
**********************************************************************************************/
 | 
			
		||||
 | 
			
		||||
/*
 * COPY_4x16
 *
 * Loads 64 bytes from each of A0, A1, A2 and A3 (two 32-byte vector-pair
 * loads per pointer) and stores them as 256 contiguous bytes starting at
 * BO, in the order A0, A1, A2, A3.
 *
 * Clobbers: T1 (working copy of BO, advanced by 64 after each of the
 *           first three rows) and vs32-vs47.
 * A0-A3 and BO are read but not written by this macro.
 *
 * Endianness: on little-endian builds the two registers of every pair are
 * stored in swapped order (odd register first) so the bytes reach memory
 * in the order they were loaded; see the lxvpx register-pair ordering in
 * Power ISA 3.1.
 *
 * NOTE(review): o0/o16/o32/o48 are not defined in this file; presumably
 * they are index registers holding 0/16/32/48 set up by the including
 * kernel -- confirm against the includer.
 */
#if defined(_AIX)
define(`COPY_4x16', `
#else
.macro COPY_4x16
#endif

	/* Row A0 -> vs32..vs35 */
	lxvpx		vs32,	o0,	A0
	lxvpx		vs34,	o32,	A0

	/* Row A1 -> vs36..vs39 */
	lxvpx		vs36,	o0,	A1
	lxvpx		vs38,	o32,	A1

	/* Row A2 -> vs40..vs43 */
	lxvpx		vs40,	o0,	A2
	lxvpx		vs42,	o32,	A2

	/* Row A3 -> vs44..vs47 */
	lxvpx		vs44,	o0,	A3
	lxvpx		vs46,	o32,	A3

	/* T1 walks the destination so that BO itself stays untouched. */
	mr		T1,	BO

	/* Row A0 -> bytes 0..63 of the destination */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	stxvx		vs32,	o0,	T1
	stxvx		vs33,	o16,	T1
	stxvx		vs34,	o32,	T1
	stxvx		vs35,	o48,	T1
#else
	stxvx		vs33,	o0,	T1
	stxvx		vs32,	o16,	T1
	stxvx		vs35,	o32,	T1
	stxvx		vs34,	o48,	T1
#endif

	addi		T1,	T1,	64

	/* Row A1 -> bytes 64..127 */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	stxvx		vs36,	o0,	T1
	stxvx		vs37,	o16,	T1
	stxvx		vs38,	o32,	T1
	stxvx		vs39,	o48,	T1
#else
	stxvx		vs37,	o0,	T1
	stxvx		vs36,	o16,	T1
	stxvx		vs39,	o32,	T1
	stxvx		vs38,	o48,	T1
#endif

	addi		T1,	T1,	64

	/* Row A2 -> bytes 128..191 */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	stxvx		vs40,	o0,	T1
	stxvx		vs41,	o16,	T1
	stxvx		vs42,	o32,	T1
	stxvx		vs43,	o48,	T1
#else
	stxvx		vs41,	o0,	T1
	stxvx		vs40,	o16,	T1
	stxvx		vs43,	o32,	T1
	stxvx		vs42,	o48,	T1
#endif

	addi		T1,	T1,	64

	/* Row A3 -> bytes 192..255 */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	stxvx		vs44,	o0,	T1
	stxvx		vs45,	o16,	T1
	stxvx		vs46,	o32,	T1
	stxvx		vs47,	o48,	T1
#else
	stxvx		vs45,	o0,	T1
	stxvx		vs44,	o16,	T1
	stxvx		vs47,	o32,	T1
	stxvx		vs46,	o48,	T1
#endif

#if defined(_AIX)
')
#else
.endm
#endif
 | 
			
		||||
 | 
			
		||||
/**********************************************************************************************
 | 
			
		||||
* Macros for N=4 and M=8
 | 
			
		||||
**********************************************************************************************/
 | 
			
		||||
 | 
			
		||||
/*
 * COPY_4x8
 *
 * Loads 32 bytes from each of A0, A1, A2 and A3 (one vector-pair load per
 * pointer) and stores them as 128 contiguous bytes starting at BO, in the
 * order A0, A1, A2, A3.
 *
 * Clobbers: T1 (working copy of BO, advanced by 64 once) and vs32-vs39.
 * A0-A3 and BO are read but not written by this macro.
 *
 * Endianness: on little-endian builds the two registers of every pair are
 * stored in swapped order (odd register first) so the bytes reach memory
 * in the order they were loaded; see the lxvpx register-pair ordering in
 * Power ISA 3.1.
 *
 * NOTE(review): o0/o16/o32/o48 are not defined in this file; presumably
 * they are index registers holding 0/16/32/48 -- confirm against the
 * including kernel.
 */
#if defined(_AIX)
define(`COPY_4x8', `
#else
.macro COPY_4x8
#endif

	/* One 32-byte pair per source row. */
	lxvpx		vs32,	o0,	A0
	lxvpx		vs34,	o0,	A1
	lxvpx		vs36,	o0,	A2
	lxvpx		vs38,	o0,	A3

	/* T1 walks the destination so that BO itself stays untouched. */
	mr		T1,	BO

	/* Rows A0 and A1 -> bytes 0..31 and 32..63 */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	stxvx		vs32,	o0,	T1
	stxvx		vs33,	o16,	T1

	stxvx		vs34,	o32,	T1
	stxvx		vs35,	o48,	T1
#else
	stxvx		vs33,	o0,	T1
	stxvx		vs32,	o16,	T1

	stxvx		vs35,	o32,	T1
	stxvx		vs34,	o48,	T1
#endif

	addi		T1,	T1,	64

	/* Rows A2 and A3 -> bytes 64..95 and 96..127 */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	stxvx		vs36,	o0,	T1
	stxvx		vs37,	o16,	T1

	stxvx		vs38,	o32,	T1
	stxvx		vs39,	o48,	T1
#else
	stxvx		vs37,	o0,	T1
	stxvx		vs36,	o16,	T1

	stxvx		vs39,	o32,	T1
	stxvx		vs38,	o48,	T1
#endif

#if defined(_AIX)
')
#else
.endm
#endif
 | 
			
		||||
 | 
			
		||||
/**********************************************************************************************
 | 
			
		||||
* Macros for N=2 and M=16
 | 
			
		||||
**********************************************************************************************/
 | 
			
		||||
 | 
			
		||||
/*
 * COPY_2x16
 *
 * Loads 64 bytes from each of A0 and A1 (two vector-pair loads per
 * pointer) and stores them as 128 contiguous bytes starting at BO, A0
 * first and A1 second.
 *
 * Clobbers: T1 (working copy of BO, advanced by 64 once) and vs32-vs39.
 * A0, A1 and BO are read but not written by this macro.
 *
 * Endianness: on little-endian builds the two registers of every pair are
 * stored in swapped order (odd register first) so the bytes reach memory
 * in the order they were loaded; see the lxvpx register-pair ordering in
 * Power ISA 3.1.
 *
 * NOTE(review): o0/o16/o32/o48 are not defined in this file; presumably
 * they are index registers holding 0/16/32/48 -- confirm against the
 * including kernel.
 */
#if defined(_AIX)
define(`COPY_2x16', `
#else
.macro COPY_2x16
#endif

	/* Row A0 -> vs32..vs35 */
	lxvpx		vs32,	o0,	A0
	lxvpx		vs34,	o32,	A0

	/* Row A1 -> vs36..vs39 */
	lxvpx		vs36,	o0,	A1
	lxvpx		vs38,	o32,	A1

	/* T1 walks the destination so that BO itself stays untouched. */
	mr		T1,	BO

	/* Row A0 -> bytes 0..63 of the destination */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	stxvx		vs32,	o0,	T1
	stxvx		vs33,	o16,	T1
	stxvx		vs34,	o32,	T1
	stxvx		vs35,	o48,	T1
#else
	stxvx		vs33,	o0,	T1
	stxvx		vs32,	o16,	T1
	stxvx		vs35,	o32,	T1
	stxvx		vs34,	o48,	T1
#endif

	addi		T1,	T1,	64

	/* Row A1 -> bytes 64..127 */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	stxvx		vs36,	o0,	T1
	stxvx		vs37,	o16,	T1
	stxvx		vs38,	o32,	T1
	stxvx		vs39,	o48,	T1
#else
	stxvx		vs37,	o0,	T1
	stxvx		vs36,	o16,	T1
	stxvx		vs39,	o32,	T1
	stxvx		vs38,	o48,	T1
#endif

#if defined(_AIX)
')
#else
.endm
#endif
 | 
			
		||||
 | 
			
		||||
/**********************************************************************************************
 | 
			
		||||
* Macros for N=2 and M=8
 | 
			
		||||
**********************************************************************************************/
 | 
			
		||||
 | 
			
		||||
/*
 * COPY_2x8
 *
 * Loads 32 bytes from each of A0 and A1 (one vector-pair load per
 * pointer) and stores them as 64 contiguous bytes starting at BO, A0
 * first and A1 second.
 *
 * Clobbers: T1 (set to BO, not advanced) and vs32-vs35.
 * A0, A1 and BO are read but not written by this macro.
 *
 * Endianness: on little-endian builds the two registers of every pair are
 * stored in swapped order (odd register first) so the bytes reach memory
 * in the order they were loaded; see the lxvpx register-pair ordering in
 * Power ISA 3.1.
 *
 * NOTE(review): o0/o16/o32/o48 are not defined in this file; presumably
 * they are index registers holding 0/16/32/48 -- confirm against the
 * including kernel.
 */
#if defined(_AIX)
define(`COPY_2x8', `
#else
.macro COPY_2x8
#endif

	/* One 32-byte pair per source row. */
	lxvpx		vs32,	o0,	A0
	lxvpx		vs34,	o0,	A1

	/* All stores are addressed through T1; BO itself stays untouched. */
	mr		T1,	BO

	/* Row A0 -> bytes 0..31, row A1 -> bytes 32..63 */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	stxvx		vs32,	o0,	T1
	stxvx		vs33,	o16,	T1

	stxvx		vs34,	o32,	T1
	stxvx		vs35,	o48,	T1
#else
	stxvx		vs33,	o0,	T1
	stxvx		vs32,	o16,	T1

	stxvx		vs35,	o32,	T1
	stxvx		vs34,	o48,	T1
#endif

#if defined(_AIX)
')
#else
.endm
#endif
 | 
			
		||||
 | 
			
		||||
/**********************************************************************************************
 | 
			
		||||
* Macros for N=1 and M=16
 | 
			
		||||
**********************************************************************************************/
 | 
			
		||||
 | 
			
		||||
/*
 * COPY_1x16
 *
 * Loads 64 bytes from A0 (two vector-pair loads) and stores them as 64
 * contiguous bytes starting at BO.
 *
 * Clobbers: T1 (set to BO, not advanced) and vs32-vs35.
 * A0 and BO are read but not written by this macro.
 *
 * Endianness: on little-endian builds the two registers of every pair are
 * stored in swapped order (odd register first) so the bytes reach memory
 * in the order they were loaded; see the lxvpx register-pair ordering in
 * Power ISA 3.1.
 *
 * NOTE(review): o0/o16/o32/o48 are not defined in this file; presumably
 * they are index registers holding 0/16/32/48 -- confirm against the
 * including kernel.
 */
#if defined(_AIX)
define(`COPY_1x16', `
#else
.macro COPY_1x16
#endif

	/* Source bytes 0..31 -> vs32/vs33, bytes 32..63 -> vs34/vs35 */
	lxvpx		vs32,	o0,	A0
	lxvpx		vs34,	o32,	A0

	/* All stores are addressed through T1; BO itself stays untouched. */
	mr		T1,	BO

#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	stxvx		vs32,	o0,	T1
	stxvx		vs33,	o16,	T1
	stxvx		vs34,	o32,	T1
	stxvx		vs35,	o48,	T1
#else
	stxvx		vs33,	o0,	T1
	stxvx		vs32,	o16,	T1
	stxvx		vs35,	o32,	T1
	stxvx		vs34,	o48,	T1
#endif

#if defined(_AIX)
')
#else
.endm
#endif
 | 
			
		||||
 | 
			
		||||
/**********************************************************************************************
 | 
			
		||||
* Macros for N=1 and M=8
 | 
			
		||||
**********************************************************************************************/
 | 
			
		||||
 | 
			
		||||
/*
 * COPY_1x8
 *
 * Loads 32 bytes from A0 (one vector-pair load) and stores them as 32
 * contiguous bytes starting at BO.
 *
 * Clobbers: T1 (set to BO, not advanced) and vs32-vs33.
 * A0 and BO are read but not written by this macro.
 *
 * Endianness: on little-endian builds the two registers of the pair are
 * stored in swapped order (odd register first) so the bytes reach memory
 * in the order they were loaded; see the lxvpx register-pair ordering in
 * Power ISA 3.1.
 *
 * NOTE(review): o0/o16 are not defined in this file; presumably they are
 * index registers holding 0/16 -- confirm against the including kernel.
 */
#if defined(_AIX)
define(`COPY_1x8', `
#else
.macro COPY_1x8
#endif

	lxvpx		vs32,	o0,	A0

	/* Both stores are addressed through T1; BO itself stays untouched. */
	mr		T1,	BO

#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	stxvx		vs32,	o0,	T1
	stxvx		vs33,	o16,	T1
#else
	stxvx		vs33,	o0,	T1
	stxvx		vs32,	o16,	T1
#endif

#if defined(_AIX)
')
#else
.endm
#endif
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -38,6 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
			
		|||
* Macros for N=4 and M=16
 | 
			
		||||
**********************************************************************************************/
 | 
			
		||||
 | 
			
		||||
#ifndef POWER10
 | 
			
		||||
#if defined(_AIX)
 | 
			
		||||
define(`COPY_4x16', `
 | 
			
		||||
#else
 | 
			
		||||
| 
						 | 
				
			
			@ -141,6 +142,7 @@ define(`COPY_4x8', `
 | 
			
		|||
#else
 | 
			
		||||
.endm
 | 
			
		||||
#endif
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
/**********************************************************************************************
 | 
			
		||||
* Macros for N=4 and M=4
 | 
			
		||||
| 
						 | 
				
			
			@ -264,6 +266,7 @@ define(`COPY_4x1', `
 | 
			
		|||
* Macros for N=2 and M=16
 | 
			
		||||
**********************************************************************************************/
 | 
			
		||||
 | 
			
		||||
#ifndef POWER10
 | 
			
		||||
#if defined(_AIX)
 | 
			
		||||
define(`COPY_2x16', `
 | 
			
		||||
#else
 | 
			
		||||
| 
						 | 
				
			
			@ -329,6 +332,7 @@ define(`COPY_2x8', `
 | 
			
		|||
#else
 | 
			
		||||
.endm
 | 
			
		||||
#endif
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
/**********************************************************************************************
 | 
			
		||||
* Macros for N=2 and M=4
 | 
			
		||||
| 
						 | 
				
			
			@ -418,6 +422,7 @@ define(`COPY_2x1', `
 | 
			
		|||
* Macros for N=1 and M=16
 | 
			
		||||
**********************************************************************************************/
 | 
			
		||||
 | 
			
		||||
#ifndef POWER10
 | 
			
		||||
#if defined(_AIX)
 | 
			
		||||
define(`COPY_1x16', `
 | 
			
		||||
#else
 | 
			
		||||
| 
						 | 
				
			
			@ -465,6 +470,7 @@ define(`COPY_1x8', `
 | 
			
		|||
#else
 | 
			
		||||
.endm
 | 
			
		||||
#endif
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
/**********************************************************************************************
 | 
			
		||||
* Macros for N=1 and M=4
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue