/***************************************************************************
Copyright (c) 2013-2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"

#define LOAD	ld

/*
 * Stack frame.  The prologue lowers SP by STACKSIZE bytes and uses the
 * new area as follows (offsets relative to the lowered SP):
 *     0 .. 143   f14-f31   (stfd, 8 bytes each)
 *   144 .. 287   r31-r14   (std,  8 bytes each)
 *   288 .. 479   vs52-vs63 (stxv, 16 bytes each)
 * FLINK_SAVE is where the link register is stored; it lies 16 bytes
 * above the incoming SP (original note: 16($r12)) -- presumably the LR
 * save slot of the caller's frame, TODO confirm against the target ABI.
 */
#define STACKSIZE  (512 )
#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */

/* Register aliases for the kernel's integer arguments. */
#define	M	r3
#define	N	r4
#define	K	r5

#define A	r8
#define	B	r9
#define	C	r10
/* LDC is loaded from FRAMESLOT(0) in the prologue; OFFSET is loaded from
   FRAMESLOT(1), and only when TRMMKERNEL is defined. */
#define	LDC	r6
#define OFFSET	r7

/*
 * Vector-scalar registers with a fixed role.  alpha_r/alpha_i hold the
 * single-precision splat of the values arriving in vs1/vs2; the two
 * permute registers are filled from the perm_const and save_permute
 * constants in the prologue.  Note that vs51 is not in the vs52-vs63
 * range saved/restored by the prologue and epilogue.
 */
#define alpha_r vs51
#define alpha_i vs55
#define save_permute_1 vs59
#define permute_mask vs63
#define o0	0

/*
 * Scratch and loop registers.  T1/T2 are r11/r12; everything from T3
 * onward lives in r14-r31, which the prologue saves and the epilogue
 * restores.  These names are consumed by the included macro and logic
 * files, so none of them may be dropped here.
 */
#define T1	r11
#define T2	r12
#define T3	r14
#define T4	r15
#define T5	r16
#define T6	r17
#define L	r18
#define T7	r19
#define T8	r20
#define TEMP_REG	r21
#define	I	r22
#define J	r23
#define AO	r24
#define	BO	r25
#define	CO 	r26
#define T9	r27
#define	T10	r28
#define	PRE	r29

#define T12	r30
#define T13	r31

/* Must come after the aliases above so the macro file sees them. */
#include "cgemm_macros_power10.S"

/*
 * 64-bit halves of the two 128-bit permute control vectors built in the
 * prologue:
 *   permute_mask   <- mtvsrdd(perm_const2, perm_const1)
 *   save_permute_1 <- mtvsrdd(save_permute_12, save_permute_11)
 * The _AIX branch defines the symbols with .set and uses different
 * save_permute values than the default branch, which uses .equ; the
 * perm_const values are the same in both branches.
 */
#if (_AIX)
.set	perm_const1, 0x0405060700010203
.set	perm_const2, 0x0c0d0e0f08090a0b
.set	save_permute_12, 0x1011121300010203
.set	save_permute_11, 0x18191a1b08090a0b
#else
.equ	perm_const1, 0x0405060700010203
.equ	perm_const2, 0x0c0d0e0f08090a0b
.equ	save_permute_12, 0x0c0d0e0f1c1d1e1f
.equ	save_permute_11, 0x0405060714151617
#endif

#ifndef NEEDPARAM

/*
 * Kernel entry point.
 *
 * Flow: allocate STACKSIZE bytes, save f14-f31, r14-r31, vs52-vs63 and
 * LR; fetch LDC (and OFFSET for TRMMKERNEL) from the caller's frame;
 * splat alpha into alpha_r/alpha_i; build the two permute control
 * vectors; run the body from cgemm_logic_power10.S; then restore
 * everything at .L999 and return.
 */
	PROLOGUE
	PROFCODE

	/* Allocate the save area and pick up the return address. */
	addi	SP, SP, -STACKSIZE
	mflr	r0

	/* Save f14-f31 at 0..136(SP). */
	stfd	f14,    0(SP)
	stfd	f15,    8(SP)
	stfd	f16,   16(SP)
	stfd	f17,   24(SP)

	stfd	f18,   32(SP)
	stfd	f19,   40(SP)
	stfd	f20,   48(SP)
	stfd	f21,   56(SP)

	stfd	f22,   64(SP)
	stfd	f23,   72(SP)
	stfd	f24,   80(SP)
	stfd	f25,   88(SP)

	stfd	f26,   96(SP)
	stfd	f27,  104(SP)
	stfd	f28,  112(SP)
	stfd	f29,  120(SP)

	stfd	f30,  128(SP)
	stfd	f31,  136(SP)

	/* Save r31-r14 at 144..280(SP). */
	std	r31,  144(SP)
	std	r30,  152(SP)
	std	r29,  160(SP)
	std	r28,  168(SP)
	std	r27,  176(SP)
	std	r26,  184(SP)
	std	r25,  192(SP)
	std	r24,  200(SP)
	std	r23,  208(SP)
	std	r22,  216(SP)
	std	r21,  224(SP)
	std	r20,  232(SP)
	std	r19,  240(SP)
	std	r18,  248(SP)
	std	r17,  256(SP)
	std	r16,  264(SP)
	std	r15,  272(SP)
	std	r14,  280(SP)

	/* Save vs52-vs63 at 288..464(SP), 16 bytes each. */
	stxv	vs52,  288(SP)
	stxv	vs53,  304(SP)
	stxv	vs54,  320(SP)
	stxv	vs55,  336(SP)
	stxv	vs56,  352(SP)
	stxv	vs57,  368(SP)
	stxv	vs58,  384(SP)
	stxv	vs59,  400(SP)
	stxv	vs60,  416(SP)
	stxv	vs61,  432(SP)
	stxv	vs62,  448(SP)
	stxv	vs63,  464(SP)
	/* Return address goes above the incoming SP (see FLINK_SAVE). */
	std	r0,   FLINK_SAVE(SP)

	/* Stack-passed arguments are addressed relative to the incoming SP,
	   hence the "+ STACKSIZE". */
	ld	LDC, FRAMESLOT(0) + STACKSIZE(SP)

#ifdef TRMMKERNEL
	ld	OFFSET,  FRAMESLOT(1) + STACKSIZE(SP)
#endif
	/* Scale LDC by 2^ZBASE_SHIFT.
	   NOTE(review): slwi is a 32-bit shift, so the scaled value is
	   limited to 32 bits even though LDC was loaded with a 64-bit ld.
	   Confirm that (ldc << ZBASE_SHIFT) always fits, otherwise sldi
	   would be needed here (and wherever the logic file scales LDC). */
	slwi	LDC, LDC, ZBASE_SHIFT

	/* alpha arrives as two double-precision scalars: real part in vs1
	   (f1), imaginary part in vs2 (f2).  Convert each to single
	   precision and splat word 0 across the vector. */
	xscvdpspn	alpha_r,vs1
	xscvdpspn	alpha_i,vs2
	xxspltw	alpha_r,alpha_r,0
	xxspltw	alpha_i,alpha_i,0

/* Load the reverse permute mask for big endian:
   uint128 = 0x0c0d0e0f08090a0b0405060700010203
   (perm_const2 is the high doubleword, perm_const1 the low one).

   Each 64-bit constant is materialised 16 bits at a time:
   lis/ori build bits 63..32 in the low word, rldicr moves that word
   into the upper half and clears the lower half, then oris/ori fill
   bits 31..0.  The _AIX build spells the 16-bit slices with shift/mask
   expressions; otherwise the @highest/@higher/@h/@l operand modifiers
   are used. */
#if (_AIX)
	lis	T2,	(perm_const2>>48 & 0xFFFF)
	lis	T1,	(perm_const1>>48 & 0xFFFF)
	lis	T3,	(save_permute_12>>48 & 0xFFFF)
	lis	T4,	(save_permute_11>>48 & 0xFFFF)

	ori	T2,	T2,	(perm_const2>>32 & 0xFFFF)
	ori	T1,	T1,	(perm_const1>>32 & 0xFFFF)
	ori	T3,	T3,	(save_permute_12>>32 & 0xFFFF)
	ori	T4,	T4,	(save_permute_11>>32 & 0xFFFF)
#else
	lis	T2, perm_const2@highest
	lis	T1, perm_const1@highest
	lis	T3, save_permute_12@highest
	lis	T4, save_permute_11@highest

	ori	T2, T2, perm_const2@higher
	ori	T1, T1, perm_const1@higher
	ori	T3, T3, save_permute_12@higher
	ori	T4, T4, save_permute_11@higher
#endif

	/* Rotate left by 32 and keep only bits 0..31 (the upper word). */
	rldicr	T2, T2, 32, 31
	rldicr	T1, T1, 32, 31
	rldicr	T3, T3, 32, 31
	rldicr	T4, T4, 32, 31

#if (_AIX)
	oris	T2,	T2,	(perm_const2>>16 & 0xFFFF)
	oris	T1, T1,	(perm_const1>>16 & 0xFFFF)
	oris	T3, T3,	(save_permute_12>>16 & 0xFFFF)
	oris	T4, T4,	(save_permute_11>>16 & 0xFFFF)

	ori	T2, T2,	(perm_const2  & 0xFFFF)
	ori	T1, T1,	(perm_const1 & 0xFFFF)
	ori	T3, T3,	(save_permute_12 &  0xFFFF)
	ori	T4, T4,	(save_permute_11 &  0xFFFF)
#else
	oris	T2, T2, perm_const2@h
	oris	T1, T1, perm_const1@h
	oris	T3, T3, save_permute_12@h
	oris	T4, T4, save_permute_11@h

	ori	T2, T2, perm_const2@l
	ori	T1, T1, perm_const1@l
	ori	T3, T3, save_permute_12@l
	ori	T4, T4, save_permute_11@l
#endif

	/* r0 = 0 and PRE = 512 are set up for the included logic/macros;
	   PRE is presumably a prefetch distance in bytes -- TODO confirm
	   in cgemm_macros_power10.S. */
	li	r0,0
	li	PRE,512

#if defined(CC) || defined(CR) || defined(RC) || defined(RR)
/*negate for this case as we will use addition -1*(a+b) */
	xvnegsp	alpha_r,alpha_r
	xvnegsp	alpha_i,alpha_i
#endif

	/* Assemble the 128-bit control vectors: first GPR operand becomes
	   doubleword 0, second becomes doubleword 1. */
	mtvsrdd	permute_mask,T2,T1
	mtvsrdd	save_permute_1,T3,T4

	/*mask is reverse permute so we have to make it inner permute */
	/* (xxpermdi with selector 2 swaps the two doublewords.) */
	xxpermdi	permute_mask,	permute_mask,	permute_mask,2

#include "cgemm_logic_power10.S"

/* Common exit: restore every register saved in the prologue.  The
   included logic presumably branches here when done -- TODO confirm. */
.L999:
	lfd	f14,    0(SP)
	lfd	f15,    8(SP)
	lfd	f16,   16(SP)
	lfd	f17,   24(SP)

	lfd	f18,   32(SP)
	lfd	f19,   40(SP)
	lfd	f20,   48(SP)
	lfd	f21,   56(SP)

	lfd	f22,   64(SP)
	lfd	f23,   72(SP)
	lfd	f24,   80(SP)
	lfd	f25,   88(SP)

	lfd	f26,   96(SP)
	lfd	f27,  104(SP)
	lfd	f28,  112(SP)
	lfd	f29,  120(SP)

	lfd	f30,  128(SP)
	lfd	f31,  136(SP)

	ld	r31,  144(SP)
	ld	r30,  152(SP)
	ld	r29,  160(SP)
	ld	r28,  168(SP)
	ld	r27,  176(SP)
	ld	r26,  184(SP)
	ld	r25,  192(SP)
	ld	r24,  200(SP)
	ld	r23,  208(SP)
	ld	r22,  216(SP)
	ld	r21,  224(SP)
	ld	r20,  232(SP)
	ld	r19,  240(SP)
	ld	r18,  248(SP)
	ld	r17,  256(SP)
	ld	r16,  264(SP)
	ld	r15,  272(SP)
	ld	r14,  280(SP)

	/* Reload the saved return address; it is moved to LR part-way
	   through the vector restores below. */
	ld	r0, 	 FLINK_SAVE(SP)

	lxv	vs52,  288(SP)
	lxv	vs53,  304(SP)
	lxv	vs54,  320(SP)
	lxv	vs55,  336(SP)
	lxv	vs56,  352(SP)
	lxv	vs57,  368(SP)
	lxv	vs58,  384(SP)
	lxv	vs59,  400(SP)
	mtlr	r0
	lxv	vs60,  416(SP)
	lxv	vs61,  432(SP)
	lxv	vs62,  448(SP)
	lxv	vs63,  464(SP)

	/* Release the save area and return. */
	addi	SP, SP, STACKSIZE
	blr

	EPILOGUE
#endif