AIX changes for Power8

Kavana Bhat 2019-08-20 06:51:35 -05:00
parent 2a43062de7
commit 3dc6b26eff
48 changed files with 9263 additions and 996 deletions


@ -39,6 +39,35 @@
#ifndef COMMON_POWER
#define COMMON_POWER
#define str(x) #x
#ifdef OS_AIX
#define XXSPLTD(T,A,z) xxpermdi T, A, A, 0b##z##z
#define XXMRGHD(T,A,B) xxpermdi T, A, B, 0b00
#define XXMRGLD(T,A,B) xxpermdi T, A, B, 0b11
#define XXSWAPD(T,A) xxpermdi T, A, A, 0b10
#define XVMOVDP(T,A) xvcpsgndp T, A, A
#define XXSPLTD_S(T,A,z) "xxpermdi " str(T) ", " str(A) ", " str(A) ", 0b" str(z ## z) " \n\t"
#define XXMRGHD_S(T,A,B) "xxpermdi " str(T) ", " str(A) ", " str(B) ", 0b00 \n\t"
#define XXMRGLD_S(T,A,B) "xxpermdi " str(T) ", " str(A) ", " str(B) ", 0b11 \n\t"
#define XXSWAPD_S(T,A) "xxpermdi " str(T) ", " str(A) ", " str(A) ", 0b10 \n\t"
#else
#define XXSPLTD(T,A,z) xxspltd T, A, z
#define XXMRGHD(T,A,B) xxmrghd T, A, B
#define XXMRGLD(T,A,B) xxmrgld T, A, B
#define XXSWAPD(T,A) xxswapd T, A
#define XVMOVDP(T,A) xvmovdp T, A
#define XXSPLTD_S(T,A,z) "xxspltd " str(T) ", " str(A) ", " str(z) " \n\t"
#define XXMRGHD_S(T,A,B) "xxmrghd " str(T) ", " str(A) ", " str(B) " \n\t"
#define XXMRGLD_S(T,A,B) "xxmrgld " str(T) ", " str(A) ", " str(B) " \n\t"
#define XXSWAPD_S(T,A) "xxswapd " str(T) ", " str(A) " \n\t"
#endif
#if defined(POWER8) || defined(POWER9)
#define MB __asm__ __volatile__ ("eieio":::"memory")
#define WMB __asm__ __volatile__ ("eieio":::"memory")
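
These *_S variants are meant to be spliced between the string fragments of GCC extended inline asm, which is why the AIX path stringizes its arguments and falls back to xxpermdi. A minimal sketch of that usage (hypothetical pair_sum helper, not part of this commit; operand constraints and register choices are only illustrative):

static double pair_sum (const double *x)
{
  double sum;
  __asm__
  (
    "lxvd2x 32, 0, %1            \n\t"   // vs32 = { x[0], x[1] }
    XXSWAPD_S(33,32)                      // vs33 = vs32 with doublewords swapped
    "xsadddp %x0, 32, 33         \n"      // sum = x[0] + x[1]
    : "=d" (sum)
    : "r" (x), "m" (*(const double (*)[2]) x)
    : "vs32", "vs33"
  );
  return sum;
}

On AIX the XXSWAPD_S line expands to "xxpermdi 33, 32, 32, 0b10", while elsewhere it stays the extended mnemonic "xxswapd 33, 32"; the surrounding template is unchanged either way.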


@ -57,8 +57,6 @@ USE_TRMM = 1
endif
SKERNELOBJS += \
sgemm_kernel$(TSUFFIX).$(SUFFIX) \
$(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \
@ -436,7 +434,10 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmotcopy.s
m4 sgemmotcopy.s > sgemmotcopy_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@
rm sgemmotcopy.s sgemmotcopy_nomacros.s
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
@ -444,12 +445,17 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmitcopy.s
m4 sgemmitcopy.s > sgemmitcopy_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@
rm sgemmitcopy.s sgemmitcopy_nomacros.s
endif
$(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_ncopy.s
m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@
rm dgemm_ncopy.s dgemm_ncopy_nomacros.s
$(KDIR)$(DGEMMOTCOPYOBJ) : $(KERNELDIR)/$(DGEMMOTCOPY)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
@ -460,7 +466,10 @@ $(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
$(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_itcopy.s
m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@
rm dgemm_itcopy.s dgemm_itcopy_nomacros.s
endif
@ -485,10 +494,16 @@ endif
endif
$(KDIR)$(CGEMMONCOPYOBJ) : $(KERNELDIR)/$(CGEMMONCOPY)
# $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o cgemm_oncopy.s
# m4 cgemm_oncopy.s > cgemm_oncopy_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
# rm cgemm_oncopy.s cgemm_oncopy_nomacros.s
$(KDIR)$(CGEMMOTCOPYOBJ) : $(KERNELDIR)/$(CGEMMOTCOPY)
# $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o cgemm_otcopy.s
# m4 cgemm_otcopy.s > cgemm_otcopy_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
# rm cgemm_otcopy.s cgemm_otcopy_nomacros.s
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
@ -496,7 +511,10 @@ $(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
$(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -E $< -o cgemm_itcopy.s
m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@
rm cgemm_itcopy.s cgemm_itcopy_nomacros.s
endif
@ -512,7 +530,10 @@ $(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
$(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o zgemm_itcopy.s
m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@
rm zgemm_itcopy.s zgemm_itcopy_nomacros.s
endif
@ -537,37 +558,67 @@ endif
endif
$(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemm_kernel$(TSUFFIX).s
m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@
rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s
$(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_kernel$(TSUFFIX).s
m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@
rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s
$(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEPEND)
$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
$(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNN $< -o cgemm_kernel_n.s
m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@
rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s
$(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCN $< -o cgemm_kernel_l.s
m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@
rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s
$(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s
m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s
$(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCC $< -o cgemm_kernel_b.s
m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@
rm cgemm_kernel_b.s cgemm_kernel_b_nomacros.s
$(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNN $< -o zgemm_kernel_n.s
m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@
rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s
$(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCN $< -o zgemm_kernel_l.s
m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@
rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s
$(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNC $< -o zgemm_kernel_r.s
m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@
rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s
$(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCC $< -o zgemm_kernel_b.s
m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@
rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s
$(KDIR)xgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@
@ -584,28 +635,56 @@ $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD
ifdef USE_TRMM
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o strmmkernel_ln.s
m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@
rm strmmkernel_ln.s strmmkernel_ln_nomacros.s
$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o strmmkernel_lt.s
m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@
rm strmmkernel_lt.s strmmkernel_lt_nomacros.s
$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o strmmkernel_rn.s
m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@
rm strmmkernel_rn.s strmmkernel_rn_nomacros.s
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s
m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o dtrmm_kernel_ln.s
# $(CC) $(CFLAGS) -E $< -o dtrmm_kernel_ln.s
m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@
rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s
$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o dtrmm_kernel_lt.s
# $(CC) $(CFLAGS) -E $< -o dtrmm_kernel_lt.s
m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@
rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s
$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o dtrmm_kernel_rn.s
# $(CC) $(CFLAGS) -E $< -o dtrmm_kernel_rn.s
m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@
rm dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s
$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o dtrmm_kernel_rt.s
# $(CC) $(CFLAGS) -E $< -o dtrmm_kernel_rt.s
m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@
rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s
$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
@ -620,52 +699,100 @@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_ln.s
m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@
rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s
$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_lt.s
m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@
rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s
$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lr.s
m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ctrmm_kernel_lr_nomacros.s -o $@
rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s
$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lc.s
m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@
rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s
$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rn.s
m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@
rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s
$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rt.s
m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@
rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s
$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ctrmm_kernel_rr.s
m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@
rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s
$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ctrmm_kernel_RC.s
m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@
rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s
$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_ln.s
m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@
rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s
$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_lt.s
m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@
rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s
$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lr.s
m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@
rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s
$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lc.s
m4 ztrmm_kernel_lc.s > ztrmm_kernel_lc_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@
rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s
$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rn.s
m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@
rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s
$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rt.s
m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@
rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s
$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rr.s
m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@
rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rc.s
m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@
rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s
else
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
@ -677,7 +804,10 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s
m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
@ -801,10 +931,16 @@ $(KDIR)strsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RT) $(ST
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@
$(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DTRSMDEPEND)
# $(CC) $(CFLAGS) -E $< -o dtrsm_kernel_ln.s
# m4 dtrsm_kernel_ln.s > dtrsm_kernel_ln_nomacros.s
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@
# rm dtrsm_kernel_ln.s dtrsm_kernel_ln_nomacros.s
$(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND)
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@
$(CC) $(CFLAGS) -E -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o dtrsm_kernel_lt.s
m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@
rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s
$(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND)
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@
@ -1940,7 +2076,7 @@ $(SGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMITCOPY)
endif
$(DGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMONCOPY)
$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
$(DGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMOTCOPY)
@ -2044,7 +2180,10 @@ $(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMM
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@
$(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@
$(CC) $(PFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s
m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s
$(KDIR)cgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@
@ -2083,7 +2222,10 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
$(CC) $(PFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s
m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@


@ -68,10 +68,10 @@ static float casum_kernel_16 (long n, float *x)
"addi %2, %2, 128 \n\t"
"addic. %1, %1, -16 \n\t"
"ble 2f \n\t"
"ble two%= \n\t"
".p2align 5 \n"
"1: \n\t"
".align 5 \n"
"one%=: \n\t"
"xvabssp 48, 40 \n\t"
"xvabssp 49, 41 \n\t"
@ -108,9 +108,9 @@ static float casum_kernel_16 (long n, float *x)
"xvaddsp 38, 38, %x5 \n\t"
"xvaddsp 39, 39, %x6 \n\t"
"bgt 1b \n"
"bgt one%= \n"
"2: \n\t"
"two%=: \n\t"
"xvabssp 48, 40 \n\t"
"xvabssp 49, 41 \n\t"
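
The other recurring change in these inline-asm kernels is the label rewrite: numeric local labels ("1:", "bgt 1b", "ble 2f") become "one%=:" / "two%=", and ".p2align 5" becomes ".align 5", presumably because the AIX assembler does not accept GNU-style numeric local labels. %= expands to a number unique to each asm instance, so the symbolic labels stay distinct even if the containing function is inlined more than once, and on this target .align appears to take the same power-of-two exponent, so the 32-byte alignment is preserved. A minimal sketch of the label pattern (hypothetical count_down helper, not from this commit):

static long count_down (long n)
{
  __asm__
  (
    "one%=:                  \n\t"
    "addic. %0, %0, -1       \n\t"   // decrement and set CR0
    "bgt one%=               \n"     // loop while the result is > 0
    : "+r" (n)
    :
    : "cr0"
  );
  return n;                          // 0 for any n >= 1
}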


@ -62,10 +62,10 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
"addi %2, %2, 128 \n\t"
"addic. %1, %1, -32 \n\t"
"ble 2f \n\t"
"ble two%= \n\t"
".p2align 5 \n"
"1: \n\t"
".align 5 \n"
"one%=: \n\t"
"stxvd2x 32, 0, %3 \n\t"
"stxvd2x 33, %5, %3 \n\t"
@ -108,9 +108,9 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
"addi %2, %2, 128 \n\t"
"addic. %1, %1, -32 \n\t"
"bgt 1b \n"
"bgt one%= \n"
"2: \n\t"
"two%=: \n\t"
"stxvd2x 32, 0, %3 \n\t"
"stxvd2x 33, %5, %3 \n\t"

File diff suppressed because it is too large


@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* Macros for N=4 and M=8
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x8', `
#else
.macro COPY_4x8
#endif
lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0
@ -93,13 +97,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs46, o32, T1
stxvw4x vs47, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=4 and M=4
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x4', `
#else
.macro COPY_4x4
#endif
lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0
@ -133,13 +145,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs38, o32, T1
stxvw4x vs39, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=4 and M=2
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x2', `
#else
.macro COPY_4x2
#endif
lxvw4x vs32, o0, A0
addi A0, A0, 16
@ -163,13 +183,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=4 and M=1
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x1', `
#else
.macro COPY_4x1
#endif
lxsspx vs32, o0, A0
lxsspx vs33, o4, A0
@ -207,13 +235,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs38, o0, T1
stxsspx vs39, o4, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=8
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x8', `
#else
.macro COPY_2x8
#endif
lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0
@ -241,13 +277,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs38, o32, T1
stxvw4x vs39, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=4
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x4', `
#else
.macro COPY_2x4
#endif
lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0
@ -265,13 +309,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs34, o32, T1
stxvw4x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=2
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x2', `
#else
.macro COPY_2x2
#endif
lxvw4x vs32, o0, A0
addi A0, A0, 16
@ -285,13 +337,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs33, o16, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=1
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x1', `
#else
.macro COPY_2x1
#endif
lxsspx vs32, o0, A0
lxsspx vs33, o4, A0
@ -311,13 +371,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs34, o0, T1
stxsspx vs35, o4, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=8
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x8', `
#else
.macro COPY_1x8
#endif
lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0
@ -332,13 +400,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs34, o32, T1
stxvw4x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=4
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x4', `
#else
.macro COPY_1x4
#endif
lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0
@ -349,13 +425,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs32, o0, T1
stxvw4x vs33, o16, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=2
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x2', `
#else
.macro COPY_1x2
#endif
lxvw4x vs32, o0, A0
addi A0, A0, 16
@ -364,13 +448,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs32, o0, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=1
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x1', `
#else
.macro COPY_1x1
#endif
lxsspx vs32, o0, A0
lxsspx vs33, o4, A0
@ -381,5 +473,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs32, o0, T1
stxsspx vs33, o4, T1
#if defined(_AIX)
')
#else
.endm
#endif


@ -56,9 +56,9 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s)
"addi %[x_ptr], %[x_ptr], 64 \n\t"
"addi %[y_ptr], %[y_ptr], 64 \n\t"
"addic. %[temp_n], %[temp_n], -8 \n\t"
"ble 2f \n\t"
".p2align 5 \n\t"
"1: \n\t"
"ble two%= \n\t"
".align 5 \n\t"
"one%=: \n\t"
"xvmulsp 40, 32, 36 \n\t" // c * x
"xvmulsp 41, 33, 36 \n\t"
"xvmulsp 42, 34, 36 \n\t"
@ -104,8 +104,8 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s)
"addi %[x_ptr], %[x_ptr], 128 \n\t"
"addi %[y_ptr], %[y_ptr], 128 \n\t"
"addic. %[temp_n], %[temp_n], -8 \n\t"
"bgt 1b \n\t"
"2: \n\t"
"bgt one%= \n\t"
"two%=: \n\t"
"xvmulsp 40, 32, 36 \n\t" // c * x
"xvmulsp 41, 33, 36 \n\t"
"xvmulsp 42, 34, 36 \n\t"


@ -39,8 +39,8 @@ static void cswap_kernel_32 (long n, float *x, float *y)
{
__asm__
(
".p2align 5 \n"
"1: \n\t"
".align 5 \n"
"one%=: \n\t"
"lxvd2x 32, 0, %4 \n\t"
"lxvd2x 33, %5, %4 \n\t"
@ -131,7 +131,7 @@ static void cswap_kernel_32 (long n, float *x, float *y)
"addi %4, %4, 128 \n\t"
"addic. %2, %2, -32 \n\t"
"bgt 1b \n"
"bgt one%= \n"
"#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
:

File diff suppressed because it is too large


@ -68,10 +68,10 @@ static double dasum_kernel_16 (long n, double *x)
"addi %2, %2, 128 \n\t"
"addic. %1, %1, -16 \n\t"
"ble 2f \n\t"
"ble two%= \n\t"
".p2align 5 \n"
"1: \n\t"
".align 5 \n"
"one%=: \n\t"
"xvabsdp 48, 40 \n\t"
"xvabsdp 49, 41 \n\t"
@ -108,9 +108,9 @@ static double dasum_kernel_16 (long n, double *x)
"xvadddp 38, 38, %x5 \n\t"
"xvadddp 39, 39, %x6 \n\t"
"bgt 1b \n"
"bgt one%= \n"
"2: \n\t"
"two%=: \n\t"
"xvabsdp 48, 40 \n\t"
"xvabsdp 49, 41 \n\t"
@ -140,7 +140,7 @@ static double dasum_kernel_16 (long n, double *x)
"xvadddp 32, 32, 36 \n\t"
"xxswapd 33, 32 \n\t"
XXSWAPD_S(33,32)
"xsadddp %x0, 32, 33 \n"
"#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"


@ -58,7 +58,7 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha)
__asm__
(
"xxspltd %x4, %x22, 0 \n\t"
XXSPLTD_S(%x4,%x22,0)
"dcbt 0, %2 \n\t"
"dcbt 0, %3 \n\t"
@ -90,10 +90,10 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha)
"addi %3, %3, -64 \n\t"
"addic. %1, %1, -16 \n\t"
"ble 2f \n\t"
"ble two%= \n\t"
".align 5 \n"
"1: \n\t"
"one%=: \n\t"
"xvmaddadp %x13, %x5, %x4 \n\t"
"xvmaddadp %x14, %x6, %x4 \n\t"
@ -152,9 +152,9 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha)
"addi %3, %3, -64 \n\t"
"addic. %1, %1, -16 \n\t"
"bgt 1b \n"
"bgt one%= \n"
"2: \n\t"
"two%=: \n\t"
"xvmaddadp %x13, %x5, %x4 \n\t"
"xvmaddadp %x14, %x6, %x4 \n\t"
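
XXSPLTD_S, used above, splats one doubleword of a VSX register into both lanes, which is how alpha is broadcast before the multiply-add loop. A sketch of the same idiom in isolation (hypothetical scale2 helper, not from this commit; constraints are illustrative):

static void scale2 (double *x, double alpha)
{
  __asm__
  (
    XXSPLTD_S(41,%x2,0)              // vs41 = { alpha, alpha }
    "lxvd2x  40, 0, %1       \n\t"   // vs40 = { x[0], x[1] }
    "xvmuldp 40, 40, 41      \n\t"   // scale both lanes
    "stxvd2x 40, 0, %1       \n"
    : "+m" (*(double (*)[2]) x)
    : "r" (x), "d" (alpha)
    : "vs40", "vs41"
  );
}

On AIX the first line expands to "xxpermdi 41, %x2, %x2, 0b00", where %x2 is the VSX number of the register holding alpha.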


@ -62,10 +62,10 @@ static void dcopy_kernel_32 (long n, double *x, double *y)
"addi %2, %2, 128 \n\t"
"addic. %1, %1, -32 \n\t"
"ble 2f \n\t"
"ble two%= \n\t"
".p2align 5 \n"
"1: \n\t"
".align 5 \n"
"one%=: \n\t"
"stxvd2x 32, 0, %3 \n\t"
"stxvd2x 33, %5, %3 \n\t"
@ -108,9 +108,9 @@ static void dcopy_kernel_32 (long n, double *x, double *y)
"addi %2, %2, 128 \n\t"
"addic. %1, %1, -32 \n\t"
"bgt 1b \n"
"bgt one%= \n"
"2: \n\t"
"two%=: \n\t"
"stxvd2x 32, 0, %3 \n\t"
"stxvd2x 33, %5, %3 \n\t"


@ -78,10 +78,10 @@ static double ddot_kernel_8 (long n, double *x, double *y)
"addi %3, %3, 128 \n\t"
"addic. %1, %1, -16 \n\t"
"ble 2f \n\t"
"ble two%= \n\t"
".p2align 5 \n"
"1: \n\t"
".align 5 \n"
"one%=: \n\t"
"xvmaddadp 32, 40, 48 \n\t"
"lxvd2x 40, 0, %2 \n\t"
@ -112,9 +112,9 @@ static double ddot_kernel_8 (long n, double *x, double *y)
"addi %3, %3, 128 \n\t"
"addic. %1, %1, -16 \n\t"
"bgt 1b \n"
"bgt one%= \n"
"2: \n\t"
"two%=: \n\t"
"xvmaddadp 32, 40, 48 \n\t"
"xvmaddadp 33, 41, 49 \n\t"
@ -135,7 +135,7 @@ static double ddot_kernel_8 (long n, double *x, double *y)
"xvadddp 32, 32, 36 \n\t"
"xxswapd 33, 32 \n\t"
XXSWAPD_S(33,32)
"xsadddp %x0, 32, 33 \n"

File diff suppressed because it is too large


@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* Macros for N=4 and M=16
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x16', `
#else
.macro COPY_4x16
#endif
lxvd2x vs0, o0, A0
lxvd2x vs1, o0, A1
@ -180,14 +184,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 128
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=4 and M=8
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x8', `
#else
.macro COPY_4x8
#endif
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
@ -259,14 +271,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 128
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=4 and M=4
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x4', `
#else
.macro COPY_4x4
#endif
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
@ -310,14 +330,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 128
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=4 and M=2
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x2', `
#else
.macro COPY_4x2
#endif
lxvd2x vs0, o0, A0
addi A0, A0, 16
@ -348,14 +376,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 64
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=4 and M=1
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x1', `
#else
.macro COPY_4x1
#endif
lxsdx vs0, o0, A0
addi A0, A0, 8
@ -382,14 +418,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 32
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=16
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x16', `
#else
.macro COPY_2x16
#endif
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
@ -459,14 +503,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 128
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=8
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x8', `
#else
.macro COPY_2x8
#endif
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
@ -506,14 +558,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 128
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=4
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x4', `
#else
.macro COPY_2x4
#endif
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
@ -539,14 +599,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 64
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=2
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x2', `
#else
.macro COPY_2x2
#endif
lxvd2x vs0, o0, A0
addi A0, A0, 16
@ -565,14 +633,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 32
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=1
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x1', `
#else
.macro COPY_2x1
#endif
lxsdx vs0, o0, A0
addi A0, A0, 8
@ -589,14 +665,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 16
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=16
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x16', `
#else
.macro COPY_1x16
#endif
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
@ -622,14 +706,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 64
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=8
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x8', `
#else
.macro COPY_1x8
#endif
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
@ -645,14 +737,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 64
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=4
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x4', `
#else
.macro COPY_1x4
#endif
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
@ -664,14 +764,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 32
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=2
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x2', `
#else
.macro COPY_1x2
#endif
lxvd2x vs0, o0, A0
addi A0, A0, 16
@ -681,14 +789,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 16
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=1
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x1', `
#else
.macro COPY_1x1
#endif
lxsdx vs0, o0, A0
addi A0, A0, 8
@ -698,5 +814,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 8
#if defined(_AIX)
')
#else
.endm
#endif


@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* Macros for N=4 and M=16
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x16', `
#else
.macro COPY_4x16
#endif
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
@ -140,14 +144,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs10, o32, T1
stxvd2x vs11, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=4 and M=8
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x8', `
#else
.macro COPY_4x8
#endif
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
@ -205,14 +217,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs46, o32, T1
stxvd2x vs47, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=4 and M=4
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x4', `
#else
.macro COPY_4x4
#endif
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
@ -250,14 +270,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=4 and M=2
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x2', `
#else
.macro COPY_4x2
#endif
lxvd2x vs32, o0, A0
addi A0, A0, 16
@ -285,14 +313,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=4 and M=1
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x1', `
#else
.macro COPY_4x1
#endif
lxsdx vs32, o0, A0
addi A0, A0, 8
@ -322,14 +358,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsdx vs35, o8, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=16
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x16', `
#else
.macro COPY_2x16
#endif
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
@ -383,14 +427,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs46, o32, T1
stxvd2x vs47, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=8
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x8', `
#else
.macro COPY_2x8
#endif
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
@ -420,14 +472,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=4
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x4', `
#else
.macro COPY_2x4
#endif
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
@ -447,14 +507,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=2
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x2', `
#else
.macro COPY_2x2
#endif
lxvd2x vs32, o0, A0
addi A0, A0, 16
@ -470,14 +538,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs33, o16, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=1
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x1', `
#else
.macro COPY_2x1
#endif
lxsdx vs32, o0, A0
addi A0, A0, 8
@ -493,14 +569,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsdx vs33, o8, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=16
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x16', `
#else
.macro COPY_1x16
#endif
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
@ -528,14 +612,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=8
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x8', `
#else
.macro COPY_1x8
#endif
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
@ -551,14 +643,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=4
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x4', `
#else
.macro COPY_1x4
#endif
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
@ -570,14 +670,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=2
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x2', `
#else
.macro COPY_1x2
#endif
lxvd2x vs32, o0, A0
addi A0, A0, 16
@ -587,14 +695,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs32, o0, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=1
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x1', `
#else
.macro COPY_1x1
#endif
lxsdx vs32, o0, A0
addi A0, A0, 8
@ -604,5 +720,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsdx vs32, o0, T1
#if defined(_AIX)
')
#else
.endm
#endif


@ -46,7 +46,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
(
"lxvd2x 34, 0, %10 \n\t" // x0, x1
"lxvd2x 35, %11, %10 \n\t" // x2, x3
"xxspltd 32, %x9, 0 \n\t" // alpha, alpha
XXSPLTD_S(32,%x9,0) // alpha, alpha
"sldi %6, %13, 3 \n\t" // lda * sizeof (double)
@ -56,10 +56,10 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
"add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda
"add %6, %6, %6 \n\t" // 2 * lda
"xxspltd 32, 34, 0 \n\t" // x0 * alpha, x0 * alpha
"xxspltd 33, 34, 1 \n\t" // x1 * alpha, x1 * alpha
"xxspltd 34, 35, 0 \n\t" // x2 * alpha, x2 * alpha
"xxspltd 35, 35, 1 \n\t" // x3 * alpha, x3 * alpha
XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha
XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha
XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha
XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha
"add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda
"add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda
@ -89,10 +89,10 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
"addi %6, %6, 32 \n\t"
"addic. %1, %1, -4 \n\t"
"ble 2f \n\t"
"ble two%= \n\t"
".p2align 5 \n"
"1: \n\t"
".align 5 \n"
"one%=: \n\t"
"lxvd2x 36, 0, %2 \n\t" // y0, y1
"lxvd2x 37, %11, %2 \n\t" // y2, y3
@ -131,7 +131,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
"addi %2, %2, 32 \n\t"
"addic. %1, %1, -4 \n\t"
"ble 2f \n\t"
"ble two%= \n\t"
"lxvd2x 36, 0, %2 \n\t" // y0, y1
@ -171,7 +171,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
"addi %2, %2, 32 \n\t"
"addic. %1, %1, -4 \n\t"
"ble 2f \n\t"
"ble two%= \n\t"
"lxvd2x 36, 0, %2 \n\t" // y0, y1
@ -211,7 +211,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
"addi %2, %2, 32 \n\t"
"addic. %1, %1, -4 \n\t"
"ble 2f \n\t"
"ble two%= \n\t"
"lxvd2x 36, 0, %2 \n\t" // y0, y1
@ -251,9 +251,9 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
"addi %2, %2, 32 \n\t"
"addic. %1, %1, -4 \n\t"
"bgt 1b \n"
"bgt one%= \n"
"2: \n\t"
"two%=: \n\t"
"lxvd2x 36, 0, %2 \n\t" // y0, y1
"lxvd2x 37, %11, %2 \n\t" // y2, y3


@ -93,11 +93,11 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
"li %[off],32 \n\t"
"ble- 2f \n\t"
"ble- two%= \n\t"
//--------------------------------------------------
".p2align 5 \n\t"
"1: \n\t"
".align 5 \n\t"
"one%=: \n\t"
"xvmaddadp 34,36,32 \n\t"
"xvmaddadp 35,38,32 \n\t"
"addi %[off2], %[off2],32 \n\t"
@ -137,7 +137,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
"lxvd2x 49, %[a6], %[off2] \n\t"
"lxvd2x 51, %[a7], %[off2] \n\t"
"lxvd2x 33, %[x], %[off2] \n\t"
"ble- 2f \n\t"
"ble- two%= \n\t"
"xvmaddadp 34,36,32 \n\t"
"xvmaddadp 35,38,32 \n\t"
"addi %[off2], %[off2],32 \n\t"
@ -177,7 +177,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
"lxvd2x 49, %[a6], %[off2] \n\t"
"lxvd2x 51, %[a7], %[off2] \n\t"
"lxvd2x 33, %[x], %[off2] \n\t"
"ble- 2f \n\t"
"ble- two%= \n\t"
"xvmaddadp 34,36,32 \n\t"
"xvmaddadp 35,38,32 \n\t"
#if defined(PREFETCH)
@ -229,7 +229,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
"lxvd2x 33, %[x], %[off2] \n\t"
"addic. %[n],%[n],-4 \n\t"
"ble- 2f \n\t"
"ble- two%= \n\t"
"addi %[off2], %[off2],32 \n\t"
#if defined(PREFETCH)
@ -288,9 +288,9 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
#if defined(PREFETCH)
"dcbt %[temp],%[x] \n\t"
#endif
"bgt+ 1b \n\t"
".p2align 5 \n\t"
"2: \n\t"
"bgt+ one%= \n\t"
".align 5 \n\t"
"two%=: \n\t"
//--------------------------------------------
"xvmaddadp 34,36,32 \n\t"
@ -301,7 +301,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
"xvmaddadp 7,46,32 \n\t"
"xvmaddadp 8,48,32 \n\t"
"xvmaddadp 9,50,32 \n\t"
"xxspltd 36, %x[alpha], 0 \n\t"
XXSPLTD_S(36,%x[alpha],0)
"xvmaddadp 34,37,33 \n\t"
"xvmaddadp 35,39,33 \n\t"
"xvmaddadp 4,41,33 \n\t"
@ -322,21 +322,21 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
"xxmrgld 42,34,35 \n\t"
"xxmrghd 43,34,35 \n\t"
XXMRGLD_S(42,34,35)
XXMRGHD_S(43,34,35)
"xxmrgld 44,4,5 \n\t"
"xxmrghd 45,4,5 \n\t"
XXMRGLD_S(44,4,5)
XXMRGHD_S(45,4,5)
"xvadddp 42,42,43 \n\t"
"xxmrgld 46,6,7 \n\t"
"xxmrghd 47,6,7 \n\t"
XXMRGLD_S(46,6,7)
XXMRGHD_S(47,6,7)
"xvadddp 44,44,45 \n\t"
"xxmrgld 48,8,9 \n\t"
"xxmrghd 49,8,9 \n\t"
XXMRGLD_S(48,8,9)
XXMRGHD_S(49,8,9)
"xvadddp 46,46,47 \n\t"
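
The XXMRGHD_S / XXMRGLD_S pair above interleaves the high and low doublewords of two partial-sum registers so that a single xvadddp completes the horizontal reduction. A sketch of that step on its own (hypothetical merge_add helper, not from this commit; comments assume lxvd2x leaves the element from the lower address in doubleword 0, as on AIX):

static void merge_add (const double *a, const double *b, double *out)
{
  __asm__
  (
    "lxvd2x 34, 0, %0        \n\t"   // vs34 = { a[0], a[1] }
    "lxvd2x 35, 0, %1        \n\t"   // vs35 = { b[0], b[1] }
    XXMRGHD_S(42,34,35)               // vs42 = { a[0], b[0] }  (high doublewords)
    XXMRGLD_S(43,34,35)               // vs43 = { a[1], b[1] }  (low doublewords)
    "xvadddp 42, 42, 43      \n\t"    // { a[0]+a[1], b[0]+b[1] }
    "stxvd2x 42, 0, %2       \n"
    :
    : "r" (a), "r" (b), "r" (out)
    : "vs34", "vs35", "vs42", "vs43", "memory"
  );
}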


@ -51,8 +51,8 @@ static void drot_kernel_16 (long n, double *x, double *y, double c, double s)
__asm__
(
"xxspltd 36, %x13, 0 \n\t" // load c to both dwords
"xxspltd 37, %x14, 0 \n\t" // load s to both dwords
XXSPLTD_S(36,%x13,0) // load c to both dwords
XXSPLTD_S(37,%x14,0) // load s to both dwords
"lxvd2x 32, 0, %3 \n\t" // load x
"lxvd2x 33, %15, %3 \n\t"
@ -68,10 +68,10 @@ static void drot_kernel_16 (long n, double *x, double *y, double c, double s)
"addi %4, %4, 64 \n\t"
"addic. %2, %2, -8 \n\t"
"ble 2f \n\t"
"ble two%= \n\t"
".p2align 5 \n"
"1: \n\t"
".align 5 \n"
"one%=: \n\t"
"xvmuldp 40, 32, 36 \n\t" // c * x
"xvmuldp 41, 33, 36 \n\t"
@ -135,9 +135,9 @@ static void drot_kernel_16 (long n, double *x, double *y, double c, double s)
"addi %4, %4, 128 \n\t"
"addic. %2, %2, -8 \n\t"
"bgt 1b \n"
"bgt one%= \n"
"2: \n\t"
"two%=: \n\t"
"xvmuldp 40, 32, 36 \n\t" // c * x
"xvmuldp 41, 33, 36 \n\t"


@ -41,7 +41,7 @@ static void dscal_kernel_8 (long n, double *x, double alpha)
(
"dcbt 0, %2 \n\t"
"xxspltd %x3, %x3, 0 \n\t"
XXSPLTD_S(%x3,%x3,0)
"lxvd2x 32, 0, %2 \n\t"
"lxvd2x 33, %4, %2 \n\t"
@ -55,10 +55,10 @@ static void dscal_kernel_8 (long n, double *x, double alpha)
"addi %2, %2, 128 \n\t"
"addic. %1, %1, -16 \n\t"
"ble 2f \n\t"
"ble two%= \n\t"
".p2align 5 \n"
"1: \n\t"
".align 5 \n"
"one%=: \n\t"
"xvmuldp 40, 32, %x3 \n\t"
"xvmuldp 41, 33, %x3 \n\t"
@ -91,9 +91,9 @@ static void dscal_kernel_8 (long n, double *x, double alpha)
"addi %2, %2, 256 \n\t"
"addic. %1, %1, -16 \n\t"
"bgt 1b \n"
"bgt one%= \n"
"2: \n\t"
"two%=: \n\t"
"xvmuldp 40, 32, %x3 \n\t"
"xvmuldp 41, 33, %x3 \n\t"
@ -146,8 +146,8 @@ static void dscal_kernel_8_zero (long n, double *x)
(
"xxlxor %x3, %x3, %x3 \n\t"
".p2align 5 \n"
"1: \n\t"
".align 5 \n"
"one%=: \n\t"
"stxvd2x %x3, 0, %2 \n\t"
"stxvd2x %x3, %4, %2 \n\t"
@ -161,7 +161,7 @@ static void dscal_kernel_8_zero (long n, double *x)
"addi %2, %2, 128 \n\t"
"addic. %1, %1, -16 \n\t"
"bgt 1b \n"
"bgt one%= \n"
"#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
:


@ -39,8 +39,8 @@ static void dswap_kernel_32 (long n, double *x, double *y)
{
__asm__
(
".p2align 5 \n"
"1: \n\t"
".align 5 \n"
"one%=: \n\t"
"lxvd2x 32, 0, %4 \n\t"
"lxvd2x 33, %5, %4 \n\t"
@ -131,7 +131,7 @@ static void dswap_kernel_32 (long n, double *x, double *y)
"addi %4, %4, 128 \n\t"
"addic. %2, %2, -32 \n\t"
"bgt 1b \n"
"bgt one%= \n"
"#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
:

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -58,8 +58,8 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
"vaddudm 9,8,%[adder] \n\t" //{3,2} vs41
@ -69,7 +69,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
"xxlxor 39,39,39 \n\t" // vs39 vec_max_value
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
"xxspltd 36,36,0 \n\t"
XXSPLTD_S(36,36,0)
"xvabsdp 44, 44 \n\t"
"xvabsdp 45, 45 \n\t"
@ -77,21 +77,21 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"xvabsdp 47, 47 \n\t"
"xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t"
"xvabsdp 51, 51 \n\t"
"xvabsdp 6, 6 \n\t"
"xvabsdp 7, 7 \n\t"
//jump first half forward
"b 2f \n\t"
"b two%= \n\t"
//===================================================================
".p2align 5 \n\t"
".align 5 \n\t"
"1: \n\t"
"one%=: \n\t"
"xvcmpgtdp 2,45,44 \n\t "
"xvcmpgtdp 3,47,46 \n\t "
"xvcmpgtdp 4,49,48 \n\t "
"xvcmpgtdp 5,51,50 \n\t"
"xvcmpgtdp 5,7,6 \n\t"
"xxsel 32,40,41,2 \n\t"
"xxsel 0,44,45,2 \n\t"
@ -100,7 +100,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"xxsel 34,40,41,4 \n\t"
"xxsel 45,48,49,4 \n\t"
"xxsel 35,42,43,5 \n\t"
"xxsel 47,50,51,5 \n\t"
"xxsel 47,6,7,5 \n\t"
"xvcmpgtdp 2, 1,0 \n\t"
"xvcmpgtdp 3,47, 45 \n\t"
@ -134,8 +134,8 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"vaddudm 1,1,5 \n\t" // get real index for first bigger
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
//compare with previous to get vec_max_index(v6 | vs38 ) and vec_max_value (vs39)
"xvcmpgtdp 2, 3,39 \n\t"
@ -155,16 +155,16 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t"
"xvabsdp 51, 51 \n\t"
"xvabsdp 6, 6 \n\t"
"xvabsdp 7, 7 \n\t"
//<-----------jump here from first load
"2: \n\t"
"two%=: \n\t"
"xvcmpgtdp 2,45,44 \n\t "
"xvcmpgtdp 3,47,46 \n\t "
"xvcmpgtdp 4,49,48 \n\t "
"xvcmpgtdp 5,51,50 \n\t"
"xvcmpgtdp 5,7,6 \n\t"
"xxsel 32,40,41,2 \n\t"
"xxsel 0,44,45,2 \n\t"
@ -173,7 +173,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"xxsel 34,40,41,4 \n\t"
"xxsel 45,48,49,4 \n\t"
"xxsel 35,42,43,5 \n\t"
"xxsel 47,50,51,5 \n\t"
"xxsel 47,6,7,5 \n\t"
"xvcmpgtdp 2, 1,0 \n\t"
"xvcmpgtdp 3,47, 45 \n\t"
@ -203,8 +203,8 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"vaddudm 1,1,5 \n\t" // get real index for first bigger
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
@ -226,21 +226,21 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t"
"xvabsdp 51, 51 \n\t"
"xvabsdp 6, 6 \n\t"
"xvabsdp 7, 7 \n\t"
//decrement n
"addic. %[n], %[n], -32 \n\t"
//Loop back if >0
"bgt+ 1b \n\t"
"bgt+ one%= \n\t"
//==============================================================================
"xvcmpgtdp 2,45,44 \n\t "
"xvcmpgtdp 3,47,46 \n\t "
"xvcmpgtdp 4,49,48 \n\t "
"xvcmpgtdp 5,51,50 \n\t"
"xvcmpgtdp 5,7,6 \n\t"
"xxsel 32,40,41,2 \n\t"
"xxsel 0,44,45,2 \n\t"
@ -249,7 +249,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"xxsel 34,40,41,4 \n\t"
"xxsel 45,48,49,4 \n\t"
"xxsel 35,42,43,5 \n\t"
"xxsel 47,50,51,5 \n\t"
"xxsel 47,6,7,5 \n\t"
"xvcmpgtdp 2, 1,0 \n\t"
"xvcmpgtdp 3,47, 45 \n\t"
@ -276,28 +276,28 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
///////extract max value and max index from vector
"xxspltd 32,38,1 \n\t"
"xxspltd 40,39,1 \n\t"
XXSPLTD_S(32,38,1)
XXSPLTD_S(40,39,1)
"xvcmpeqdp. 2, 40,39 \n\t"
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
//0b001110=14
"bc 14,24, 3f \n\t"
"bc 14,24, three%= \n\t"
"xvcmpgtdp 4, 40,39 \n\t"
"xxsel 0,39,40,4 \n\t"
"xxsel 1,38,32,4 \n\t"
"stxsdx 0,0,%[ptr_maxf] \n\t"
"b 4f \n\t"
"b four%= \n\t"
"3: \n\t"
"three%=: \n\t"
//if elements value are equal then choose minimum index
"xxspltd 0,40,0 \n\t"
XXSPLTD_S(0,40,0)
"vminud 0,0,6 \n\t" //vs32 vs38
"xxlor 1,32,32 \n\t"
"stxsdx 0,0,%[ptr_maxf] \n\t"
"4: \n\t"
"four%=: \n\t"
"mfvsrd %[index],1 \n\t"
: [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
@ -306,7 +306,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
[start] "v"(start), [adder] "v"(temp_add_index)
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7"
);
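For orientation, the scalar computation this vectorized iamax kernel implements is roughly the sketch below (plain C with assumed long/double types; the index is kept zero-based here for illustration, and the strict '>' reproduces the kernel's "choose minimum index" tie rule):

#include <math.h>

static long diamax_ref(long n, const double *x, double *maxf)
{
    long idx = 0;
    double maxv = 0.0;
    for (long i = 0; i < n; i++) {
        double v = fabs(x[i]);
        if (v > maxv) {          /* strict '>' keeps the lowest index on ties */
            maxv = v;
            idx = i;
        }
    }
    *maxf = maxv;
    return idx;
}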

View File

@ -58,8 +58,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
"vaddudm 9,8, %[adder] \n\t" //{3,2} vs41
@ -69,7 +69,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
"lxvdsx 39,0,%[ptr_minf] \n\t" // vs39 vec_min_value
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
"xxspltd 36,36,0 \n\t"
XXSPLTD_S(36,36,0)
"xvabsdp 39, 39 \n\t"
"xvabsdp 44, 44 \n\t"
@ -78,21 +78,21 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"xvabsdp 47, 47 \n\t"
"xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t"
"xvabsdp 51, 51 \n\t"
"xvabsdp 6, 6 \n\t"
"xvabsdp 7, 7 \n\t"
//jump first half forward
"b 2f \n\t"
"b two%= \n\t"
//===================================================================
".p2align 5 \n\t"
".align 5 \n\t"
"1: \n\t"
"one%=: \n\t"
"xvcmpgtdp 2,44,45 \n\t "
"xvcmpgtdp 3,46,47 \n\t "
"xvcmpgtdp 4,48,49 \n\t "
"xvcmpgtdp 5,50,51 \n\t"
"xvcmpgtdp 5,6,7 \n\t"
"xxsel 32,40,41,2 \n\t"
"xxsel 0,44,45,2 \n\t"
@ -101,7 +101,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"xxsel 34,40,41,4 \n\t"
"xxsel 45,48,49,4 \n\t"
"xxsel 35,42,43,5 \n\t"
"xxsel 47,50,51,5 \n\t"
"xxsel 47,6,7,5 \n\t"
"xvcmpgtdp 2,0, 1 \n\t"
"xvcmpgtdp 3, 45,47 \n\t"
@ -135,8 +135,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"vaddudm 1,1,5 \n\t" // get real index for first smaller
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
//compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39)
"xvcmpgtdp 2,39, 3 \n\t"
@ -156,16 +156,16 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t"
"xvabsdp 51, 51 \n\t"
"xvabsdp 6, 6 \n\t"
"xvabsdp 7, 7 \n\t"
//<-----------jump here from first load
"2: \n\t"
"two%=: \n\t"
"xvcmpgtdp 2,44,45 \n\t "
"xvcmpgtdp 3,46,47 \n\t "
"xvcmpgtdp 4,48,49 \n\t "
"xvcmpgtdp 5,50,51 \n\t"
"xvcmpgtdp 5,6,7 \n\t"
"xxsel 32,40,41,2 \n\t"
"xxsel 0,44,45,2 \n\t"
@ -174,7 +174,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"xxsel 34,40,41,4 \n\t"
"xxsel 45,48,49,4 \n\t"
"xxsel 35,42,43,5 \n\t"
"xxsel 47,50,51,5 \n\t"
"xxsel 47,6,7,5 \n\t"
"xvcmpgtdp 2,0, 1 \n\t"
"xvcmpgtdp 3, 45,47 \n\t"
@ -204,8 +204,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"vaddudm 1,1,5 \n\t" // get real index for first smaller
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
@ -227,21 +227,21 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t"
"xvabsdp 51, 51 \n\t"
"xvabsdp 6, 6 \n\t"
"xvabsdp 7, 7 \n\t"
//decrement n
"addic. %[n], %[n], -32 \n\t"
//Loop back if >0
"bgt+ 1b \n\t"
"bgt+ one%= \n\t"
//==============================================================================
"xvcmpgtdp 2,44,45 \n\t "
"xvcmpgtdp 3,46,47 \n\t "
"xvcmpgtdp 4,48,49 \n\t "
"xvcmpgtdp 5,50,51 \n\t"
"xvcmpgtdp 5,6,7 \n\t"
"xxsel 32,40,41,2 \n\t"
"xxsel 0,44,45,2 \n\t"
@ -250,7 +250,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"xxsel 34,40,41,4 \n\t"
"xxsel 45,48,49,4 \n\t"
"xxsel 35,42,43,5 \n\t"
"xxsel 47,50,51,5 \n\t"
"xxsel 47,6,7,5 \n\t"
"xvcmpgtdp 2,0, 1 \n\t"
"xvcmpgtdp 3, 45,47 \n\t"
@ -277,28 +277,28 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
///////extract min value and min index from vector
"xxspltd 32,38,1 \n\t"
"xxspltd 40,39,1 \n\t"
XXSPLTD_S(32,38,1)
XXSPLTD_S(40,39,1)
"xvcmpeqdp. 2, 40,39 \n\t"
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
//0b001110=14
"bc 14,24, 3f \n\t"
"bc 14,24, three%= \n\t"
"xvcmpgtdp 4,39, 40 \n\t"
"xxsel 0,39,40,4 \n\t"
"xxsel 1,38,32,4 \n\t"
"stxsdx 0,0,%[ptr_minf] \n\t"
"b 4f \n\t"
"b four%= \n\t"
"3: \n\t"
"three%=: \n\t"
//if elements value are equal then choose minimum index
"xxspltd 0,40,0 \n\t"
XXSPLTD_S(0,40,0)
"vminud 0,0,6 \n\t" //vs32 vs38
"xxlor 1,32,32 \n\t"
"stxsdx 0,0,%[ptr_minf] \n\t"
"4: \n\t"
"four%=: \n\t"
"mfvsrd %[index],1 \n\t"
: [minf] "=m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
@ -307,7 +307,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
[start] "v"(start), [adder] "v"(temp_add_index)
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7"
);
return index;

View File

@ -56,8 +56,8 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
"vaddudm 9,8,%[adder] \n\t" //{3,2} vs41
@ -67,7 +67,7 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
"xxlxor 39,39,39 \n\t" // vs39 vec_max_value is zero
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
"xxspltd 36,36,0 \n\t"
XXSPLTD_S(36,36,0)
@ -77,24 +77,24 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"xvabsdp 47, 47 \n\t"
"xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t"
"xvabsdp 51, 51 \n\t"
"xvabsdp 6, 6 \n\t"
"xvabsdp 7, 7 \n\t"
//jump first half forward
"b 2f \n\t"
"b two%= \n\t"
".p2align 5 \n\t"
"1: \n\t"
".align 5 \n\t"
"one%=: \n\t"
"xxmrghd 0,44,45 \n\t"
"xxmrgld 1,44,45 \n\t"
"xxmrghd 2,46,47 \n\t"
"xxmrgld 3,46,47 \n\t"
"xxmrghd 4,48,49 \n\t"
"xxmrgld 5,48,49 \n\t"
"xxmrghd 44,50,51 \n\t"
"xxmrgld 45,50,51 \n\t"
XXMRGHD_S(0,44,45)
XXMRGLD_S(1,44,45)
XXMRGHD_S(2,46,47)
XXMRGLD_S(3,46,47)
XXMRGHD_S(4,48,49)
XXMRGLD_S(5,48,49)
XXMRGHD_S(44,6,7)
XXMRGLD_S(45,6,7)
"xvadddp 46, 0,1 \n\t"
"xvadddp 47, 2,3 \n\t"
@ -103,15 +103,15 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"xvcmpgtdp 50,47,46 \n\t "
"xvcmpgtdp 51,49,48 \n\t "
"xvcmpgtdp 6,47,46 \n\t "
"xvcmpgtdp 7,49,48 \n\t "
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
"xxsel 32,40,41,50 \n\t"
"xxsel 0,46,47,50 \n\t"
"xxsel 33,42,43,51 \n\t"
"xxsel 1,48,49,51 \n\t"
"xxsel 32,40,41,6 \n\t"
"xxsel 0,46,47,6 \n\t"
"xxsel 33,42,43,7 \n\t"
"xxsel 1,48,49,7 \n\t"
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
@ -133,8 +133,8 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
//select with previous
"xxsel 38,38,32,4 \n\t"
"xxsel 39,39,3,4 \n\t"
@ -148,35 +148,35 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"xvabsdp 47, 47 \n\t"
"xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t"
"xvabsdp 51, 51 \n\t"
"xvabsdp 6, 6 \n\t"
"xvabsdp 7, 7 \n\t"
//>>/////////////////////////////// half start
"2: \n\t"
"xxmrghd 0,44,45 \n\t"
"xxmrgld 1,44,45 \n\t"
"xxmrghd 2,46,47 \n\t"
"xxmrgld 3,46,47 \n\t"
"xxmrghd 4,48,49 \n\t"
"xxmrgld 5,48,49 \n\t"
"xxmrghd 44,50,51 \n\t"
"xxmrgld 45,50,51 \n\t"
"two%=: \n\t"
XXMRGHD_S(0,44,45)
XXMRGLD_S(1,44,45)
XXMRGHD_S(2,46,47)
XXMRGLD_S(3,46,47)
XXMRGHD_S(4,48,49)
XXMRGLD_S(5,48,49)
XXMRGHD_S(44,6,7)
XXMRGLD_S(45,6,7)
"xvadddp 46, 0,1 \n\t"
"xvadddp 47, 2,3 \n\t"
"xvadddp 48, 4,5 \n\t"
"xvadddp 49, 44,45 \n\t"
"xvcmpgtdp 50,47,46 \n\t "
"xvcmpgtdp 51,49,48 \n\t "
"xvcmpgtdp 6,47,46 \n\t "
"xvcmpgtdp 7,49,48 \n\t "
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
"xxsel 32,40,41,50 \n\t"
"xxsel 0,46,47,50 \n\t"
"xxsel 33,42,43,51 \n\t"
"xxsel 1,48,49,51 \n\t"
"xxsel 32,40,41,6 \n\t"
"xxsel 0,46,47,6 \n\t"
"xxsel 33,42,43,7 \n\t"
"xxsel 1,48,49,7 \n\t"
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
@ -198,8 +198,8 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
//select with previous
"xxsel 38,38,32,4 \n\t"
"xxsel 39,39,3,4 \n\t"
@ -211,24 +211,24 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"xvabsdp 47, 47 \n\t"
"xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t"
"xvabsdp 51, 51 \n\t"
"xvabsdp 6, 6 \n\t"
"xvabsdp 7, 7 \n\t"
//decrement n
"addic. %[n], %[n], -16 \n\t"
//Loop back if >0
"bgt+ 1b \n\t"
"bgt+ one%= \n\t"
"xxmrghd 0,44,45 \n\t"
"xxmrgld 1,44,45 \n\t"
"xxmrghd 2,46,47 \n\t"
"xxmrgld 3,46,47 \n\t"
"xxmrghd 4,48,49 \n\t"
"xxmrgld 5,48,49 \n\t"
"xxmrghd 44,50,51 \n\t"
"xxmrgld 45,50,51 \n\t"
XXMRGHD_S(0,44,45)
XXMRGLD_S(1,44,45)
XXMRGHD_S(2,46,47)
XXMRGLD_S(3,46,47)
XXMRGHD_S(4,48,49)
XXMRGLD_S(5,48,49)
XXMRGHD_S(44,6,7)
XXMRGLD_S(45,6,7)
"xvadddp 46, 0,1 \n\t"
"xvadddp 47, 2,3 \n\t"
@ -237,13 +237,13 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"xvcmpgtdp 50,47,46 \n\t "
"xvcmpgtdp 51,49,48 \n\t "
"xvcmpgtdp 6,47,46 \n\t "
"xvcmpgtdp 7,49,48 \n\t "
"xxsel 32,40,41,50 \n\t"
"xxsel 0,46,47,50 \n\t"
"xxsel 33,42,43,51 \n\t"
"xxsel 1,48,49,51 \n\t"
"xxsel 32,40,41,6 \n\t"
"xxsel 0,46,47,6 \n\t"
"xxsel 33,42,43,7 \n\t"
"xxsel 1,48,49,7 \n\t"
"xvcmpgtdp 2,1,0 \n\t "
"xxsel 32,32,33,2 \n\t"
@ -262,28 +262,28 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
///////extract max value and max index from vector
"xxspltd 32,38,1 \n\t"
"xxspltd 40,39,1 \n\t"
XXSPLTD_S(32,38,1)
XXSPLTD_S(40,39,1)
"xvcmpeqdp. 2, 40,39 \n\t"
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
//0b001110=14
"bc 14,24, 3f \n\t"
"bc 14,24, three%= \n\t"
"xvcmpgtdp 4, 40,39 \n\t"
"xxsel 0,39,40,4 \n\t"
"xxsel 1,38,32,4 \n\t"
"stxsdx 0,0,%[ptr_maxf] \n\t"
"b 4f \n\t"
"b four%= \n\t"
"3: \n\t"
"three%=: \n\t"
//if elements value are equal then choose minimum index
"xxspltd 0,40,0 \n\t"
XXSPLTD_S(0,40,0)
"vminud 0,0,6 \n\t" //vs32 vs38
"xxlor 1,32,32 \n\t"
"stxsdx 0,0,%[ptr_maxf] \n\t"
"4: \n\t"
"four%=: \n\t"
"mfvsrd %[index],1 \n\t"
: [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
@ -292,7 +292,7 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
[start] "v"(start), [adder] "v"(temp_add_index)
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7"
);
return index;

View File

@ -54,8 +54,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
"vaddudm 9,8,%[adder] \n\t" //{3,2} vs41
@ -65,7 +65,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
"lxvdsx 39,0,%[ptr_minf] \n\t" // vs39 vec_min_value
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
"xxspltd 36,36,0 \n\t"
XXSPLTD_S(36,36,0)
@ -75,24 +75,24 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
"xvabsdp 47, 47 \n\t"
"xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t"
"xvabsdp 51, 51 \n\t"
"xvabsdp 6, 6 \n\t"
"xvabsdp 7, 7 \n\t"
//jump first half forward
"b 2f \n\t"
"b two%= \n\t"
".p2align 5 \n\t"
"1: \n\t"
".align 5 \n\t"
"one%=: \n\t"
"xxmrghd 0,44,45 \n\t"
"xxmrgld 1,44,45 \n\t"
"xxmrghd 2,46,47 \n\t"
"xxmrgld 3,46,47 \n\t"
"xxmrghd 4,48,49 \n\t"
"xxmrgld 5,48,49 \n\t"
"xxmrghd 44,50,51 \n\t"
"xxmrgld 45,50,51 \n\t"
XXMRGHD_S(0,44,45)
XXMRGLD_S(1,44,45)
XXMRGHD_S(2,46,47)
XXMRGLD_S(3,46,47)
XXMRGHD_S(4,48,49)
XXMRGLD_S(5,48,49)
XXMRGHD_S(44,6,7)
XXMRGLD_S(45,6,7)
"xvadddp 46, 0,1 \n\t"
"xvadddp 47, 2,3 \n\t"
@ -101,15 +101,15 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
"xvcmpgtdp 50,46,47 \n\t "
"xvcmpgtdp 51,48,49 \n\t "
"xvcmpgtdp 6,46,47 \n\t "
"xvcmpgtdp 7,48,49 \n\t "
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
"xxsel 32,40,41,50 \n\t"
"xxsel 0,46,47,50 \n\t"
"xxsel 33,42,43,51 \n\t"
"xxsel 1,48,49,51 \n\t"
"xxsel 32,40,41,6 \n\t"
"xxsel 0,46,47,6 \n\t"
"xxsel 33,42,43,7 \n\t"
"xxsel 1,48,49,7 \n\t"
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
@ -131,8 +131,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
//select with previous
"xxsel 38,38,32,4 \n\t"
"xxsel 39,39,3,4 \n\t"
@ -146,35 +146,35 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
"xvabsdp 47, 47 \n\t"
"xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t"
"xvabsdp 51, 51 \n\t"
"xvabsdp 6, 6 \n\t"
"xvabsdp 7, 7 \n\t"
//>>/////////////////////////////// half start
"2: \n\t"
"xxmrghd 0,44,45 \n\t"
"xxmrgld 1,44,45 \n\t"
"xxmrghd 2,46,47 \n\t"
"xxmrgld 3,46,47 \n\t"
"xxmrghd 4,48,49 \n\t"
"xxmrgld 5,48,49 \n\t"
"xxmrghd 44,50,51 \n\t"
"xxmrgld 45,50,51 \n\t"
"two%=: \n\t"
XXMRGHD_S(0,44,45)
XXMRGLD_S(1,44,45)
XXMRGHD_S(2,46,47)
XXMRGLD_S(3,46,47)
XXMRGHD_S(4,48,49)
XXMRGLD_S(5,48,49)
XXMRGHD_S(44,6,7)
XXMRGLD_S(45,6,7)
"xvadddp 46, 0,1 \n\t"
"xvadddp 47, 2,3 \n\t"
"xvadddp 48, 4,5 \n\t"
"xvadddp 49, 44,45 \n\t"
"xvcmpgtdp 50,46,47 \n\t "
"xvcmpgtdp 51,48,49 \n\t "
"xvcmpgtdp 6,46,47 \n\t "
"xvcmpgtdp 7,48,49 \n\t "
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
"xxsel 32,40,41,50 \n\t"
"xxsel 0,46,47,50 \n\t"
"xxsel 33,42,43,51 \n\t"
"xxsel 1,48,49,51 \n\t"
"xxsel 32,40,41,6 \n\t"
"xxsel 0,46,47,6 \n\t"
"xxsel 33,42,43,7 \n\t"
"xxsel 1,48,49,7 \n\t"
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
@ -196,8 +196,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
//select with previous
"xxsel 38,38,32,4 \n\t"
"xxsel 39,39,3,4 \n\t"
@ -209,24 +209,24 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
"xvabsdp 47, 47 \n\t"
"xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t"
"xvabsdp 51, 51 \n\t"
"xvabsdp 6, 6 \n\t"
"xvabsdp 7, 7 \n\t"
//decrement n
"addic. %[n], %[n], -16 \n\t"
//Loop back if >0
"bgt+ 1b \n\t"
"bgt+ one%= \n\t"
"xxmrghd 0,44,45 \n\t"
"xxmrgld 1,44,45 \n\t"
"xxmrghd 2,46,47 \n\t"
"xxmrgld 3,46,47 \n\t"
"xxmrghd 4,48,49 \n\t"
"xxmrgld 5,48,49 \n\t"
"xxmrghd 44,50,51 \n\t"
"xxmrgld 45,50,51 \n\t"
XXMRGHD_S(0,44,45)
XXMRGLD_S(1,44,45)
XXMRGHD_S(2,46,47)
XXMRGLD_S(3,46,47)
XXMRGHD_S(4,48,49)
XXMRGLD_S(5,48,49)
XXMRGHD_S(44,6,7)
XXMRGLD_S(45,6,7)
"xvadddp 46, 0,1 \n\t"
"xvadddp 47, 2,3 \n\t"
@ -235,13 +235,13 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
"xvcmpgtdp 50,46,47 \n\t "
"xvcmpgtdp 51,48,49 \n\t "
"xvcmpgtdp 6,46,47 \n\t "
"xvcmpgtdp 7,48,49 \n\t "
"xxsel 32,40,41,50 \n\t"
"xxsel 0,46,47,50 \n\t"
"xxsel 33,42,43,51 \n\t"
"xxsel 1,48,49,51 \n\t"
"xxsel 32,40,41,6 \n\t"
"xxsel 0,46,47,6 \n\t"
"xxsel 33,42,43,7 \n\t"
"xxsel 1,48,49,7 \n\t"
"xvcmpgtdp 2,0,1 \n\t "
"xxsel 32,32,33,2 \n\t"
@ -260,28 +260,28 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
///////extract min value and min index from vector
"xxspltd 32,38,1 \n\t"
"xxspltd 40,39,1 \n\t"
XXSPLTD_S(32,38,1)
XXSPLTD_S(40,39,1)
"xvcmpeqdp. 2, 40,39 \n\t"
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
//0b001110=14
"bc 14,24, 3f \n\t"
"bc 14,24, three%= \n\t"
"xvcmpgtdp 4,39, 40 \n\t"
"xxsel 0,39,40,4 \n\t"
"xxsel 1,38,32,4 \n\t"
"stxsdx 0,0,%[ptr_minf] \n\t"
"b 4f \n\t"
"b four%= \n\t"
"3: \n\t"
"three%=: \n\t"
//if elements value are equal then choose minimum index
"xxspltd 0,40,0 \n\t"
XXSPLTD_S(0,40,0)
"vminud 0,0,6 \n\t" //vs32 vs38
"xxlor 1,32,32 \n\t"
"stxsdx 0,0,%[ptr_minf] \n\t"
"4: \n\t"
"four%=: \n\t"
"mfvsrd %[index],1 \n\t"
: [minf] "=m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
@ -290,7 +290,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
[start] "v"(start), [adder] "v"(temp_add_index)
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7"
);
return index;

View File

@ -46,10 +46,10 @@ static void __inline blas_lock(volatile BLASULONG *address){
" .machine \"any\" ;"
"0: lwarx %0,0, %1 ;"
" cmpwi 0,%0,0;"
" bne 1f;"
" bne one%=;"
" stwcx. %2,0, %1 ;"
" bne- 0b;"
"1: "
"one%=: "
: "=&r"(ret)
: "r"(address), "r" (val)
: "cr0", "memory");

View File

@ -68,10 +68,10 @@ static float sasum_kernel_32 (long n, float *x)
"addi %2, %2, 128 \n\t"
"addic. %1, %1, -32 \n\t"
"ble 2f \n\t"
"ble two%= \n\t"
".p2align 5 \n"
"1: \n\t"
".align 5 \n"
"one%=: \n\t"
"xvabssp 48, 40 \n\t"
"xvabssp 49, 41 \n\t"
@ -108,9 +108,9 @@ static float sasum_kernel_32 (long n, float *x)
"xvaddsp 38, 38, %x5 \n\t"
"xvaddsp 39, 39, %x6 \n\t"
"bgt 1b \n"
"bgt one%= \n"
"2: \n\t"
"two%=: \n\t"
"xvabssp 48, 40 \n\t"
"xvabssp 49, 41 \n\t"

View File

@ -51,10 +51,10 @@ static void scopy_kernel_32 (long n, float *x, float *y)
"addi %2, %2, 128 \n\t"
"addic. %1, %1, -32 \n\t"
"ble 2f \n\t"
"ble two%= \n\t"
".p2align 5 \n"
"1: \n\t"
".align 5 \n"
"one%=: \n\t"
"stxvd2x 40, 0, %3 \n\t"
"stxvd2x 41, %5, %3 \n\t"
@ -77,9 +77,9 @@ static void scopy_kernel_32 (long n, float *x, float *y)
"addi %2, %2, 128 \n\t"
"addic. %1, %1, -32 \n\t"
"bgt 1b \n"
"bgt one%= \n"
"2: \n\t"
"two%=: \n\t"
"stxvd2x 40, 0, %3 \n\t"
"stxvd2x 41, %5, %3 \n\t"

View File

@ -78,10 +78,10 @@ static float sdot_kernel_16 (long n, float *x, float *y)
"addi %3, %3, 128 \n\t"
"addic. %1, %1, -32 \n\t"
"ble 2f \n\t"
"ble two%= \n\t"
".p2align 5 \n"
"1: \n\t"
".align 5 \n"
"one%=: \n\t"
"xvmaddasp 32, 40, 48 \n\t"
"lxvd2x 40, 0, %2 \n\t"
@ -112,9 +112,9 @@ static float sdot_kernel_16 (long n, float *x, float *y)
"addi %3, %3, 128 \n\t"
"addic. %1, %1, -32 \n\t"
"bgt 1b \n"
"bgt one%= \n"
"2: \n\t"
"two%=: \n\t"
"xvmaddasp 32, 40, 48 \n\t"
"xvmaddasp 33, 41, 49 \n\t"

File diff suppressed because it is too large

View File

@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* Macros for N=4 and M=16
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x16', `
#else
.macro COPY_4x16
#endif
lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0
@ -88,13 +92,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs46, o32, T1
stxvw4x vs47, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=4 and M=8
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x8', `
#else
.macro COPY_4x8
#endif
lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0
@ -124,13 +136,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs38, o32, T1
stxvw4x vs39, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=4 and M=4
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x4', `
#else
.macro COPY_4x4
#endif
lxvw4x vs32, o0, A0
@ -150,13 +170,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=4 and M=2
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x2', `
#else
.macro COPY_4x2
#endif
lxsspx vs32, o0, A0
lxsspx vs33, o4, A0
@ -190,13 +218,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs38, o0, T1
stxsspx vs39, o4, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=4 and M=1
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x1', `
#else
.macro COPY_4x1
#endif
lxsspx vs32, o0, A0
@ -218,13 +254,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs35, o4, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=16
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x16', `
#else
.macro COPY_2x16
#endif
lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0
@ -250,13 +294,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs38, o32, T1
stxvw4x vs39, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=8
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x8', `
#else
.macro COPY_2x8
#endif
lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0
@ -272,13 +324,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs34, o32, T1
stxvw4x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=4
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x4', `
#else
.macro COPY_2x4
#endif
lxvw4x vs32, o0, A0
@ -290,13 +350,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs33, o16, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=2
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x2', `
#else
.macro COPY_2x2
#endif
lxsspx vs32, o0, A0
lxsspx vs33, o4, A0
@ -314,13 +382,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs34, o0, T1
stxsspx vs35, o4, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=1
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x1', `
#else
.macro COPY_2x1
#endif
lxsspx vs32, o0, A0
@ -332,13 +408,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs33, o4, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=16
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x16', `
#else
.macro COPY_1x16
#endif
lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0
@ -352,13 +436,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs34, o32, T1
stxvw4x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=8
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x8', `
#else
.macro COPY_1x8
#endif
lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0
@ -368,13 +460,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs32, o0, T1
stxvw4x vs33, o16, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=4
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x4', `
#else
.macro COPY_1x4
#endif
lxvw4x vs32, o0, A0
@ -382,13 +482,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs32, o0, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=2
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x2', `
#else
.macro COPY_1x2
#endif
lxsspx vs32, o0, A0
lxsspx vs33, o4, A0
@ -398,13 +506,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs32, o0, T1
stxsspx vs33, o4, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=1
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x1', `
#else
.macro COPY_1x1
#endif
lxsspx vs32, o0, A0
@ -412,5 +528,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs32, o0, T1
#if defined(_AIX)
')
#else
.endm
#endif
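The wrapper repeated around every COPY_* macro above is what lets one source serve two toolchains: outside AIX the body is an ordinary GNU as .macro/.endm, while on AIX it becomes an m4 define() that a separate m4 pass expands before the AIX assembler sees the file. A stripped-down sketch of the pattern with a hypothetical STORE_PAIR macro (not part of this commit; register and offset names follow the surrounding code):

#if defined(_AIX)
define(`STORE_PAIR', `
#else
.macro STORE_PAIR
#endif

	stxvw4x		vs32,	o0,	T1
	stxvw4x		vs33,	o16,	T1

#if defined(_AIX)
')
#else
.endm
#endif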

View File

@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* Macros for N=4 and M=8
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x8', `
#else
.macro COPY_4x8
#endif
lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0
@ -68,13 +72,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs38, o32, T1
stxvw4x vs39, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=4 and M=4
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x4', `
#else
.macro COPY_4x4
#endif
lxvw4x vs32, o0, A0
@ -94,13 +106,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=4 and M=2
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x2', `
#else
.macro COPY_4x2
#endif
lxsspx vs32, o0, A0
lxsspx vs33, o4, A0
@ -134,13 +154,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs38, o0, T1
stxsspx vs39, o4, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=4 and M=1
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x1', `
#else
.macro COPY_4x1
#endif
lxsspx vs32, o0, A0
@ -162,13 +190,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs35, o4, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=8
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x8', `
#else
.macro COPY_2x8
#endif
lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0
@ -184,13 +220,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs34, o32, T1
stxvw4x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=4
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x4', `
#else
.macro COPY_2x4
#endif
lxvw4x vs32, o0, A0
@ -202,13 +246,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs33, o16, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=2
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x2', `
#else
.macro COPY_2x2
#endif
lxsspx vs32, o0, A0
lxsspx vs33, o4, A0
@ -226,13 +278,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs34, o0, T1
stxsspx vs35, o4, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=1
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x1', `
#else
.macro COPY_2x1
#endif
lxsspx vs32, o0, A0
@ -244,13 +304,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs33, o4, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=8
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x8', `
#else
.macro COPY_1x8
#endif
lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0
@ -260,13 +328,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs32, o0, T1
stxvw4x vs33, o16, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=4
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x4', `
#else
.macro COPY_1x4
#endif
lxvw4x vs32, o0, A0
@ -274,13 +350,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs32, o0, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=2
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x2', `
#else
.macro COPY_1x2
#endif
lxsspx vs32, o0, A0
lxsspx vs33, o4, A0
@ -290,13 +374,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs32, o0, T1
stxsspx vs33, o4, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=1
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x1', `
#else
.macro COPY_1x1
#endif
lxsspx vs32, o0, A0
@ -304,5 +396,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs32, o0, T1
#if defined(_AIX)
')
#else
.endm
#endif

View File

@ -71,10 +71,10 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
"addi %4, %4, 64 \n\t"
"addic. %2, %2, -16 \n\t"
"ble 2f \n\t"
"ble two%= \n\t"
".p2align 5 \n"
"1: \n\t"
".align 5 \n"
"one%=: \n\t"
"xvmulsp 40, 32, 36 \n\t" // c * x
"xvmulsp 41, 33, 36 \n\t"
@ -138,9 +138,9 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
"addi %4, %4, 128 \n\t"
"addic. %2, %2, -16 \n\t"
"bgt 1b \n"
"bgt one%= \n"
"2: \n\t"
"two%=: \n\t"
"xvmulsp 40, 32, 36 \n\t" // c * x
"xvmulsp 41, 33, 36 \n\t"

View File

@ -56,10 +56,10 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
"addi %2, %2, 128 \n\t"
"addic. %1, %1, -32 \n\t"
"ble 2f \n\t"
"ble two%= \n\t"
".p2align 5 \n"
"1: \n\t"
".align 5 \n"
"one%=: \n\t"
"xvmulsp 40, 32, %x3 \n\t"
"xvmulsp 41, 33, %x3 \n\t"
@ -92,9 +92,9 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
"addi %2, %2, 256 \n\t"
"addic. %1, %1, -32 \n\t"
"bgt 1b \n"
"bgt one%= \n"
"2: \n\t"
"two%=: \n\t"
"xvmulsp 40, 32, %x3 \n\t"
"xvmulsp 41, 33, %x3 \n\t"
@ -147,8 +147,8 @@ static void sscal_kernel_16_zero (long n, float *x)
(
"xxlxor %x3, %x3, %x3 \n\t"
".p2align 5 \n"
"1: \n\t"
".align 5 \n"
"one%=: \n\t"
"stxvd2x %x3, 0, %2 \n\t"
"stxvd2x %x3, %4, %2 \n\t"
@ -162,7 +162,7 @@ static void sscal_kernel_16_zero (long n, float *x)
"addi %2, %2, 128 \n\t"
"addic. %1, %1, -32 \n\t"
"bgt 1b \n"
"bgt one%= \n"
"#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
:

View File

@ -39,8 +39,8 @@ static void sswap_kernel_32 (long n, float *x, float *y)
{
__asm__
(
".p2align 5 \n"
"1: \n\t"
".align 5 \n"
"one%=: \n\t"
"lxvd2x 32, 0, %4 \n\t"
"lxvd2x 33, %5, %4 \n\t"
@ -83,7 +83,7 @@ static void sswap_kernel_32 (long n, float *x, float *y)
"addi %4, %4, 128 \n\t"
"addic. %2, %2, -32 \n\t"
"bgt 1b \n"
"bgt one%= \n"
"#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
:

File diff suppressed because it is too large

View File

@ -68,10 +68,10 @@ static double zasum_kernel_8 (long n, double *x)
"addi %2, %2, 128 \n\t"
"addic. %1, %1, -8 \n\t"
"ble 2f \n\t"
"ble two%= \n\t"
".p2align 5 \n"
"1: \n\t"
".align 5 \n"
"one%=: \n\t"
"xvabsdp 48, 40 \n\t"
"xvabsdp 49, 41 \n\t"
@ -108,9 +108,9 @@ static double zasum_kernel_8 (long n, double *x)
"xvadddp 38, 38, %x5 \n\t"
"xvadddp 39, 39, %x6 \n\t"
"bgt 1b \n"
"bgt one%= \n"
"2: \n\t"
"two%=: \n\t"
"xvabsdp 48, 40 \n\t"
"xvabsdp 49, 41 \n\t"
@ -140,7 +140,7 @@ static double zasum_kernel_8 (long n, double *x)
"xvadddp 32, 32, 36 \n\t"
"xxswapd 33, 32 \n\t"
XXSWAPD_S(33,32)
"xsadddp %x0, 32, 33 \n"
"#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"

View File

@ -61,8 +61,8 @@ static void zaxpy_kernel_4 (long n, double *x, double *y,
__asm__
(
"xxspltd 32, %x19, 0 \n\t" // alpha_r
"xxspltd 33, %x20, 0 \n\t" // alpha_i
XXSPLTD_S(32,%x19,0) // alpha_r
XXSPLTD_S(33,%x20,0) // alpha_i
"lxvd2x 36, 0, %21 \n\t" // mvec
@ -87,10 +87,10 @@ static void zaxpy_kernel_4 (long n, double *x, double *y,
"lxvd2x 50, %23, %3 \n\t" // y2
"lxvd2x 51, %24, %3 \n\t" // y3
"xxswapd %x8, 40 \n\t" // exchange real and imag part
"xxswapd %x9, 41 \n\t" // exchange real and imag part
"xxswapd %x10, 42 \n\t" // exchange real and imag part
"xxswapd %x11, 43 \n\t" // exchange real and imag part
XXSWAPD_S(%x8,40) // exchange real and imag part
XXSWAPD_S(%x9,41) // exchange real and imag part
XXSWAPD_S(%x10,42) // exchange real and imag part
XXSWAPD_S(%x11,43) // exchange real and imag part
"addi %2, %2, 64 \n\t"
"addi %3, %3, 64 \n\t"
@ -105,19 +105,19 @@ static void zaxpy_kernel_4 (long n, double *x, double *y,
"lxvd2x %x6, %23, %3 \n\t" // y6
"lxvd2x %x7, %24, %3 \n\t" // y7
"xxswapd %x12, 44 \n\t" // exchange real and imag part
"xxswapd %x13, 45 \n\t" // exchange real and imag part
"xxswapd %x14, 46 \n\t" // exchange real and imag part
"xxswapd %x15, 47 \n\t" // exchange real and imag part
XXSWAPD_S(%x12,44) // exchange real and imag part
XXSWAPD_S(%x13,45) // exchange real and imag part
XXSWAPD_S(%x14,46) // exchange real and imag part
XXSWAPD_S(%x15,47) // exchange real and imag part
"addi %2, %2, 64 \n\t"
"addi %3, %3, 64 \n\t"
"addic. %1, %1, -8 \n\t"
"ble 2f \n\t"
"ble two%= \n\t"
".p2align 5 \n"
"1: \n\t"
".align 5 \n"
"one%=: \n\t"
"xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
"xvmaddadp 49, 41, 32 \n\t"
@ -163,31 +163,31 @@ static void zaxpy_kernel_4 (long n, double *x, double *y,
"addi %16, %16, 64 \n\t"
"xxswapd %x8, 40 \n\t" // exchange real and imag part
"xxswapd %x9, 41 \n\t" // exchange real and imag part
XXSWAPD_S(%x8,40) // exchange real and imag part
XXSWAPD_S(%x9,41) // exchange real and imag part
"lxvd2x 48, 0, %3 \n\t" // y0
"lxvd2x 49, %22, %3 \n\t" // y1
"xxswapd %x10, 42 \n\t" // exchange real and imag part
"xxswapd %x11, 43 \n\t" // exchange real and imag part
XXSWAPD_S(%x10,42) // exchange real and imag part
XXSWAPD_S(%x11,43) // exchange real and imag part
"lxvd2x 50, %23, %3 \n\t" // y2
"lxvd2x 51, %24, %3 \n\t" // y3
"xxswapd %x12, 44 \n\t" // exchange real and imag part
XXSWAPD_S(%x12,44) // exchange real and imag part
"addi %3, %3, 64 \n\t"
"xxswapd %x13, 45 \n\t" // exchange real and imag part
XXSWAPD_S(%x13,45) // exchange real and imag part
"lxvd2x %x4, 0, %3 \n\t" // y4
"lxvd2x %x5, %22, %3 \n\t" // y5
"xxswapd %x14, 46 \n\t" // exchange real and imag part
"xxswapd %x15, 47 \n\t" // exchange real and imag part
XXSWAPD_S(%x14,46) // exchange real and imag part
XXSWAPD_S(%x15,47) // exchange real and imag part
"lxvd2x %x6, %23, %3 \n\t" // y6
"lxvd2x %x7, %24, %3 \n\t" // y7
"addi %3, %3, 64 \n\t"
"addic. %1, %1, -8 \n\t"
"bgt 1b \n"
"bgt one%= \n"
"2: \n\t"
"two%=: \n\t"
"xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
"xvmaddadp 49, 41, 32 \n\t"

View File

@ -62,10 +62,10 @@ static void zcopy_kernel_16 (long n, FLOAT *x, FLOAT *y)
"addi %2, %2, 128 \n\t"
"addic. %1, %1, -16 \n\t"
"ble 2f \n\t"
"ble two%= \n\t"
".p2align 5 \n"
"1: \n\t"
".align 5 \n"
"one%=: \n\t"
"stxvd2x 32, 0, %3 \n\t"
"stxvd2x 33, %5, %3 \n\t"
@ -108,9 +108,9 @@ static void zcopy_kernel_16 (long n, FLOAT *x, FLOAT *y)
"addi %2, %2, 128 \n\t"
"addic. %1, %1, -16 \n\t"
"bgt 1b \n"
"bgt one%= \n"
"2: \n\t"
"two%=: \n\t"
"stxvd2x 32, 0, %3 \n\t"
"stxvd2x 33, %5, %3 \n\t"

View File

@ -60,10 +60,10 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot)
"lxvd2x 43, %9, %2 \n\t" // x3_r, x3_i
"lxvd2x 51, %9, %3 \n\t" // y3_r, y3_i
"xxswapd 0, 48 \n\t" // y0_i, y0_r
"xxswapd 1, 49 \n\t" // y1_i, y1_r
"xxswapd 2, 50 \n\t" // y2_i, y2_r
"xxswapd 3, 51 \n\t" // y3_i, y3_r
XXSWAPD_S(0,48) // y0_i, y0_r
XXSWAPD_S(1,49) // y1_i, y1_r
XXSWAPD_S(2,50) // y2_i, y2_r
XXSWAPD_S(3,51) // y3_i, y3_r
"addi %2, %2, 64 \n\t"
"addi %3, %3, 64 \n\t"
@ -77,19 +77,19 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot)
"lxvd2x 47, %9, %2 \n\t" // x3_r, x3_i
"lxvd2x 7, %9, %3 \n\t" // y3_r, y3_i
"xxswapd 8, 4 \n\t" // y0_i, y0_r
"xxswapd 9, 5 \n\t" // y1_i, y1_r
"xxswapd 10, 6 \n\t" // y2_i, y2_r
"xxswapd 11, 7 \n\t" // y3_i, y3_r
XXSWAPD_S(8,4) // y0_i, y0_r
XXSWAPD_S(9,5) // y1_i, y1_r
XXSWAPD_S(10,6) // y2_i, y2_r
XXSWAPD_S(11,7) // y3_i, y3_r
"addi %2, %2, 64 \n\t"
"addi %3, %3, 64 \n\t"
"addic. %1, %1, -8 \n\t"
"ble 2f \n\t"
"ble two%= \n\t"
".p2align 5 \n"
"1: \n\t"
".align 5 \n"
"one%=: \n\t"
"xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
"lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i
@ -111,14 +111,14 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot)
"xvmaddadp 39, 43, 3 \n\t" // x3_r * y3_i , x3_i * y3_r
"lxvd2x 43, %9, %2 \n\t" // x3_r, x3_i
"xxswapd 0,48 \n\t" // y0_i, y0_r
"xxswapd 1,49 \n\t" // y1_i, y1_r
XXSWAPD_S(0,48) // y0_i, y0_r
XXSWAPD_S(1,49) // y1_i, y1_r
"addi %2, %2, 64 \n\t"
"addi %3, %3, 64 \n\t"
"xxswapd 2,50 \n\t" // y2_i, y2_r
"xxswapd 3,51 \n\t" // y3_i, y3_r
XXSWAPD_S(2,50) // y2_i, y2_r
XXSWAPD_S(3,51) // y3_i, y3_r
"xvmaddadp 32, 44, 4 \n\t" // x0_r * y0_r , x0_i * y0_i
"lxvd2x 4, 0, %3 \n\t" // y0_r, y0_i
@ -138,19 +138,19 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot)
"xvmaddadp 39, 47, 11 \n\t" // x3_r * y3_i , x3_i * y3_r
"lxvd2x 47, %9, %2 \n\t" // x3_r, x3_i
"xxswapd 8,4 \n\t" // y0_i, y0_r
"xxswapd 9,5 \n\t" // y1_i, y1_r
XXSWAPD_S(8,4) // y0_i, y0_r
XXSWAPD_S(9,5) // y1_i, y1_r
"addi %2, %2, 64 \n\t"
"addi %3, %3, 64 \n\t"
"xxswapd 10,6 \n\t" // y2_i, y2_r
"xxswapd 11,7 \n\t" // y3_i, y3_r
XXSWAPD_S(10,6) // y2_i, y2_r
XXSWAPD_S(11,7) // y3_i, y3_r
"addic. %1, %1, -8 \n\t"
"bgt 1b \n"
"bgt one%= \n"
"2: \n\t"
"two%=: \n\t"
"xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
"xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i

File diff suppressed because it is too large

View File

@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* Macros for N=4 and M=8
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x8', `
#else
.macro COPY_4x8
#endif
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
@ -144,14 +148,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs12, o32, T1
stxvd2x vs13, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=4 and M=4
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x4', `
#else
.macro COPY_4x4
#endif
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
@ -209,14 +221,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs46, o32, T1
stxvd2x vs47, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=4 and M=2
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x2', `
#else
.macro COPY_4x2
#endif
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
@ -254,14 +274,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=4 and M=1
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x1', `
#else
.macro COPY_4x1
#endif
lxvd2x vs32, o0, A0
addi A0, A0, 16
@ -289,14 +317,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=8
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x8', `
#else
.macro COPY_2x8
#endif
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
@ -350,14 +386,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs46, o32, T1
stxvd2x vs47, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=4
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x4', `
#else
.macro COPY_2x4
#endif
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
@ -387,14 +431,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=2
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x2', `
#else
.macro COPY_2x2
#endif
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
@ -414,14 +466,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=1
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x1', `
#else
.macro COPY_2x1
#endif
lxvd2x vs32, o0, A0
addi A0, A0, 16
@ -437,14 +497,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs33, o16, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=8
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x8', `
#else
.macro COPY_1x8
#endif
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
@ -472,14 +540,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=4
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x4', `
#else
.macro COPY_1x4
#endif
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
@ -495,14 +571,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=2
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x2', `
#else
.macro COPY_1x2
#endif
lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0
@ -514,14 +598,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=1
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x1', `
#else
.macro COPY_1x1
#endif
lxvd2x vs32, o0, A0
addi A0, A0, 16
@ -531,5 +623,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs32, o0, T1
#if defined(_AIX)
')
#else
.endm
#endif

View File

@ -40,8 +40,8 @@ static void zrot_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT si
__asm__
(
"xxspltd 36, %x[cos], 0 \n\t" // load c to both dwords
"xxspltd 37, %x[sin], 0 \n\t" // load s to both dwords
XXSPLTD_S(36,%x[cos],0) // load c to both dwords
XXSPLTD_S(37,%x[sin],0) // load s to both dwords
"lxvd2x 32, 0, %[x_ptr] \n\t" // load x
"lxvd2x 33, %[i16], %[x_ptr] \n\t"
@ -57,10 +57,10 @@ static void zrot_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT si
"addi %[y_ptr], %[y_ptr], 64 \n\t"
"addic. %[temp_n], %[temp_n], -4 \n\t"
"ble 2f \n\t"
"ble two%= \n\t"
".p2align 5 \n"
"1: \n\t"
".align 5 \n"
"one%=: \n\t"
"xvmuldp 40, 32, 36 \n\t" // c * x
"xvmuldp 41, 33, 36 \n\t"
@ -124,9 +124,9 @@ static void zrot_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT si
"addi %[y_ptr], %[y_ptr], 128 \n\t"
"addic. %[temp_n], %[temp_n], -4 \n\t"
"bgt+ 1b \n"
"bgt+ one%= \n"
"2: \n\t"
"two%=: \n\t"
"xvmuldp 40, 32, 36 \n\t" // c * x
"xvmuldp 41, 33, 36 \n\t"

View File

@ -58,8 +58,8 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
"dcbt 0, %2 \n\t"
"xsnegdp 33, %x16 \n\t" // -alpha_i
"xxspltd 32, %x15, 0 \n\t" // alpha_r , alpha_r
"xxmrghd 33, 33, %x16 \n\t" // -alpha_i , alpha_i
XXSPLTD_S(32,%x15,0) // alpha_r , alpha_r
XXMRGHD_S(33,33,%x16) // -alpha_i , alpha_i
"lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
"lxvd2x 41, %17, %2 \n\t"
@ -73,10 +73,10 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
"addi %2, %2, 128 \n\t"
"addic. %1, %1, -8 \n\t"
"ble 2f \n\t"
"ble two%= \n\t"
".p2align 5 \n"
"1: \n\t"
".align 5 \n"
"one%=: \n\t"
"xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
"xvmuldp 49, 41, 32 \n\t"
@ -87,14 +87,14 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
"xvmuldp %x5, 46, 32 \n\t"
"xvmuldp %x6, 47, 32 \n\t"
"xxswapd %x7, 40 \n\t"
"xxswapd %x8, 41 \n\t"
"xxswapd %x9, 42 \n\t"
"xxswapd %x10, 43 \n\t"
"xxswapd %x11, 44 \n\t"
"xxswapd %x12, 45 \n\t"
"xxswapd %x13, 46 \n\t"
"xxswapd %x14, 47 \n\t"
XXSWAPD_S(%x7,40)
XXSWAPD_S(%x8,41)
XXSWAPD_S(%x9,42)
XXSWAPD_S(%x10,43)
XXSWAPD_S(%x11,44)
XXSWAPD_S(%x12,45)
XXSWAPD_S(%x13,46)
XXSWAPD_S(%x14,47)
"xvmuldp %x7, %x7, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
"xvmuldp %x8, %x8, 33 \n\t"
@ -147,9 +147,9 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
"addi %2, %2, 256 \n\t"
"addic. %1, %1, -8 \n\t"
"bgt 1b \n"
"bgt one%= \n"
"2: \n\t"
"two%=: \n\t"
"xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
"xvmuldp 49, 41, 32 \n\t"
@ -160,14 +160,14 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
"xvmuldp %x5, 46, 32 \n\t"
"xvmuldp %x6, 47, 32 \n\t"
"xxswapd %x7, 40 \n\t"
"xxswapd %x8, 41 \n\t"
"xxswapd %x9, 42 \n\t"
"xxswapd %x10, 43 \n\t"
"xxswapd %x11, 44 \n\t"
"xxswapd %x12, 45 \n\t"
"xxswapd %x13, 46 \n\t"
"xxswapd %x14, 47 \n\t"
XXSWAPD_S(%x7,40)
XXSWAPD_S(%x8,41)
XXSWAPD_S(%x9,42)
XXSWAPD_S(%x10,43)
XXSWAPD_S(%x11,44)
XXSWAPD_S(%x12,45)
XXSWAPD_S(%x13,46)
XXSWAPD_S(%x14,47)
"addi %2, %2, -128 \n\t"

View File

@ -40,8 +40,8 @@ zswap_kernel_16 (long n, double *x, double *y)
{
__asm__
(
".p2align 5 \n"
"1: \n\t"
".align 5 \n"
"one%=: \n\t"
"lxvd2x 32, 0, %4 \n\t"
"lxvd2x 33, %5, %4 \n\t"
"lxvd2x 34, %6, %4 \n\t"
@ -130,7 +130,7 @@ zswap_kernel_16 (long n, double *x, double *y)
"addi %4, %4, 128 \n\t"
"addic. %2, %2, -16 \n\t"
"bgt 1b \n"
"bgt one%= \n"
"#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
:

File diff suppressed because it is too large