AIX changes for Power8

Kavana Bhat 2019-08-20 06:51:35 -05:00
parent 2a43062de7
commit 3dc6b26eff
48 changed files with 9263 additions and 996 deletions

View File

@@ -39,6 +39,35 @@
#ifndef COMMON_POWER
#define COMMON_POWER
#define str(x) #x
#ifdef OS_AIX
#define XXSPLTD(T,A,z) xxpermdi T, A, A, 0b##z##z
#define XXMRGHD(T,A,B) xxpermdi T, A, B, 0b00
#define XXMRGLD(T,A,B) xxpermdi T, A, B, 0b11
#define XXSWAPD(T,A) xxpermdi T, A, A, 0b10
#define XVMOVDP(T,A) xvcpsgndp T, A, A
#define XXSPLTD_S(T,A,z) "xxpermdi " str(T) ", " str(A) ", " str(A) ", 0b" str(z ## z) " \n\t"
#define XXMRGHD_S(T,A,B) "xxpermdi " str(T) ", " str(A) ", " str(B) ", 0b00 \n\t"
#define XXMRGLD_S(T,A,B) "xxpermdi " str(T) ", " str(A) ", " str(B) ", 0b11 \n\t"
#define XXSWAPD_S(T,A) "xxpermdi " str(T) ", " str(A) ", " str(A) ", 0b10 \n\t"
#else
#define XXSPLTD(T,A,z) xxspltd T, A, z
#define XXMRGHD(T,A,B) xxmrghd T, A, B
#define XXMRGLD(T,A,B) xxmrgld T, A, B
#define XXSWAPD(T,A) xxswapd T, A
#define XVMOVDP(T,A) xvmovdp T, A
#define XXSPLTD_S(T,A,z) "xxspltd T, A, z \n\t"
#define XXMRGHD_S(T,A,B) "xxmrghd T, A, B \n\t"
#define XXMRGLD_S(T,A,B) "xxmrgld T, A, B \n\t"
#define XXSWAPD_S(T,A) "xxswapd T, A"
#endif
#if defined(POWER8) || defined(POWER9)
#define MB __asm__ __volatile__ ("eieio":::"memory")
#define WMB __asm__ __volatile__ ("eieio":::"memory")
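The `_S` variants added above expand to literal instruction strings, which is how the inline-assembly kernels later in this commit (dasum, ddot, daxpy) swap their hard-coded `xxswapd`/`xxspltd` mnemonics for `xxpermdi` forms on AIX. A minimal standalone sketch of how the stringified AIX definition expands; the test program is illustrative and not part of the patch:

#include <stdio.h>

#define str(x) #x
/* mirrors the OS_AIX definition of XXSWAPD_S above */
#define XXSWAPD_S(T,A) "xxpermdi " str(T) ", " str(A) ", " str(A) ", 0b10 \n\t"

int main(void)
{
    /* the macro call collapses to the single literal
       "xxpermdi 33, 32, 32, 0b10 \n\t", exactly the text that gets
       spliced into an __asm__ block when a kernel writes XXSWAPD_S(33,32) */
    printf("%s", XXSWAPD_S(33,32));
    return 0;
}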

View File

@@ -57,8 +57,6 @@ USE_TRMM = 1
endif
SKERNELOBJS += \
sgemm_kernel$(TSUFFIX).$(SUFFIX) \
$(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \
@@ -436,7 +434,10 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY)
- $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
+ $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmotcopy.s
m4 sgemmotcopy.s > sgemmotcopy_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@
rm sgemmotcopy.s sgemmotcopy_nomacros.s
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
@@ -444,12 +445,17 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY)
- $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
+ $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmitcopy.s
m4 sgemmitcopy.s > sgemmitcopy_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@
rm sgemmitcopy.s sgemmitcopy_nomacros.s
endif
$(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY)
- $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
+ $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_ncopy.s
m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@
rm dgemm_ncopy.s dgemm_ncopy_nomacros.s
$(KDIR)$(DGEMMOTCOPYOBJ) : $(KERNELDIR)/$(DGEMMOTCOPY)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
@@ -460,7 +466,10 @@ $(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
$(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY)
- $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
+ $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_itcopy.s
m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@
rm dgemm_itcopy.s dgemm_itcopy_nomacros.s
endif
@@ -485,10 +494,16 @@ endif
endif
$(KDIR)$(CGEMMONCOPYOBJ) : $(KERNELDIR)/$(CGEMMONCOPY)
# $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o cgemm_oncopy.s
# m4 cgemm_oncopy.s > cgemm_oncopy_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
# rm cgemm_oncopy.s cgemm_oncopy_nomacros.s
$(KDIR)$(CGEMMOTCOPYOBJ) : $(KERNELDIR)/$(CGEMMOTCOPY)
# $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o cgemm_otcopy.s
# m4 cgemm_otcopy.s > cgemm_otcopy_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
# rm cgemm_otcopy.s cgemm_otcopy_nomacros.s
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
@@ -496,7 +511,10 @@ $(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY)
- $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
+ $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -E $< -o cgemm_itcopy.s
m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@
rm cgemm_itcopy.s cgemm_itcopy_nomacros.s
endif
@@ -512,7 +530,10 @@ $(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
$(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY)
- $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
+ $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o zgemm_itcopy.s
m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@
rm zgemm_itcopy.s zgemm_itcopy_nomacros.s
endif
@@ -537,37 +558,67 @@ endif
endif
$(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND)
- $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
+ $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemm_kernel$(TSUFFIX).s
m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@
rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s
$(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND)
- $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
+ $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_kernel$(TSUFFIX).s
m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@
rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s
$(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEPEND)
$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
$(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
- $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@
+ $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNN $< -o cgemm_kernel_n.s
m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@
rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s
$(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
- $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@
+ $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCN $< -o cgemm_kernel_l.s
m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@
rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s
$(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
- $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@
+ $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s
m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s
$(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
- $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@
+ $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCC $< -o cgemm_kernel_b.s
m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@
rm cgemm_kernel_b.s cgemm_kernel_b_nomacros.s
$(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
- $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
+ $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNN $< -o zgemm_kernel_n.s
m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@
rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s
$(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
- $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@
+ $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCN $< -o zgemm_kernel_l.s
m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@
rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s
$(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
- $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@
+ $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNC $< -o zgemm_kernel_r.s
m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@
rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s
$(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
- $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@
+ $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCC $< -o zgemm_kernel_b.s
m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@
rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s
$(KDIR)xgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@
@@ -584,28 +635,56 @@ $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD
ifdef USE_TRMM
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
- $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
+ $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o strmmkernel_ln.s
m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@
rm strmmkernel_ln.s strmmkernel_ln_nomacros.s
$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
- $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
+ $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o strmmkernel_lt.s
m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@
rm strmmkernel_lt.s strmmkernel_lt_nomacros.s
$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
- $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
+ $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o strmmkernel_rn.s
m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@
rm strmmkernel_rn.s strmmkernel_rn_nomacros.s
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
- $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
+ $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s
m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
- $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
+ $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o dtrmm_kernel_ln.s
# $(CC) $(CFLAGS) -E $< -o dtrmm_kernel_ln.s
m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@
rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s
$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
- $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
+ $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o dtrmm_kernel_lt.s
# $(CC) $(CFLAGS) -E $< -o dtrmm_kernel_lt.s
m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@
rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s
$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
- $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
+ $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o dtrmm_kernel_rn.s
# $(CC) $(CFLAGS) -E $< -o dtrmm_kernel_rn.s
m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@
rm dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s
$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
- $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
+ $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o dtrmm_kernel_rt.s
# $(CC) $(CFLAGS) -E $< -o dtrmm_kernel_rt.s
m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@
rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s
$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
@@ -620,52 +699,100 @@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
- $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
+ $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_ln.s
m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@
rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s
$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
- $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
+ $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_lt.s
m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@
rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s
$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
- $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
+ $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lr.s
m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ctrmm_kernel_lr_nomacros.s -o $@
rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s
$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
- $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
+ $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lc.s
m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@
rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s
$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
- $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
+ $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rn.s
m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@
rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s
$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
- $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
+ $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rt.s
m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@
rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s
$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
- $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
+ $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ctrmm_kernel_rr.s
m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@
rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s
$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
- $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
+ $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ctrmm_kernel_RC.s
m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@
rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s
$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
- $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
+ $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_ln.s
m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@
rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s
$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
- $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
+ $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_lt.s
m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@
rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s
$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
- $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
+ $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lr.s
m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@
rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s
$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
- $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
+ $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lc.s
m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@
rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s
$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
- $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
+ $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rn.s
m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@
rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s
$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
- $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
+ $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rt.s
m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@
rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s
$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
- $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
+ $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rr.s
m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@
rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
- $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
+ $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rc.s
m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@
rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s
else
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
@@ -677,7 +804,10 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
- $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
+ $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s
m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
@@ -801,10 +931,16 @@ $(KDIR)strsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RT) $(ST
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@
$(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DTRSMDEPEND)
# $(CC) $(CFLAGS) -E $< -o dtrsm_kernel_ln.s
# m4 dtrsm_kernel_ln.s > dtrsm_kernel_ln_nomacros.s
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@
# rm dtrsm_kernel_ln.s dtrsm_kernel_ln_nomacros.s
$(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND)
- $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@
+ $(CC) $(CFLAGS) -E -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o dtrsm_kernel_lt.s
m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@
rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s
$(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND)
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@
@@ -1940,7 +2076,7 @@ $(SGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMITCOPY)
endif
$(DGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMONCOPY)
$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
$(DGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMOTCOPY)
@@ -2044,7 +2180,10 @@ $(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMM
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@
$(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
- $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@
+ $(CC) $(PFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s
m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s
$(KDIR)cgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@
@@ -2083,7 +2222,10 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
- $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
+ $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s
m4 strmmkernel_rn.s > strmm_kernel_rt_nomacros.s
$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
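Each rule changed in this Makefile follows the same three-step shape: stop after the C preprocessor (-E), pipe the result through m4 so that any define(...) macros (the AIX replacement for GNU as .macro blocks, used by the copy kernels below) are expanded into plain assembly, then assemble the expanded file and remove the temporaries. A schematic rule showing the pattern; foo_kernel and FOOKERNEL are placeholder names, not targets from this Makefile:

$(KDIR)foo_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(FOOKERNEL)
	$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o foo_kernel.s
	m4 foo_kernel.s > foo_kernel_nomacros.s
	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX foo_kernel_nomacros.s -o $@
	rm foo_kernel.s foo_kernel_nomacros.s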

View File

@@ -68,10 +68,10 @@ static float casum_kernel_16 (long n, float *x)
"addi %2, %2, 128 \n\t"
"addic. %1, %1, -16 \n\t"
- "ble 2f \n\t"
+ "ble two%= \n\t"
- ".p2align 5 \n"
+ ".align 5 \n"
- "1: \n\t"
+ "one%=: \n\t"
"xvabssp 48, 40 \n\t"
"xvabssp 49, 41 \n\t"
@@ -108,9 +108,9 @@ static float casum_kernel_16 (long n, float *x)
"xvaddsp 38, 38, %x5 \n\t"
"xvaddsp 39, 39, %x6 \n\t"
- "bgt 1b \n"
+ "bgt one%= \n"
- "2: \n\t"
+ "two%=: \n\t"
"xvabssp 48, 40 \n\t"
"xvabssp 49, 41 \n\t"

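The casum hunks above, and the copy/rot/swap/dot kernels below, make the same two adjustments to the inline assembly: `.p2align` becomes plain `.align` (.p2align is a GNU as spelling; the AIX assembler's .align already takes a power-of-two exponent), and GNU-style numeric local labels (`1:`, `bgt 1b`, `ble 2f`) become named labels suffixed with `%=`, which GCC-style extended asm replaces with a number unique to each asm instance, so repeated inlining still yields distinct symbols. A minimal sketch of the label idiom, assuming a Power compiler with GCC-style extended asm; the countdown loop is hypothetical, not taken from a kernel:

static long countdown(long n)
{
    __asm__
    (
        "one%=:              \n\t"
        "addi  %0, %0, -1    \n\t"
        "cmpdi %0, 0         \n\t"
        "bgt   one%=         \n"
        : "+r" (n)
        :
        : "cr0"
    );
    return n;   /* reaches 0 for any starting n >= 1 */
}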
View File

@@ -62,10 +62,10 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
"addi %2, %2, 128 \n\t"
"addic. %1, %1, -32 \n\t"
- "ble 2f \n\t"
+ "ble two%= \n\t"
- ".p2align 5 \n"
+ ".align 5 \n"
- "1: \n\t"
+ "one%=: \n\t"
"stxvd2x 32, 0, %3 \n\t"
"stxvd2x 33, %5, %3 \n\t"
@@ -108,9 +108,9 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
"addi %2, %2, 128 \n\t"
"addic. %1, %1, -32 \n\t"
- "bgt 1b \n"
+ "bgt one%= \n"
- "2: \n\t"
+ "two%=: \n\t"
"stxvd2x 32, 0, %3 \n\t"
"stxvd2x 33, %5, %3 \n\t"

File diff suppressed because it is too large

View File

@@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* Macros for N=4 and M=8
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x8', `
#else
.macro COPY_4x8
#endif
lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0
@@ -93,13 +97,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs46, o32, T1
stxvw4x vs47, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=4 and M=4
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x4', `
#else
.macro COPY_4x4
#endif
lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0
@@ -133,13 +145,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs38, o32, T1
stxvw4x vs39, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=4 and M=2
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x2', `
#else
.macro COPY_4x2
#endif
lxvw4x vs32, o0, A0
addi A0, A0, 16
@@ -163,13 +183,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=4 and M=1
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x1', `
#else
.macro COPY_4x1
#endif
lxsspx vs32, o0, A0
lxsspx vs33, o4, A0
@@ -207,13 +235,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs38, o0, T1
stxsspx vs39, o4, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=8
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x8', `
#else
.macro COPY_2x8
#endif
lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0
@@ -241,13 +277,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs38, o32, T1
stxvw4x vs39, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=4
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x4', `
#else
.macro COPY_2x4
#endif
lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0
@@ -265,13 +309,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs34, o32, T1
stxvw4x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=2
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x2', `
#else
.macro COPY_2x2
#endif
lxvw4x vs32, o0, A0
addi A0, A0, 16
@@ -285,13 +337,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs33, o16, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=1
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x1', `
#else
.macro COPY_2x1
#endif
lxsspx vs32, o0, A0
lxsspx vs33, o4, A0
@@ -311,13 +371,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs34, o0, T1
stxsspx vs35, o4, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=8
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x8', `
#else
.macro COPY_1x8
#endif
lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0
@@ -332,13 +400,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs34, o32, T1
stxvw4x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=4
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x4', `
#else
.macro COPY_1x4
#endif
lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0
@@ -349,13 +425,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs32, o0, T1
stxvw4x vs33, o16, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=2
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x2', `
#else
.macro COPY_1x2
#endif
lxvw4x vs32, o0, A0
addi A0, A0, 16
@@ -364,13 +448,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs32, o0, T1
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=1
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x1', `
#else
.macro COPY_1x1
#endif
lxsspx vs32, o0, A0
lxsspx vs33, o4, A0
@@ -381,5 +473,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs32, o0, T1
stxsspx vs33, o4, T1
#if defined(_AIX)
')
#else
.endm
#endif
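Because the Makefile rules earlier in this commit now preprocess these sources and run them through m4 before assembling, each GNU as macro in this file is wrapped so that an AIX build (where _AIX is defined) sees an m4 define(`NAME', `...') while every other build keeps the original .macro/.endm pair; call sites are untouched and are expanded by whichever tool is active. A reduced sketch of the wrapping with a made-up one-load/one-store body; COPY_EXAMPLE is illustrative, while A0, T1, o0 and vs32 are the register aliases these kernels already use:

#if defined(_AIX)
define(`COPY_EXAMPLE', `
#else
.macro COPY_EXAMPLE
#endif

	lxvw4x		vs32, o0, A0
	stxvw4x		vs32, o0, T1

#if defined(_AIX)
')
#else
.endm
#endif

A kernel then simply writes COPY_EXAMPLE at the point of use; on AIX m4 substitutes the quoted body, elsewhere GNU as expands the macro.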

View File

@@ -56,9 +56,9 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s)
"addi %[x_ptr], %[x_ptr], 64 \n\t"
"addi %[y_ptr], %[y_ptr], 64 \n\t"
"addic. %[temp_n], %[temp_n], -8 \n\t"
- "ble 2f \n\t"
+ "ble two%= \n\t"
- ".p2align 5 \n\t"
+ ".align 5 \n\t"
- "1: \n\t"
+ "one%=: \n\t"
"xvmulsp 40, 32, 36 \n\t" // c * x
"xvmulsp 41, 33, 36 \n\t"
"xvmulsp 42, 34, 36 \n\t"
@@ -104,8 +104,8 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s)
"addi %[x_ptr], %[x_ptr], 128 \n\t"
"addi %[y_ptr], %[y_ptr], 128 \n\t"
"addic. %[temp_n], %[temp_n], -8 \n\t"
- "bgt 1b \n\t"
+ "bgt one%= \n\t"
- "2: \n\t"
+ "two%=: \n\t"
"xvmulsp 40, 32, 36 \n\t" // c * x
"xvmulsp 41, 33, 36 \n\t"
"xvmulsp 42, 34, 36 \n\t"

View File

@@ -39,8 +39,8 @@ static void cswap_kernel_32 (long n, float *x, float *y)
{
__asm__
(
- ".p2align 5 \n"
+ ".align 5 \n"
- "1: \n\t"
+ "one%=: \n\t"
"lxvd2x 32, 0, %4 \n\t"
"lxvd2x 33, %5, %4 \n\t"
@@ -131,7 +131,7 @@ static void cswap_kernel_32 (long n, float *x, float *y)
"addi %4, %4, 128 \n\t"
"addic. %2, %2, -32 \n\t"
- "bgt 1b \n"
+ "bgt one%= \n"
"#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
:

File diff suppressed because it is too large

View File

@@ -68,10 +68,10 @@ static double dasum_kernel_16 (long n, double *x)
"addi %2, %2, 128 \n\t"
"addic. %1, %1, -16 \n\t"
- "ble 2f \n\t"
+ "ble two%= \n\t"
- ".p2align 5 \n"
+ ".align 5 \n"
- "1: \n\t"
+ "one%=: \n\t"
"xvabsdp 48, 40 \n\t"
"xvabsdp 49, 41 \n\t"
@@ -108,9 +108,9 @@ static double dasum_kernel_16 (long n, double *x)
"xvadddp 38, 38, %x5 \n\t"
"xvadddp 39, 39, %x6 \n\t"
- "bgt 1b \n"
+ "bgt one%= \n"
- "2: \n\t"
+ "two%=: \n\t"
"xvabsdp 48, 40 \n\t"
"xvabsdp 49, 41 \n\t"
@@ -140,7 +140,7 @@ static double dasum_kernel_16 (long n, double *x)
"xvadddp 32, 32, 36 \n\t"
- "xxswapd 33, 32 \n\t"
+ XXSWAPD_S(33,32)
"xsadddp %x0, 32, 33 \n"
"#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"

View File

@@ -58,7 +58,7 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha)
__asm__
(
- "xxspltd %x4, %x22, 0 \n\t"
+ XXSPLTD_S(%x4,%x22,0)
"dcbt 0, %2 \n\t"
"dcbt 0, %3 \n\t"
@@ -90,10 +90,10 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha)
"addi %3, %3, -64 \n\t"
"addic. %1, %1, -16 \n\t"
- "ble 2f \n\t"
+ "ble two%= \n\t"
".align 5 \n"
- "1: \n\t"
+ "one%=: \n\t"
"xvmaddadp %x13, %x5, %x4 \n\t"
"xvmaddadp %x14, %x6, %x4 \n\t"
@@ -152,9 +152,9 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha)
"addi %3, %3, -64 \n\t"
"addic. %1, %1, -16 \n\t"
- "bgt 1b \n"
+ "bgt one%= \n"
- "2: \n\t"
+ "two%=: \n\t"
"xvmaddadp %x13, %x5, %x4 \n\t"
"xvmaddadp %x14, %x6, %x4 \n\t"

View File

@@ -62,10 +62,10 @@ static void dcopy_kernel_32 (long n, double *x, double *y)
"addi %2, %2, 128 \n\t"
"addic. %1, %1, -32 \n\t"
- "ble 2f \n\t"
+ "ble two%= \n\t"
- ".p2align 5 \n"
+ ".align 5 \n"
- "1: \n\t"
+ "one%=: \n\t"
"stxvd2x 32, 0, %3 \n\t"
"stxvd2x 33, %5, %3 \n\t"
@@ -108,9 +108,9 @@ static void dcopy_kernel_32 (long n, double *x, double *y)
"addi %2, %2, 128 \n\t"
"addic. %1, %1, -32 \n\t"
- "bgt 1b \n"
+ "bgt one%= \n"
- "2: \n\t"
+ "two%=: \n\t"
"stxvd2x 32, 0, %3 \n\t"
"stxvd2x 33, %5, %3 \n\t"

View File

@@ -78,10 +78,10 @@ static double ddot_kernel_8 (long n, double *x, double *y)
"addi %3, %3, 128 \n\t"
"addic. %1, %1, -16 \n\t"
- "ble 2f \n\t"
+ "ble two%= \n\t"
- ".p2align 5 \n"
+ ".align 5 \n"
- "1: \n\t"
+ "one%=: \n\t"
"xvmaddadp 32, 40, 48 \n\t"
"lxvd2x 40, 0, %2 \n\t"
@@ -112,9 +112,9 @@ static double ddot_kernel_8 (long n, double *x, double *y)
"addi %3, %3, 128 \n\t"
"addic. %1, %1, -16 \n\t"
- "bgt 1b \n"
+ "bgt one%= \n"
- "2: \n\t"
+ "two%=: \n\t"
"xvmaddadp 32, 40, 48 \n\t"
"xvmaddadp 33, 41, 49 \n\t"
@@ -135,7 +135,7 @@ static double ddot_kernel_8 (long n, double *x, double *y)
"xvadddp 32, 32, 36 \n\t"
- "xxswapd 33, 32 \n\t"
+ XXSWAPD_S(33,32)
"xsadddp %x0, 32, 33 \n"

File diff suppressed because it is too large

View File

@@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* Macros for N=4 and M=16
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x16', `
#else
.macro COPY_4x16
#endif
lxvd2x vs0, o0, A0
lxvd2x vs1, o0, A1
@@ -180,14 +184,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 128
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=4 and M=8
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x8', `
#else
.macro COPY_4x8
#endif
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
@@ -259,14 +271,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 128
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=4 and M=4
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x4', `
#else
.macro COPY_4x4
#endif
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
@@ -310,14 +330,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 128
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=4 and M=2
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x2', `
#else
.macro COPY_4x2
#endif
lxvd2x vs0, o0, A0
addi A0, A0, 16
@@ -348,14 +376,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 64
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=4 and M=1
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x1', `
#else
.macro COPY_4x1
#endif
lxsdx vs0, o0, A0
addi A0, A0, 8
@@ -382,14 +418,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 32
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=16
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x16', `
#else
.macro COPY_2x16
#endif
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
@@ -459,14 +503,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 128
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=8
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x8', `
#else
.macro COPY_2x8
#endif
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
@@ -506,14 +558,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 128
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=4
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x4', `
#else
.macro COPY_2x4
#endif
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
@@ -539,14 +599,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 64
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=2
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x2', `
#else
.macro COPY_2x2
#endif
lxvd2x vs0, o0, A0
addi A0, A0, 16
@@ -565,14 +633,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 32
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=1
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x1', `
#else
.macro COPY_2x1
#endif
lxsdx vs0, o0, A0
addi A0, A0, 8
@@ -589,14 +665,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 16
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=16
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x16', `
#else
.macro COPY_1x16
#endif
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
@@ -622,14 +706,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 64
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=8
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x8', `
#else
.macro COPY_1x8
#endif
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
@@ -645,14 +737,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 64
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=4
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x4', `
#else
.macro COPY_1x4
#endif
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
@@ -664,14 +764,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 32
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=2
**********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x2', `
#else
.macro COPY_1x2
#endif
lxvd2x vs0, o0, A0
addi A0, A0, 16
@ -681,14 +789,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 16 addi BO, BO, 16
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=1 * Macros for N=1 and M=1
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x1', `
#else
.macro COPY_1x1 .macro COPY_1x1
#endif
lxsdx vs0, o0, A0 lxsdx vs0, o0, A0
addi A0, A0, 8 addi A0, A0, 8
@ -698,5 +814,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 8 addi BO, BO, 8
#if defined(_AIX)
')
#else
.endm .endm
#endif

View File

@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* Macros for N=4 and M=16 * Macros for N=4 and M=16
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x16', `
#else
.macro COPY_4x16 .macro COPY_4x16
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -140,14 +144,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs10, o32, T1 stxvd2x vs10, o32, T1
stxvd2x vs11, o48, T1 stxvd2x vs11, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=8 * Macros for N=4 and M=8
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x8', `
#else
.macro COPY_4x8 .macro COPY_4x8
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -205,14 +217,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs46, o32, T1 stxvd2x vs46, o32, T1
stxvd2x vs47, o48, T1 stxvd2x vs47, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=4 * Macros for N=4 and M=4
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x4', `
#else
.macro COPY_4x4 .macro COPY_4x4
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -250,14 +270,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs38, o32, T1 stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1 stxvd2x vs39, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=2 * Macros for N=4 and M=2
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x2', `
#else
.macro COPY_4x2 .macro COPY_4x2
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
addi A0, A0, 16 addi A0, A0, 16
@ -285,14 +313,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs35, o48, T1 stxvd2x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=1 * Macros for N=4 and M=1
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x1', `
#else
.macro COPY_4x1 .macro COPY_4x1
#endif
lxsdx vs32, o0, A0 lxsdx vs32, o0, A0
addi A0, A0, 8 addi A0, A0, 8
@ -322,14 +358,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsdx vs35, o8, T1 stxsdx vs35, o8, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=16 * Macros for N=2 and M=16
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x16', `
#else
.macro COPY_2x16 .macro COPY_2x16
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -383,14 +427,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs46, o32, T1 stxvd2x vs46, o32, T1
stxvd2x vs47, o48, T1 stxvd2x vs47, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=8 * Macros for N=2 and M=8
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x8', `
#else
.macro COPY_2x8 .macro COPY_2x8
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -420,14 +472,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs38, o32, T1 stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1 stxvd2x vs39, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=4 * Macros for N=2 and M=4
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x4', `
#else
.macro COPY_2x4 .macro COPY_2x4
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -447,14 +507,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs34, o32, T1 stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1 stxvd2x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=2 * Macros for N=2 and M=2
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x2', `
#else
.macro COPY_2x2 .macro COPY_2x2
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
addi A0, A0, 16 addi A0, A0, 16
@ -470,14 +538,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs33, o16, T1 stxvd2x vs33, o16, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=1 * Macros for N=2 and M=1
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x1', `
#else
.macro COPY_2x1 .macro COPY_2x1
#endif
lxsdx vs32, o0, A0 lxsdx vs32, o0, A0
addi A0, A0, 8 addi A0, A0, 8
@ -493,14 +569,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsdx vs33, o8, T1 stxsdx vs33, o8, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=16 * Macros for N=1 and M=16
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x16', `
#else
.macro COPY_1x16 .macro COPY_1x16
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -528,14 +612,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs38, o32, T1 stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1 stxvd2x vs39, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=8 * Macros for N=1 and M=8
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x8', `
#else
.macro COPY_1x8 .macro COPY_1x8
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -551,14 +643,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs34, o32, T1 stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1 stxvd2x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=4 * Macros for N=1 and M=4
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x4', `
#else
.macro COPY_1x4 .macro COPY_1x4
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -570,14 +670,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs32, o0, T1 stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1 stxvd2x vs33, o16, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=2 * Macros for N=1 and M=2
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x2', `
#else
.macro COPY_1x2 .macro COPY_1x2
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
addi A0, A0, 16 addi A0, A0, 16
@ -587,14 +695,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs32, o0, T1 stxvd2x vs32, o0, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=1 * Macros for N=1 and M=1
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x1', `
#else
.macro COPY_1x1 .macro COPY_1x1
#endif
lxsdx vs32, o0, A0 lxsdx vs32, o0, A0
addi A0, A0, 8 addi A0, A0, 8
@ -604,5 +720,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsdx vs32, o0, T1 stxsdx vs32, o0, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif

View File

@ -46,7 +46,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
( (
"lxvd2x 34, 0, %10 \n\t" // x0, x1 "lxvd2x 34, 0, %10 \n\t" // x0, x1
"lxvd2x 35, %11, %10 \n\t" // x2, x3 "lxvd2x 35, %11, %10 \n\t" // x2, x3
"xxspltd 32, %x9, 0 \n\t" // alpha, alpha XXSPLTD_S(32,%x9,0) // alpha, alpha
"sldi %6, %13, 3 \n\t" // lda * sizeof (double) "sldi %6, %13, 3 \n\t" // lda * sizeof (double)
@ -56,10 +56,10 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
"add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda "add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda
"add %6, %6, %6 \n\t" // 2 * lda "add %6, %6, %6 \n\t" // 2 * lda
"xxspltd 32, 34, 0 \n\t" // x0 * alpha, x0 * alpha XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha
"xxspltd 33, 34, 1 \n\t" // x1 * alpha, x1 * alpha XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha
"xxspltd 34, 35, 0 \n\t" // x2 * alpha, x2 * alpha XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha
"xxspltd 35, 35, 1 \n\t" // x3 * alpha, x3 * alpha XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha
"add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda "add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda
"add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda "add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda
@ -89,10 +89,10 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
"addi %6, %6, 32 \n\t" "addi %6, %6, 32 \n\t"
"addic. %1, %1, -4 \n\t" "addic. %1, %1, -4 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"lxvd2x 36, 0, %2 \n\t" // y0, y1 "lxvd2x 36, 0, %2 \n\t" // y0, y1
"lxvd2x 37, %11, %2 \n\t" // y2, y3 "lxvd2x 37, %11, %2 \n\t" // y2, y3
@ -131,7 +131,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
"addi %2, %2, 32 \n\t" "addi %2, %2, 32 \n\t"
"addic. %1, %1, -4 \n\t" "addic. %1, %1, -4 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
"lxvd2x 36, 0, %2 \n\t" // y0, y1 "lxvd2x 36, 0, %2 \n\t" // y0, y1
@ -171,7 +171,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
"addi %2, %2, 32 \n\t" "addi %2, %2, 32 \n\t"
"addic. %1, %1, -4 \n\t" "addic. %1, %1, -4 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
"lxvd2x 36, 0, %2 \n\t" // y0, y1 "lxvd2x 36, 0, %2 \n\t" // y0, y1
@ -211,7 +211,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
"addi %2, %2, 32 \n\t" "addi %2, %2, 32 \n\t"
"addic. %1, %1, -4 \n\t" "addic. %1, %1, -4 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
"lxvd2x 36, 0, %2 \n\t" // y0, y1 "lxvd2x 36, 0, %2 \n\t" // y0, y1
@ -251,9 +251,9 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
"addi %2, %2, 32 \n\t" "addi %2, %2, 32 \n\t"
"addic. %1, %1, -4 \n\t" "addic. %1, %1, -4 \n\t"
"bgt 1b \n" "bgt one%= \n"
"2: \n\t" "two%=: \n\t"
"lxvd2x 36, 0, %2 \n\t" // y0, y1 "lxvd2x 36, 0, %2 \n\t" // y0, y1
"lxvd2x 37, %11, %2 \n\t" // y2, y3 "lxvd2x 37, %11, %2 \n\t" // y2, y3

View File

@ -93,11 +93,11 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
"li %[off],32 \n\t" "li %[off],32 \n\t"
"ble- 2f \n\t" "ble- two%= \n\t"
//-------------------------------------------------- //--------------------------------------------------
".p2align 5 \n\t" ".align 5 \n\t"
"1: \n\t" "one%=: \n\t"
"xvmaddadp 34,36,32 \n\t" "xvmaddadp 34,36,32 \n\t"
"xvmaddadp 35,38,32 \n\t" "xvmaddadp 35,38,32 \n\t"
"addi %[off2], %[off2],32 \n\t" "addi %[off2], %[off2],32 \n\t"
@ -137,7 +137,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
"lxvd2x 49, %[a6], %[off2] \n\t" "lxvd2x 49, %[a6], %[off2] \n\t"
"lxvd2x 51, %[a7], %[off2] \n\t" "lxvd2x 51, %[a7], %[off2] \n\t"
"lxvd2x 33, %[x], %[off2] \n\t" "lxvd2x 33, %[x], %[off2] \n\t"
"ble- 2f \n\t" "ble- two%= \n\t"
"xvmaddadp 34,36,32 \n\t" "xvmaddadp 34,36,32 \n\t"
"xvmaddadp 35,38,32 \n\t" "xvmaddadp 35,38,32 \n\t"
"addi %[off2], %[off2],32 \n\t" "addi %[off2], %[off2],32 \n\t"
@ -177,7 +177,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
"lxvd2x 49, %[a6], %[off2] \n\t" "lxvd2x 49, %[a6], %[off2] \n\t"
"lxvd2x 51, %[a7], %[off2] \n\t" "lxvd2x 51, %[a7], %[off2] \n\t"
"lxvd2x 33, %[x], %[off2] \n\t" "lxvd2x 33, %[x], %[off2] \n\t"
"ble- 2f \n\t" "ble- two%= \n\t"
"xvmaddadp 34,36,32 \n\t" "xvmaddadp 34,36,32 \n\t"
"xvmaddadp 35,38,32 \n\t" "xvmaddadp 35,38,32 \n\t"
#if defined(PREFETCH) #if defined(PREFETCH)
@ -229,7 +229,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
"lxvd2x 33, %[x], %[off2] \n\t" "lxvd2x 33, %[x], %[off2] \n\t"
"addic. %[n],%[n],-4 \n\t" "addic. %[n],%[n],-4 \n\t"
"ble- 2f \n\t" "ble- two%= \n\t"
"addi %[off2], %[off2],32 \n\t" "addi %[off2], %[off2],32 \n\t"
#if defined(PREFETCH) #if defined(PREFETCH)
@ -288,9 +288,9 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
#if defined(PREFETCH) #if defined(PREFETCH)
"dcbt %[temp],%[x] \n\t" "dcbt %[temp],%[x] \n\t"
#endif #endif
"bgt+ 1b \n\t" "bgt+ one%= \n\t"
".p2align 5 \n\t" ".align 5 \n\t"
"2: \n\t" "two%=: \n\t"
//-------------------------------------------- //--------------------------------------------
"xvmaddadp 34,36,32 \n\t" "xvmaddadp 34,36,32 \n\t"
@ -301,7 +301,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
"xvmaddadp 7,46,32 \n\t" "xvmaddadp 7,46,32 \n\t"
"xvmaddadp 8,48,32 \n\t" "xvmaddadp 8,48,32 \n\t"
"xvmaddadp 9,50,32 \n\t" "xvmaddadp 9,50,32 \n\t"
"xxspltd 36, %x[alpha], 0 \n\t" XXSPLTD_S(36,%x[alpha],0)
"xvmaddadp 34,37,33 \n\t" "xvmaddadp 34,37,33 \n\t"
"xvmaddadp 35,39,33 \n\t" "xvmaddadp 35,39,33 \n\t"
"xvmaddadp 4,41,33 \n\t" "xvmaddadp 4,41,33 \n\t"
@ -322,21 +322,21 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
"xxmrgld 42,34,35 \n\t" XXMRGLD_S(42,34,35)
"xxmrghd 43,34,35 \n\t" XXMRGHD_S(43,34,35)
"xxmrgld 44,4,5 \n\t" XXMRGLD_S(44,4,5)
"xxmrghd 45,4,5 \n\t" XXMRGHD_S(45,4,5)
"xvadddp 42,42,43 \n\t" "xvadddp 42,42,43 \n\t"
"xxmrgld 46,6,7 \n\t" XXMRGLD_S(46,6,7)
"xxmrghd 47,6,7 \n\t" XXMRGHD_S(47,6,7)
"xvadddp 44,44,45 \n\t" "xvadddp 44,44,45 \n\t"
"xxmrgld 48,8,9 \n\t" XXMRGLD_S(48,8,9)
"xxmrghd 49,8,9 \n\t" XXMRGHD_S(49,8,9)
"xvadddp 46,46,47 \n\t" "xvadddp 46,46,47 \n\t"

View File

@ -51,8 +51,8 @@ static void drot_kernel_16 (long n, double *x, double *y, double c, double s)
__asm__ __asm__
( (
"xxspltd 36, %x13, 0 \n\t" // load c to both dwords XXSPLTD_S(36,%x13,0) // load c to both dwords
"xxspltd 37, %x14, 0 \n\t" // load s to both dwords XXSPLTD_S(37,%x14,0) // load s to both dwords
"lxvd2x 32, 0, %3 \n\t" // load x "lxvd2x 32, 0, %3 \n\t" // load x
"lxvd2x 33, %15, %3 \n\t" "lxvd2x 33, %15, %3 \n\t"
@ -68,10 +68,10 @@ static void drot_kernel_16 (long n, double *x, double *y, double c, double s)
"addi %4, %4, 64 \n\t" "addi %4, %4, 64 \n\t"
"addic. %2, %2, -8 \n\t" "addic. %2, %2, -8 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"xvmuldp 40, 32, 36 \n\t" // c * x "xvmuldp 40, 32, 36 \n\t" // c * x
"xvmuldp 41, 33, 36 \n\t" "xvmuldp 41, 33, 36 \n\t"
@ -135,9 +135,9 @@ static void drot_kernel_16 (long n, double *x, double *y, double c, double s)
"addi %4, %4, 128 \n\t" "addi %4, %4, 128 \n\t"
"addic. %2, %2, -8 \n\t" "addic. %2, %2, -8 \n\t"
"bgt 1b \n" "bgt one%= \n"
"2: \n\t" "two%=: \n\t"
"xvmuldp 40, 32, 36 \n\t" // c * x "xvmuldp 40, 32, 36 \n\t" // c * x
"xvmuldp 41, 33, 36 \n\t" "xvmuldp 41, 33, 36 \n\t"

View File

@ -41,7 +41,7 @@ static void dscal_kernel_8 (long n, double *x, double alpha)
( (
"dcbt 0, %2 \n\t" "dcbt 0, %2 \n\t"
"xxspltd %x3, %x3, 0 \n\t" XXSPLTD_S(%x3,%x3,0)
"lxvd2x 32, 0, %2 \n\t" "lxvd2x 32, 0, %2 \n\t"
"lxvd2x 33, %4, %2 \n\t" "lxvd2x 33, %4, %2 \n\t"
@ -55,10 +55,10 @@ static void dscal_kernel_8 (long n, double *x, double alpha)
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %1, %1, -16 \n\t" "addic. %1, %1, -16 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"xvmuldp 40, 32, %x3 \n\t" "xvmuldp 40, 32, %x3 \n\t"
"xvmuldp 41, 33, %x3 \n\t" "xvmuldp 41, 33, %x3 \n\t"
@ -91,9 +91,9 @@ static void dscal_kernel_8 (long n, double *x, double alpha)
"addi %2, %2, 256 \n\t" "addi %2, %2, 256 \n\t"
"addic. %1, %1, -16 \n\t" "addic. %1, %1, -16 \n\t"
"bgt 1b \n" "bgt one%= \n"
"2: \n\t" "two%=: \n\t"
"xvmuldp 40, 32, %x3 \n\t" "xvmuldp 40, 32, %x3 \n\t"
"xvmuldp 41, 33, %x3 \n\t" "xvmuldp 41, 33, %x3 \n\t"
@ -146,8 +146,8 @@ static void dscal_kernel_8_zero (long n, double *x)
( (
"xxlxor %x3, %x3, %x3 \n\t" "xxlxor %x3, %x3, %x3 \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"stxvd2x %x3, 0, %2 \n\t" "stxvd2x %x3, 0, %2 \n\t"
"stxvd2x %x3, %4, %2 \n\t" "stxvd2x %x3, %4, %2 \n\t"
@ -161,7 +161,7 @@ static void dscal_kernel_8_zero (long n, double *x)
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %1, %1, -16 \n\t" "addic. %1, %1, -16 \n\t"
"bgt 1b \n" "bgt one%= \n"
"#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10" "#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
: :

View File

@ -39,8 +39,8 @@ static void dswap_kernel_32 (long n, double *x, double *y)
{ {
__asm__ __asm__
( (
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"lxvd2x 32, 0, %4 \n\t" "lxvd2x 32, 0, %4 \n\t"
"lxvd2x 33, %5, %4 \n\t" "lxvd2x 33, %5, %4 \n\t"
@ -131,7 +131,7 @@ static void dswap_kernel_32 (long n, double *x, double *y)
"addi %4, %4, 128 \n\t" "addi %4, %4, 128 \n\t"
"addic. %2, %2, -32 \n\t" "addic. %2, %2, -32 \n\t"
"bgt 1b \n" "bgt one%= \n"
"#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
: :

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -58,8 +58,8 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t" "lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t" "lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t" "lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8 "xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
"vaddudm 9,8,%[adder] \n\t" //{3,2} vs41 "vaddudm 9,8,%[adder] \n\t" //{3,2} vs41
@ -69,7 +69,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43 "vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
"xxlxor 39,39,39 \n\t" // vs39 vec_max_value "xxlxor 39,39,39 \n\t" // vs39 vec_max_value
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4 "vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
"xxspltd 36,36,0 \n\t" XXSPLTD_S(36,36,0)
"xvabsdp 44, 44 \n\t" "xvabsdp 44, 44 \n\t"
"xvabsdp 45, 45 \n\t" "xvabsdp 45, 45 \n\t"
@ -77,21 +77,21 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"xvabsdp 47, 47 \n\t" "xvabsdp 47, 47 \n\t"
"xvabsdp 48, 48 \n\t" "xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t" "xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t" "xvabsdp 6, 6 \n\t"
"xvabsdp 51, 51 \n\t" "xvabsdp 7, 7 \n\t"
//jump first half forward //jump first half forward
"b 2f \n\t" "b two%= \n\t"
//=================================================================== //===================================================================
".p2align 5 \n\t" ".align 5 \n\t"
"1: \n\t" "one%=: \n\t"
"xvcmpgtdp 2,45,44 \n\t " "xvcmpgtdp 2,45,44 \n\t "
"xvcmpgtdp 3,47,46 \n\t " "xvcmpgtdp 3,47,46 \n\t "
"xvcmpgtdp 4,49,48 \n\t " "xvcmpgtdp 4,49,48 \n\t "
"xvcmpgtdp 5,51,50 \n\t" "xvcmpgtdp 5,7,6 \n\t"
"xxsel 32,40,41,2 \n\t" "xxsel 32,40,41,2 \n\t"
"xxsel 0,44,45,2 \n\t" "xxsel 0,44,45,2 \n\t"
@ -100,7 +100,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"xxsel 34,40,41,4 \n\t" "xxsel 34,40,41,4 \n\t"
"xxsel 45,48,49,4 \n\t" "xxsel 45,48,49,4 \n\t"
"xxsel 35,42,43,5 \n\t" "xxsel 35,42,43,5 \n\t"
"xxsel 47,50,51,5 \n\t" "xxsel 47,6,7,5 \n\t"
"xvcmpgtdp 2, 1,0 \n\t" "xvcmpgtdp 2, 1,0 \n\t"
"xvcmpgtdp 3,47, 45 \n\t" "xvcmpgtdp 3,47, 45 \n\t"
@ -134,8 +134,8 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"vaddudm 1,1,5 \n\t" // get real index for first bigger "vaddudm 1,1,5 \n\t" // get real index for first bigger
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t" "lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t" "lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
//compare with previous to get vec_max_index(v6 | vs38 ) and vec_max_value (vs39) //compare with previous to get vec_max_index(v6 | vs38 ) and vec_max_value (vs39)
"xvcmpgtdp 2, 3,39 \n\t" "xvcmpgtdp 2, 3,39 \n\t"
@ -155,16 +155,16 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"xvabsdp 48, 48 \n\t" "xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t" "xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t" "xvabsdp 6, 6 \n\t"
"xvabsdp 51, 51 \n\t" "xvabsdp 7, 7 \n\t"
//<-----------jump here from first load //<-----------jump here from first load
"2: \n\t" "two%=: \n\t"
"xvcmpgtdp 2,45,44 \n\t " "xvcmpgtdp 2,45,44 \n\t "
"xvcmpgtdp 3,47,46 \n\t " "xvcmpgtdp 3,47,46 \n\t "
"xvcmpgtdp 4,49,48 \n\t " "xvcmpgtdp 4,49,48 \n\t "
"xvcmpgtdp 5,51,50 \n\t" "xvcmpgtdp 5,7,6 \n\t"
"xxsel 32,40,41,2 \n\t" "xxsel 32,40,41,2 \n\t"
"xxsel 0,44,45,2 \n\t" "xxsel 0,44,45,2 \n\t"
@ -173,7 +173,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"xxsel 34,40,41,4 \n\t" "xxsel 34,40,41,4 \n\t"
"xxsel 45,48,49,4 \n\t" "xxsel 45,48,49,4 \n\t"
"xxsel 35,42,43,5 \n\t" "xxsel 35,42,43,5 \n\t"
"xxsel 47,50,51,5 \n\t" "xxsel 47,6,7,5 \n\t"
"xvcmpgtdp 2, 1,0 \n\t" "xvcmpgtdp 2, 1,0 \n\t"
"xvcmpgtdp 3,47, 45 \n\t" "xvcmpgtdp 3,47, 45 \n\t"
@ -203,8 +203,8 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"vaddudm 1,1,5 \n\t" // get real index for first bigger "vaddudm 1,1,5 \n\t" // get real index for first bigger
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t" "lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t" "lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
@ -226,21 +226,21 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"xvabsdp 48, 48 \n\t" "xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t" "xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t" "xvabsdp 6, 6 \n\t"
"xvabsdp 51, 51 \n\t" "xvabsdp 7, 7 \n\t"
//decrement n //decrement n
"addic. %[n], %[n], -32 \n\t" "addic. %[n], %[n], -32 \n\t"
//Loop back if >0 //Loop back if >0
"bgt+ 1b \n\t" "bgt+ one%= \n\t"
//============================================================================== //==============================================================================
"xvcmpgtdp 2,45,44 \n\t " "xvcmpgtdp 2,45,44 \n\t "
"xvcmpgtdp 3,47,46 \n\t " "xvcmpgtdp 3,47,46 \n\t "
"xvcmpgtdp 4,49,48 \n\t " "xvcmpgtdp 4,49,48 \n\t "
"xvcmpgtdp 5,51,50 \n\t" "xvcmpgtdp 5,7,6 \n\t"
"xxsel 32,40,41,2 \n\t" "xxsel 32,40,41,2 \n\t"
"xxsel 0,44,45,2 \n\t" "xxsel 0,44,45,2 \n\t"
@ -249,7 +249,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"xxsel 34,40,41,4 \n\t" "xxsel 34,40,41,4 \n\t"
"xxsel 45,48,49,4 \n\t" "xxsel 45,48,49,4 \n\t"
"xxsel 35,42,43,5 \n\t" "xxsel 35,42,43,5 \n\t"
"xxsel 47,50,51,5 \n\t" "xxsel 47,6,7,5 \n\t"
"xvcmpgtdp 2, 1,0 \n\t" "xvcmpgtdp 2, 1,0 \n\t"
"xvcmpgtdp 3,47, 45 \n\t" "xvcmpgtdp 3,47, 45 \n\t"
@ -276,28 +276,28 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
///////extract max value and max index from vector ///////extract max value and max index from vector
"xxspltd 32,38,1 \n\t" XXSPLTD_S(32,38,1)
"xxspltd 40,39,1 \n\t" XXSPLTD_S(40,39,1)
"xvcmpeqdp. 2, 40,39 \n\t" "xvcmpeqdp. 2, 40,39 \n\t"
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
//0b001110=14 //0b001110=14
"bc 14,24, 3f \n\t" "bc 14,24, three%= \n\t"
"xvcmpgtdp 4, 40,39 \n\t" "xvcmpgtdp 4, 40,39 \n\t"
"xxsel 0,39,40,4 \n\t" "xxsel 0,39,40,4 \n\t"
"xxsel 1,38,32,4 \n\t" "xxsel 1,38,32,4 \n\t"
"stxsdx 0,0,%[ptr_maxf] \n\t" "stxsdx 0,0,%[ptr_maxf] \n\t"
"b 4f \n\t" "b four%= \n\t"
"3: \n\t" "three%=: \n\t"
//if elements value are equal then choose minimum index //if elements value are equal then choose minimum index
"xxspltd 0,40,0 \n\t" XXSPLTD_S(0,40,0)
"vminud 0,0,6 \n\t" //vs32 vs38 "vminud 0,0,6 \n\t" //vs32 vs38
"xxlor 1,32,32 \n\t" "xxlor 1,32,32 \n\t"
"stxsdx 0,0,%[ptr_maxf] \n\t" "stxsdx 0,0,%[ptr_maxf] \n\t"
"4: \n\t" "four%=: \n\t"
"mfvsrd %[index],1 \n\t" "mfvsrd %[index],1 \n\t"
: [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n) : [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
@ -306,7 +306,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112), [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
[start] "v"(start), [adder] "v"(temp_add_index) [start] "v"(start), [adder] "v"(temp_add_index)
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36", : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51" "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7"
); );
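Editor's note: the tail of this kernel (labels three%= and four%= above) reduces the two {value, index} lanes to one scalar result, preferring the smaller index when both lanes hold the same value. A scalar sketch of that logic (illustrative only; names and types are hypothetical):

```c
/* val[] and idx[] stand for the two doublewords of the final value and
 * index vectors; writes the maximum to *maxf and returns its index. */
static long extract_max_lane(const double val[2], const long idx[2], double *maxf)
{
    if (val[0] == val[1]) {            /* equal values: the smaller index wins */
        *maxf = val[0];
        return idx[0] < idx[1] ? idx[0] : idx[1];
    }
    if (val[1] > val[0]) {
        *maxf = val[1];
        return idx[1];
    }
    *maxf = val[0];
    return idx[0];
}
```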

View File

@ -58,8 +58,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t" "lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t" "lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t" "lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8 "xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
"vaddudm 9,8, %[adder] \n\t" //{3,2} vs41 "vaddudm 9,8, %[adder] \n\t" //{3,2} vs41
@ -69,7 +69,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43 "vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
"lxvdsx 39,0,%[ptr_minf] \n\t" // vs39 vec_min_value "lxvdsx 39,0,%[ptr_minf] \n\t" // vs39 vec_min_value
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4 "vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
"xxspltd 36,36,0 \n\t" XXSPLTD_S(36,36,0)
"xvabsdp 39, 39 \n\t" "xvabsdp 39, 39 \n\t"
"xvabsdp 44, 44 \n\t" "xvabsdp 44, 44 \n\t"
@ -78,21 +78,21 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"xvabsdp 47, 47 \n\t" "xvabsdp 47, 47 \n\t"
"xvabsdp 48, 48 \n\t" "xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t" "xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t" "xvabsdp 6, 6 \n\t"
"xvabsdp 51, 51 \n\t" "xvabsdp 7, 7 \n\t"
//jump first half forward //jump first half forward
"b 2f \n\t" "b two%= \n\t"
//=================================================================== //===================================================================
".p2align 5 \n\t" ".align 5 \n\t"
"1: \n\t" "one%=: \n\t"
"xvcmpgtdp 2,44,45 \n\t " "xvcmpgtdp 2,44,45 \n\t "
"xvcmpgtdp 3,46,47 \n\t " "xvcmpgtdp 3,46,47 \n\t "
"xvcmpgtdp 4,48,49 \n\t " "xvcmpgtdp 4,48,49 \n\t "
"xvcmpgtdp 5,50,51 \n\t" "xvcmpgtdp 5,6,7 \n\t"
"xxsel 32,40,41,2 \n\t" "xxsel 32,40,41,2 \n\t"
"xxsel 0,44,45,2 \n\t" "xxsel 0,44,45,2 \n\t"
@ -101,7 +101,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"xxsel 34,40,41,4 \n\t" "xxsel 34,40,41,4 \n\t"
"xxsel 45,48,49,4 \n\t" "xxsel 45,48,49,4 \n\t"
"xxsel 35,42,43,5 \n\t" "xxsel 35,42,43,5 \n\t"
"xxsel 47,50,51,5 \n\t" "xxsel 47,6,7,5 \n\t"
"xvcmpgtdp 2,0, 1 \n\t" "xvcmpgtdp 2,0, 1 \n\t"
"xvcmpgtdp 3, 45,47 \n\t" "xvcmpgtdp 3, 45,47 \n\t"
@ -135,8 +135,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"vaddudm 1,1,5 \n\t" // get real index for first smaller "vaddudm 1,1,5 \n\t" // get real index for first smaller
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t" "lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t" "lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
//compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39) //compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39)
"xvcmpgtdp 2,39, 3 \n\t" "xvcmpgtdp 2,39, 3 \n\t"
@ -156,16 +156,16 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"xvabsdp 48, 48 \n\t" "xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t" "xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t" "xvabsdp 6, 6 \n\t"
"xvabsdp 51, 51 \n\t" "xvabsdp 7, 7 \n\t"
//<-----------jump here from first load //<-----------jump here from first load
"2: \n\t" "two%=: \n\t"
"xvcmpgtdp 2,44,45 \n\t " "xvcmpgtdp 2,44,45 \n\t "
"xvcmpgtdp 3,46,47 \n\t " "xvcmpgtdp 3,46,47 \n\t "
"xvcmpgtdp 4,48,49 \n\t " "xvcmpgtdp 4,48,49 \n\t "
"xvcmpgtdp 5,50,51 \n\t" "xvcmpgtdp 5,6,7 \n\t"
"xxsel 32,40,41,2 \n\t" "xxsel 32,40,41,2 \n\t"
"xxsel 0,44,45,2 \n\t" "xxsel 0,44,45,2 \n\t"
@ -174,7 +174,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"xxsel 34,40,41,4 \n\t" "xxsel 34,40,41,4 \n\t"
"xxsel 45,48,49,4 \n\t" "xxsel 45,48,49,4 \n\t"
"xxsel 35,42,43,5 \n\t" "xxsel 35,42,43,5 \n\t"
"xxsel 47,50,51,5 \n\t" "xxsel 47,6,7,5 \n\t"
"xvcmpgtdp 2,0, 1 \n\t" "xvcmpgtdp 2,0, 1 \n\t"
"xvcmpgtdp 3, 45,47 \n\t" "xvcmpgtdp 3, 45,47 \n\t"
@ -204,8 +204,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"vaddudm 1,1,5 \n\t" // get real index for first smaller "vaddudm 1,1,5 \n\t" // get real index for first smaller
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t" "lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t" "lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
@ -227,21 +227,21 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"xvabsdp 48, 48 \n\t" "xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t" "xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t" "xvabsdp 6, 6 \n\t"
"xvabsdp 51, 51 \n\t" "xvabsdp 7, 7 \n\t"
//decrement n //decrement n
"addic. %[n], %[n], -32 \n\t" "addic. %[n], %[n], -32 \n\t"
//Loop back if >0 //Loop back if >0
"bgt+ 1b \n\t" "bgt+ one%= \n\t"
//============================================================================== //==============================================================================
"xvcmpgtdp 2,44,45 \n\t " "xvcmpgtdp 2,44,45 \n\t "
"xvcmpgtdp 3,46,47 \n\t " "xvcmpgtdp 3,46,47 \n\t "
"xvcmpgtdp 4,48,49 \n\t " "xvcmpgtdp 4,48,49 \n\t "
"xvcmpgtdp 5,50,51 \n\t" "xvcmpgtdp 5,6,7 \n\t"
"xxsel 32,40,41,2 \n\t" "xxsel 32,40,41,2 \n\t"
"xxsel 0,44,45,2 \n\t" "xxsel 0,44,45,2 \n\t"
@ -250,7 +250,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"xxsel 34,40,41,4 \n\t" "xxsel 34,40,41,4 \n\t"
"xxsel 45,48,49,4 \n\t" "xxsel 45,48,49,4 \n\t"
"xxsel 35,42,43,5 \n\t" "xxsel 35,42,43,5 \n\t"
"xxsel 47,50,51,5 \n\t" "xxsel 47,6,7,5 \n\t"
"xvcmpgtdp 2,0, 1 \n\t" "xvcmpgtdp 2,0, 1 \n\t"
"xvcmpgtdp 3, 45,47 \n\t" "xvcmpgtdp 3, 45,47 \n\t"
@ -277,28 +277,28 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
///////extract min value and min index from vector ///////extract min value and min index from vector
"xxspltd 32,38,1 \n\t" XXSPLTD_S(32,38,1)
"xxspltd 40,39,1 \n\t" XXSPLTD_S(40,39,1)
"xvcmpeqdp. 2, 40,39 \n\t" "xvcmpeqdp. 2, 40,39 \n\t"
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
//0b001110=14 //0b001110=14
"bc 14,24, 3f \n\t" "bc 14,24, three%= \n\t"
"xvcmpgtdp 4,39, 40 \n\t" "xvcmpgtdp 4,39, 40 \n\t"
"xxsel 0,39,40,4 \n\t" "xxsel 0,39,40,4 \n\t"
"xxsel 1,38,32,4 \n\t" "xxsel 1,38,32,4 \n\t"
"stxsdx 0,0,%[ptr_minf] \n\t" "stxsdx 0,0,%[ptr_minf] \n\t"
"b 4f \n\t" "b four%= \n\t"
"3: \n\t" "three%=: \n\t"
//if elements value are equal then choose minimum index //if elements value are equal then choose minimum index
"xxspltd 0,40,0 \n\t" XXSPLTD_S(0,40,0)
"vminud 0,0,6 \n\t" //vs32 vs38 "vminud 0,0,6 \n\t" //vs32 vs38
"xxlor 1,32,32 \n\t" "xxlor 1,32,32 \n\t"
"stxsdx 0,0,%[ptr_minf] \n\t" "stxsdx 0,0,%[ptr_minf] \n\t"
"4: \n\t" "four%=: \n\t"
"mfvsrd %[index],1 \n\t" "mfvsrd %[index],1 \n\t"
: [minf] "=m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n) : [minf] "=m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
@ -307,7 +307,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112), [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
[start] "v"(start), [adder] "v"(temp_add_index) [start] "v"(start), [adder] "v"(temp_add_index)
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36", : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51" "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7"
); );
return index; return index;

View File

@ -56,8 +56,8 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t" "lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t" "lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t" "lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8 "xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
"vaddudm 9,8,%[adder] \n\t" //{3,2} vs41 "vaddudm 9,8,%[adder] \n\t" //{3,2} vs41
@ -67,7 +67,7 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43 "vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
"xxlxor 39,39,39 \n\t" // vs39 vec_max_value is zero "xxlxor 39,39,39 \n\t" // vs39 vec_max_value is zero
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4 "vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
"xxspltd 36,36,0 \n\t" XXSPLTD_S(36,36,0)
@ -77,24 +77,24 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"xvabsdp 47, 47 \n\t" "xvabsdp 47, 47 \n\t"
"xvabsdp 48, 48 \n\t" "xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t" "xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t" "xvabsdp 6, 6 \n\t"
"xvabsdp 51, 51 \n\t" "xvabsdp 7, 7 \n\t"
//jump first half forward //jump first half forward
"b 2f \n\t" "b two%= \n\t"
".p2align 5 \n\t" ".align 5 \n\t"
"1: \n\t" "one%=: \n\t"
"xxmrghd 0,44,45 \n\t" XXMRGHD_S(0,44,45)
"xxmrgld 1,44,45 \n\t" XXMRGLD_S(1,44,45)
"xxmrghd 2,46,47 \n\t" XXMRGHD_S(2,46,47)
"xxmrgld 3,46,47 \n\t" XXMRGLD_S(3,46,47)
"xxmrghd 4,48,49 \n\t" XXMRGHD_S(4,48,49)
"xxmrgld 5,48,49 \n\t" XXMRGLD_S(5,48,49)
"xxmrghd 44,50,51 \n\t" XXMRGHD_S(44,6,7)
"xxmrgld 45,50,51 \n\t" XXMRGLD_S(45,6,7)
"xvadddp 46, 0,1 \n\t" "xvadddp 46, 0,1 \n\t"
"xvadddp 47, 2,3 \n\t" "xvadddp 47, 2,3 \n\t"
@ -103,15 +103,15 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"xvcmpgtdp 50,47,46 \n\t " "xvcmpgtdp 6,47,46 \n\t "
"xvcmpgtdp 51,49,48 \n\t " "xvcmpgtdp 7,49,48 \n\t "
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
"xxsel 32,40,41,50 \n\t" "xxsel 32,40,41,6 \n\t"
"xxsel 0,46,47,50 \n\t" "xxsel 0,46,47,6 \n\t"
"xxsel 33,42,43,51 \n\t" "xxsel 33,42,43,7 \n\t"
"xxsel 1,48,49,51 \n\t" "xxsel 1,48,49,7 \n\t"
"lxvd2x 44, 0,%[ptr_tmp] \n\t" "lxvd2x 44, 0,%[ptr_tmp] \n\t"
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t" "lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
@ -133,8 +133,8 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t" "lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t" "lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
//select with previous //select with previous
"xxsel 38,38,32,4 \n\t" "xxsel 38,38,32,4 \n\t"
"xxsel 39,39,3,4 \n\t" "xxsel 39,39,3,4 \n\t"
@ -148,35 +148,35 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"xvabsdp 47, 47 \n\t" "xvabsdp 47, 47 \n\t"
"xvabsdp 48, 48 \n\t" "xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t" "xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t" "xvabsdp 6, 6 \n\t"
"xvabsdp 51, 51 \n\t" "xvabsdp 7, 7 \n\t"
//>>/////////////////////////////// half start //>>/////////////////////////////// half start
"2: \n\t" "two%=: \n\t"
"xxmrghd 0,44,45 \n\t" XXMRGHD_S(0,44,45)
"xxmrgld 1,44,45 \n\t" XXMRGLD_S(1,44,45)
"xxmrghd 2,46,47 \n\t" XXMRGHD_S(2,46,47)
"xxmrgld 3,46,47 \n\t" XXMRGLD_S(3,46,47)
"xxmrghd 4,48,49 \n\t" XXMRGHD_S(4,48,49)
"xxmrgld 5,48,49 \n\t" XXMRGLD_S(5,48,49)
"xxmrghd 44,50,51 \n\t" XXMRGHD_S(44,6,7)
"xxmrgld 45,50,51 \n\t" XXMRGLD_S(45,6,7)
"xvadddp 46, 0,1 \n\t" "xvadddp 46, 0,1 \n\t"
"xvadddp 47, 2,3 \n\t" "xvadddp 47, 2,3 \n\t"
"xvadddp 48, 4,5 \n\t" "xvadddp 48, 4,5 \n\t"
"xvadddp 49, 44,45 \n\t" "xvadddp 49, 44,45 \n\t"
"xvcmpgtdp 50,47,46 \n\t " "xvcmpgtdp 6,47,46 \n\t "
"xvcmpgtdp 51,49,48 \n\t " "xvcmpgtdp 7,49,48 \n\t "
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
"xxsel 32,40,41,50 \n\t" "xxsel 32,40,41,6 \n\t"
"xxsel 0,46,47,50 \n\t" "xxsel 0,46,47,6 \n\t"
"xxsel 33,42,43,51 \n\t" "xxsel 33,42,43,7 \n\t"
"xxsel 1,48,49,51 \n\t" "xxsel 1,48,49,7 \n\t"
"lxvd2x 44, 0,%[ptr_tmp] \n\t" "lxvd2x 44, 0,%[ptr_tmp] \n\t"
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t" "lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
@ -198,8 +198,8 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t" "lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t" "lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
//select with previous //select with previous
"xxsel 38,38,32,4 \n\t" "xxsel 38,38,32,4 \n\t"
"xxsel 39,39,3,4 \n\t" "xxsel 39,39,3,4 \n\t"
@ -211,24 +211,24 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"xvabsdp 47, 47 \n\t" "xvabsdp 47, 47 \n\t"
"xvabsdp 48, 48 \n\t" "xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t" "xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t" "xvabsdp 6, 6 \n\t"
"xvabsdp 51, 51 \n\t" "xvabsdp 7, 7 \n\t"
//decrement n //decrement n
"addic. %[n], %[n], -16 \n\t" "addic. %[n], %[n], -16 \n\t"
//Loop back if >0 //Loop back if >0
"bgt+ 1b \n\t" "bgt+ one%= \n\t"
"xxmrghd 0,44,45 \n\t" XXMRGHD_S(0,44,45)
"xxmrgld 1,44,45 \n\t" XXMRGLD_S(1,44,45)
"xxmrghd 2,46,47 \n\t" XXMRGHD_S(2,46,47)
"xxmrgld 3,46,47 \n\t" XXMRGLD_S(3,46,47)
"xxmrghd 4,48,49 \n\t" XXMRGHD_S(4,48,49)
"xxmrgld 5,48,49 \n\t" XXMRGLD_S(5,48,49)
"xxmrghd 44,50,51 \n\t" XXMRGHD_S(44,6,7)
"xxmrgld 45,50,51 \n\t" XXMRGLD_S(45,6,7)
"xvadddp 46, 0,1 \n\t" "xvadddp 46, 0,1 \n\t"
"xvadddp 47, 2,3 \n\t" "xvadddp 47, 2,3 \n\t"
@ -237,13 +237,13 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"xvcmpgtdp 50,47,46 \n\t " "xvcmpgtdp 6,47,46 \n\t "
"xvcmpgtdp 51,49,48 \n\t " "xvcmpgtdp 7,49,48 \n\t "
"xxsel 32,40,41,50 \n\t" "xxsel 32,40,41,6 \n\t"
"xxsel 0,46,47,50 \n\t" "xxsel 0,46,47,6 \n\t"
"xxsel 33,42,43,51 \n\t" "xxsel 33,42,43,7 \n\t"
"xxsel 1,48,49,51 \n\t" "xxsel 1,48,49,7 \n\t"
"xvcmpgtdp 2,1,0 \n\t " "xvcmpgtdp 2,1,0 \n\t "
"xxsel 32,32,33,2 \n\t" "xxsel 32,32,33,2 \n\t"
@ -262,28 +262,28 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
///////extract max value and max index from vector ///////extract max value and max index from vector
"xxspltd 32,38,1 \n\t" XXSPLTD_S(32,38,1)
"xxspltd 40,39,1 \n\t" XXSPLTD_S(40,39,1)
"xvcmpeqdp. 2, 40,39 \n\t" "xvcmpeqdp. 2, 40,39 \n\t"
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
//0b001110=14 //0b001110=14
"bc 14,24, 3f \n\t" "bc 14,24, three%= \n\t"
"xvcmpgtdp 4, 40,39 \n\t" "xvcmpgtdp 4, 40,39 \n\t"
"xxsel 0,39,40,4 \n\t" "xxsel 0,39,40,4 \n\t"
"xxsel 1,38,32,4 \n\t" "xxsel 1,38,32,4 \n\t"
"stxsdx 0,0,%[ptr_maxf] \n\t" "stxsdx 0,0,%[ptr_maxf] \n\t"
"b 4f \n\t" "b four%= \n\t"
"3: \n\t" "three%=: \n\t"
//if elements value are equal then choose minimum index //if elements value are equal then choose minimum index
"xxspltd 0,40,0 \n\t" XXSPLTD_S(0,40,0)
"vminud 0,0,6 \n\t" //vs32 vs38 "vminud 0,0,6 \n\t" //vs32 vs38
"xxlor 1,32,32 \n\t" "xxlor 1,32,32 \n\t"
"stxsdx 0,0,%[ptr_maxf] \n\t" "stxsdx 0,0,%[ptr_maxf] \n\t"
"4: \n\t" "four%=: \n\t"
"mfvsrd %[index],1 \n\t" "mfvsrd %[index],1 \n\t"
: [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n) : [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
@ -292,7 +292,7 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112), [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
[start] "v"(start), [adder] "v"(temp_add_index) [start] "v"(start), [adder] "v"(temp_add_index)
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36", : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51" "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7"
); );
return index; return index;

View File

@ -54,8 +54,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t" "lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t" "lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t" "lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8 "xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
"vaddudm 9,8,%[adder] \n\t" //{3,2} vs41 "vaddudm 9,8,%[adder] \n\t" //{3,2} vs41
@ -65,7 +65,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43 "vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
"lxvdsx 39,0,%[ptr_minf] \n\t" // vs39 vec_min_value "lxvdsx 39,0,%[ptr_minf] \n\t" // vs39 vec_min_value
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4 "vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
"xxspltd 36,36,0 \n\t" XXSPLTD_S(36,36,0)
@ -75,24 +75,24 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
"xvabsdp 47, 47 \n\t" "xvabsdp 47, 47 \n\t"
"xvabsdp 48, 48 \n\t" "xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t" "xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t" "xvabsdp 6, 6 \n\t"
"xvabsdp 51, 51 \n\t" "xvabsdp 7, 7 \n\t"
//jump first half forward //jump first half forward
"b 2f \n\t" "b two%= \n\t"
".p2align 5 \n\t" ".align 5 \n\t"
"1: \n\t" "one%=: \n\t"
"xxmrghd 0,44,45 \n\t" XXMRGHD_S(0,44,45)
"xxmrgld 1,44,45 \n\t" XXMRGLD_S(1,44,45)
"xxmrghd 2,46,47 \n\t" XXMRGHD_S(2,46,47)
"xxmrgld 3,46,47 \n\t" XXMRGLD_S(3,46,47)
"xxmrghd 4,48,49 \n\t" XXMRGHD_S(4,48,49)
"xxmrgld 5,48,49 \n\t" XXMRGLD_S(5,48,49)
"xxmrghd 44,50,51 \n\t" XXMRGHD_S(44,6,7)
"xxmrgld 45,50,51 \n\t" XXMRGLD_S(45,6,7)
"xvadddp 46, 0,1 \n\t" "xvadddp 46, 0,1 \n\t"
"xvadddp 47, 2,3 \n\t" "xvadddp 47, 2,3 \n\t"
@ -101,15 +101,15 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
"xvcmpgtdp 50,46,47 \n\t " "xvcmpgtdp 6,46,47 \n\t "
"xvcmpgtdp 51,48,49 \n\t " "xvcmpgtdp 7,48,49 \n\t "
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
"xxsel 32,40,41,50 \n\t" "xxsel 32,40,41,6 \n\t"
"xxsel 0,46,47,50 \n\t" "xxsel 0,46,47,6 \n\t"
"xxsel 33,42,43,51 \n\t" "xxsel 33,42,43,7 \n\t"
"xxsel 1,48,49,51 \n\t" "xxsel 1,48,49,7 \n\t"
"lxvd2x 44, 0,%[ptr_tmp] \n\t" "lxvd2x 44, 0,%[ptr_tmp] \n\t"
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t" "lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
@ -131,8 +131,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t" "lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t" "lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
//select with previous //select with previous
"xxsel 38,38,32,4 \n\t" "xxsel 38,38,32,4 \n\t"
"xxsel 39,39,3,4 \n\t" "xxsel 39,39,3,4 \n\t"
@ -146,35 +146,35 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
"xvabsdp 47, 47 \n\t" "xvabsdp 47, 47 \n\t"
"xvabsdp 48, 48 \n\t" "xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t" "xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t" "xvabsdp 6, 6 \n\t"
"xvabsdp 51, 51 \n\t" "xvabsdp 7, 7 \n\t"
//>>/////////////////////////////// half start //>>/////////////////////////////// half start
"2: \n\t" "two%=: \n\t"
"xxmrghd 0,44,45 \n\t" XXMRGHD_S(0,44,45)
"xxmrgld 1,44,45 \n\t" XXMRGLD_S(1,44,45)
"xxmrghd 2,46,47 \n\t" XXMRGHD_S(2,46,47)
"xxmrgld 3,46,47 \n\t" XXMRGLD_S(3,46,47)
"xxmrghd 4,48,49 \n\t" XXMRGHD_S(4,48,49)
"xxmrgld 5,48,49 \n\t" XXMRGLD_S(5,48,49)
"xxmrghd 44,50,51 \n\t" XXMRGHD_S(44,6,7)
"xxmrgld 45,50,51 \n\t" XXMRGLD_S(45,6,7)
"xvadddp 46, 0,1 \n\t" "xvadddp 46, 0,1 \n\t"
"xvadddp 47, 2,3 \n\t" "xvadddp 47, 2,3 \n\t"
"xvadddp 48, 4,5 \n\t" "xvadddp 48, 4,5 \n\t"
"xvadddp 49, 44,45 \n\t" "xvadddp 49, 44,45 \n\t"
"xvcmpgtdp 50,46,47 \n\t " "xvcmpgtdp 6,46,47 \n\t "
"xvcmpgtdp 51,48,49 \n\t " "xvcmpgtdp 7,48,49 \n\t "
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
"xxsel 32,40,41,50 \n\t" "xxsel 32,40,41,6 \n\t"
"xxsel 0,46,47,50 \n\t" "xxsel 0,46,47,6 \n\t"
"xxsel 33,42,43,51 \n\t" "xxsel 33,42,43,7 \n\t"
"xxsel 1,48,49,51 \n\t" "xxsel 1,48,49,7 \n\t"
"lxvd2x 44, 0,%[ptr_tmp] \n\t" "lxvd2x 44, 0,%[ptr_tmp] \n\t"
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t" "lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
@ -196,8 +196,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t" "lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t" "lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
//select with previous //select with previous
"xxsel 38,38,32,4 \n\t" "xxsel 38,38,32,4 \n\t"
"xxsel 39,39,3,4 \n\t" "xxsel 39,39,3,4 \n\t"
@ -209,24 +209,24 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
"xvabsdp 47, 47 \n\t" "xvabsdp 47, 47 \n\t"
"xvabsdp 48, 48 \n\t" "xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t" "xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t" "xvabsdp 6, 6 \n\t"
"xvabsdp 51, 51 \n\t" "xvabsdp 7, 7 \n\t"
//decrement n //decrement n
"addic. %[n], %[n], -16 \n\t" "addic. %[n], %[n], -16 \n\t"
//Loop back if >0 //Loop back if >0
"bgt+ 1b \n\t" "bgt+ one%= \n\t"
"xxmrghd 0,44,45 \n\t" XXMRGHD_S(0,44,45)
"xxmrgld 1,44,45 \n\t" XXMRGLD_S(1,44,45)
"xxmrghd 2,46,47 \n\t" XXMRGHD_S(2,46,47)
"xxmrgld 3,46,47 \n\t" XXMRGLD_S(3,46,47)
"xxmrghd 4,48,49 \n\t" XXMRGHD_S(4,48,49)
"xxmrgld 5,48,49 \n\t" XXMRGLD_S(5,48,49)
"xxmrghd 44,50,51 \n\t" XXMRGHD_S(44,6,7)
"xxmrgld 45,50,51 \n\t" XXMRGLD_S(45,6,7)
"xvadddp 46, 0,1 \n\t" "xvadddp 46, 0,1 \n\t"
"xvadddp 47, 2,3 \n\t" "xvadddp 47, 2,3 \n\t"
@ -235,13 +235,13 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
"xvcmpgtdp 50,46,47 \n\t " "xvcmpgtdp 6,46,47 \n\t "
"xvcmpgtdp 51,48,49 \n\t " "xvcmpgtdp 7,48,49 \n\t "
"xxsel 32,40,41,50 \n\t" "xxsel 32,40,41,6 \n\t"
"xxsel 0,46,47,50 \n\t" "xxsel 0,46,47,6 \n\t"
"xxsel 33,42,43,51 \n\t" "xxsel 33,42,43,7 \n\t"
"xxsel 1,48,49,51 \n\t" "xxsel 1,48,49,7 \n\t"
"xvcmpgtdp 2,0,1 \n\t " "xvcmpgtdp 2,0,1 \n\t "
"xxsel 32,32,33,2 \n\t" "xxsel 32,32,33,2 \n\t"
@ -260,28 +260,28 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
///////extract min value and min index from vector ///////extract min value and min index from vector
"xxspltd 32,38,1 \n\t" XXSPLTD_S(32,38,1)
"xxspltd 40,39,1 \n\t" XXSPLTD_S(40,39,1)
"xvcmpeqdp. 2, 40,39 \n\t" "xvcmpeqdp. 2, 40,39 \n\t"
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
//0b001110=14 //0b001110=14
"bc 14,24, 3f \n\t" "bc 14,24, three%= \n\t"
"xvcmpgtdp 4,39, 40 \n\t" "xvcmpgtdp 4,39, 40 \n\t"
"xxsel 0,39,40,4 \n\t" "xxsel 0,39,40,4 \n\t"
"xxsel 1,38,32,4 \n\t" "xxsel 1,38,32,4 \n\t"
"stxsdx 0,0,%[ptr_minf] \n\t" "stxsdx 0,0,%[ptr_minf] \n\t"
"b 4f \n\t" "b four%= \n\t"
"3: \n\t" "three%=: \n\t"
//if elements value are equal then choose minimum index //if elements value are equal then choose minimum index
"xxspltd 0,40,0 \n\t" XXSPLTD_S(0,40,0)
"vminud 0,0,6 \n\t" //vs32 vs38 "vminud 0,0,6 \n\t" //vs32 vs38
"xxlor 1,32,32 \n\t" "xxlor 1,32,32 \n\t"
"stxsdx 0,0,%[ptr_minf] \n\t" "stxsdx 0,0,%[ptr_minf] \n\t"
"4: \n\t" "four%=: \n\t"
"mfvsrd %[index],1 \n\t" "mfvsrd %[index],1 \n\t"
: [minf] "=m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n) : [minf] "=m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
@ -290,7 +290,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112), [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
[start] "v"(start), [adder] "v"(temp_add_index) [start] "v"(start), [adder] "v"(temp_add_index)
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36", : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51" "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7"
); );
return index; return index;

View File

@ -46,10 +46,10 @@ static void __inline blas_lock(volatile BLASULONG *address){
" .machine \"any\" ;" " .machine \"any\" ;"
"0: lwarx %0,0, %1 ;" "0: lwarx %0,0, %1 ;"
" cmpwi 0,%0,0;" " cmpwi 0,%0,0;"
" bne 1f;" " bne one%=;"
" stwcx. %2,0, %1 ;" " stwcx. %2,0, %1 ;"
" bne- 0b;" " bne- 0b;"
"1: " "one%=: "
: "=&r"(ret) : "=&r"(ret)
: "r"(address), "r" (val) : "r"(address), "r" (val)
: "cr0", "memory"); : "cr0", "memory");

View File

@ -68,10 +68,10 @@ static float sasum_kernel_32 (long n, float *x)
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %1, %1, -32 \n\t" "addic. %1, %1, -32 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"xvabssp 48, 40 \n\t" "xvabssp 48, 40 \n\t"
"xvabssp 49, 41 \n\t" "xvabssp 49, 41 \n\t"
@ -108,9 +108,9 @@ static float sasum_kernel_32 (long n, float *x)
"xvaddsp 38, 38, %x5 \n\t" "xvaddsp 38, 38, %x5 \n\t"
"xvaddsp 39, 39, %x6 \n\t" "xvaddsp 39, 39, %x6 \n\t"
"bgt 1b \n" "bgt one%= \n"
"2: \n\t" "two%=: \n\t"
"xvabssp 48, 40 \n\t" "xvabssp 48, 40 \n\t"
"xvabssp 49, 41 \n\t" "xvabssp 49, 41 \n\t"

View File
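One directive change recurs in the sasum kernel above and in most of the kernels that follow: ".p2align 5" becomes ".align 5". ".p2align" is a GNU as spelling that the AIX assembler does not provide, whereas ".align" is accepted by both and its operand is read as a power of two here, so the loop head keeps its 32-byte (2^5) alignment either way. A short sketch that combines this with the relabelled countdown loop; the helper is hypothetical (it only zeroes n doubles, n assumed positive and even) and is not code from the commit:

/* Illustrative loop skeleton shared by these kernels after the rewrite:
 * ".align 5" on the loop head and "%="-suffixed labels instead of the
 * numeric "1:" / "2:". */
static inline void zero_doubles(long n, double *x)
{
    __asm__ __volatile__(
        "xxlxor  32, 32, 32     \n\t"   /* vs32 = {0.0, 0.0}          */
        "cmpdi   %0, 0          \n\t"
        "ble     two%=          \n\t"
        ".align 5               \n"
        "one%=:                 \n\t"
        "stxvd2x 32, 0, %1      \n\t"   /* store two zeroed doubles   */
        "addi    %1, %1, 16     \n\t"
        "addic.  %0, %0, -2     \n\t"
        "bgt     one%=          \n\t"
        "two%=:                 \n\t"
        : "+r"(n), "+b"(x)
        :
        : "cr0", "vs32", "memory");
}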

@ -51,10 +51,10 @@ static void scopy_kernel_32 (long n, float *x, float *y)
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %1, %1, -32 \n\t" "addic. %1, %1, -32 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"stxvd2x 40, 0, %3 \n\t" "stxvd2x 40, 0, %3 \n\t"
"stxvd2x 41, %5, %3 \n\t" "stxvd2x 41, %5, %3 \n\t"
@ -77,9 +77,9 @@ static void scopy_kernel_32 (long n, float *x, float *y)
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %1, %1, -32 \n\t" "addic. %1, %1, -32 \n\t"
"bgt 1b \n" "bgt one%= \n"
"2: \n\t" "two%=: \n\t"
"stxvd2x 40, 0, %3 \n\t" "stxvd2x 40, 0, %3 \n\t"
"stxvd2x 41, %5, %3 \n\t" "stxvd2x 41, %5, %3 \n\t"

View File

@ -78,10 +78,10 @@ static float sdot_kernel_16 (long n, float *x, float *y)
"addi %3, %3, 128 \n\t" "addi %3, %3, 128 \n\t"
"addic. %1, %1, -32 \n\t" "addic. %1, %1, -32 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"xvmaddasp 32, 40, 48 \n\t" "xvmaddasp 32, 40, 48 \n\t"
"lxvd2x 40, 0, %2 \n\t" "lxvd2x 40, 0, %2 \n\t"
@ -112,9 +112,9 @@ static float sdot_kernel_16 (long n, float *x, float *y)
"addi %3, %3, 128 \n\t" "addi %3, %3, 128 \n\t"
"addic. %1, %1, -32 \n\t" "addic. %1, %1, -32 \n\t"
"bgt 1b \n" "bgt one%= \n"
"2: \n\t" "two%=: \n\t"
"xvmaddasp 32, 40, 48 \n\t" "xvmaddasp 32, 40, 48 \n\t"
"xvmaddasp 33, 41, 49 \n\t" "xvmaddasp 33, 41, 49 \n\t"

File diff suppressed because it is too large

View File

@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* Macros for N=4 and M=16 * Macros for N=4 and M=16
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x16', `
#else
.macro COPY_4x16 .macro COPY_4x16
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0 lxvw4x vs33, o16, A0
@ -88,13 +92,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs46, o32, T1 stxvw4x vs46, o32, T1
stxvw4x vs47, o48, T1 stxvw4x vs47, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=8 * Macros for N=4 and M=8
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x8', `
#else
.macro COPY_4x8 .macro COPY_4x8
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0 lxvw4x vs33, o16, A0
@ -124,13 +136,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs38, o32, T1 stxvw4x vs38, o32, T1
stxvw4x vs39, o48, T1 stxvw4x vs39, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=4 * Macros for N=4 and M=4
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x4', `
#else
.macro COPY_4x4 .macro COPY_4x4
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
@ -150,13 +170,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs35, o48, T1 stxvw4x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=2 * Macros for N=4 and M=2
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x2', `
#else
.macro COPY_4x2 .macro COPY_4x2
#endif
lxsspx vs32, o0, A0 lxsspx vs32, o0, A0
lxsspx vs33, o4, A0 lxsspx vs33, o4, A0
@ -190,13 +218,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs38, o0, T1 stxsspx vs38, o0, T1
stxsspx vs39, o4, T1 stxsspx vs39, o4, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=1 * Macros for N=4 and M=1
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x1', `
#else
.macro COPY_4x1 .macro COPY_4x1
#endif
lxsspx vs32, o0, A0 lxsspx vs32, o0, A0
@ -218,13 +254,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs35, o4, T1 stxsspx vs35, o4, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=16 * Macros for N=2 and M=16
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x16', `
#else
.macro COPY_2x16 .macro COPY_2x16
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0 lxvw4x vs33, o16, A0
@ -250,13 +294,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs38, o32, T1 stxvw4x vs38, o32, T1
stxvw4x vs39, o48, T1 stxvw4x vs39, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=8 * Macros for N=2 and M=8
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x8', `
#else
.macro COPY_2x8 .macro COPY_2x8
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0 lxvw4x vs33, o16, A0
@ -272,13 +324,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs34, o32, T1 stxvw4x vs34, o32, T1
stxvw4x vs35, o48, T1 stxvw4x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=4 * Macros for N=2 and M=4
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x4', `
#else
.macro COPY_2x4 .macro COPY_2x4
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
@ -290,13 +350,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs33, o16, T1 stxvw4x vs33, o16, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=2 * Macros for N=2 and M=2
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x2', `
#else
.macro COPY_2x2 .macro COPY_2x2
#endif
lxsspx vs32, o0, A0 lxsspx vs32, o0, A0
lxsspx vs33, o4, A0 lxsspx vs33, o4, A0
@ -314,13 +382,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs34, o0, T1 stxsspx vs34, o0, T1
stxsspx vs35, o4, T1 stxsspx vs35, o4, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=1 * Macros for N=2 and M=1
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x1', `
#else
.macro COPY_2x1 .macro COPY_2x1
#endif
lxsspx vs32, o0, A0 lxsspx vs32, o0, A0
@ -332,13 +408,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs33, o4, T1 stxsspx vs33, o4, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=16 * Macros for N=1 and M=16
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x16', `
#else
.macro COPY_1x16 .macro COPY_1x16
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0 lxvw4x vs33, o16, A0
@ -352,13 +436,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs34, o32, T1 stxvw4x vs34, o32, T1
stxvw4x vs35, o48, T1 stxvw4x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=8 * Macros for N=1 and M=8
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x8', `
#else
.macro COPY_1x8 .macro COPY_1x8
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0 lxvw4x vs33, o16, A0
@ -368,13 +460,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs32, o0, T1 stxvw4x vs32, o0, T1
stxvw4x vs33, o16, T1 stxvw4x vs33, o16, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=4 * Macros for N=1 and M=4
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x4', `
#else
.macro COPY_1x4 .macro COPY_1x4
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
@ -382,13 +482,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs32, o0, T1 stxvw4x vs32, o0, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=2 * Macros for N=1 and M=2
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x2', `
#else
.macro COPY_1x2 .macro COPY_1x2
#endif
lxsspx vs32, o0, A0 lxsspx vs32, o0, A0
lxsspx vs33, o4, A0 lxsspx vs33, o4, A0
@ -398,13 +506,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs32, o0, T1 stxsspx vs32, o0, T1
stxsspx vs33, o4, T1 stxsspx vs33, o4, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=1 * Macros for N=1 and M=1
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x1', `
#else
.macro COPY_1x1 .macro COPY_1x1
#endif
lxsspx vs32, o0, A0 lxsspx vs32, o0, A0
@ -412,5 +528,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs32, o0, T1 stxsspx vs32, o0, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
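The conditional wrapping added around every COPY_* macro in this file (and in the analogous copy kernels further down) is there because the AIX assembler has no ".macro"/".endm" facility. When _AIX is defined, each macro is written as an m4 define(...) instead, on the assumption that the source is run through the C preprocessor and then m4 before it reaches the assembler, so the macro bodies are expanded up front while the invocation sites stay exactly as they were. A miniature of the pattern, with a hypothetical two-load macro standing in for the real COPY_* bodies:

#if defined(_AIX)
define(`LOAD_PAIR', `
#else
.macro LOAD_PAIR
#endif

	lxvw4x		vs32,	o0,	A0
	lxvw4x		vs33,	o16,	A0

#if defined(_AIX)
')
#else
.endm
#endif

	LOAD_PAIR

At the call site the invocation stays a bare LOAD_PAIR in both modes; m4 expands it on AIX builds, and GNU as expands the .macro everywhere else.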

View File

@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* Macros for N=4 and M=8 * Macros for N=4 and M=8
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x8', `
#else
.macro COPY_4x8 .macro COPY_4x8
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0 lxvw4x vs33, o16, A0
@ -68,13 +72,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs38, o32, T1 stxvw4x vs38, o32, T1
stxvw4x vs39, o48, T1 stxvw4x vs39, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=4 * Macros for N=4 and M=4
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x4', `
#else
.macro COPY_4x4 .macro COPY_4x4
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
@ -94,13 +106,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs35, o48, T1 stxvw4x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=2 * Macros for N=4 and M=2
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x2', `
#else
.macro COPY_4x2 .macro COPY_4x2
#endif
lxsspx vs32, o0, A0 lxsspx vs32, o0, A0
lxsspx vs33, o4, A0 lxsspx vs33, o4, A0
@ -134,13 +154,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs38, o0, T1 stxsspx vs38, o0, T1
stxsspx vs39, o4, T1 stxsspx vs39, o4, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=1 * Macros for N=4 and M=1
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x1', `
#else
.macro COPY_4x1 .macro COPY_4x1
#endif
lxsspx vs32, o0, A0 lxsspx vs32, o0, A0
@ -162,13 +190,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs35, o4, T1 stxsspx vs35, o4, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=8 * Macros for N=2 and M=8
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x8', `
#else
.macro COPY_2x8 .macro COPY_2x8
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0 lxvw4x vs33, o16, A0
@ -184,13 +220,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs34, o32, T1 stxvw4x vs34, o32, T1
stxvw4x vs35, o48, T1 stxvw4x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=4 * Macros for N=2 and M=4
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x4', `
#else
.macro COPY_2x4 .macro COPY_2x4
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
@ -202,13 +246,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs33, o16, T1 stxvw4x vs33, o16, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=2 * Macros for N=2 and M=2
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x2', `
#else
.macro COPY_2x2 .macro COPY_2x2
#endif
lxsspx vs32, o0, A0 lxsspx vs32, o0, A0
lxsspx vs33, o4, A0 lxsspx vs33, o4, A0
@ -226,13 +278,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs34, o0, T1 stxsspx vs34, o0, T1
stxsspx vs35, o4, T1 stxsspx vs35, o4, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=1 * Macros for N=2 and M=1
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x1', `
#else
.macro COPY_2x1 .macro COPY_2x1
#endif
lxsspx vs32, o0, A0 lxsspx vs32, o0, A0
@ -244,13 +304,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs33, o4, T1 stxsspx vs33, o4, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=8 * Macros for N=1 and M=8
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x8', `
#else
.macro COPY_1x8 .macro COPY_1x8
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0 lxvw4x vs33, o16, A0
@ -260,13 +328,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs32, o0, T1 stxvw4x vs32, o0, T1
stxvw4x vs33, o16, T1 stxvw4x vs33, o16, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=4 * Macros for N=1 and M=4
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x4', `
#else
.macro COPY_1x4 .macro COPY_1x4
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
@ -274,13 +350,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs32, o0, T1 stxvw4x vs32, o0, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=2 * Macros for N=1 and M=2
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x2', `
#else
.macro COPY_1x2 .macro COPY_1x2
#endif
lxsspx vs32, o0, A0 lxsspx vs32, o0, A0
lxsspx vs33, o4, A0 lxsspx vs33, o4, A0
@ -290,13 +374,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs32, o0, T1 stxsspx vs32, o0, T1
stxsspx vs33, o4, T1 stxsspx vs33, o4, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=1 * Macros for N=1 and M=1
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x1', `
#else
.macro COPY_1x1 .macro COPY_1x1
#endif
lxsspx vs32, o0, A0 lxsspx vs32, o0, A0
@ -304,5 +396,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs32, o0, T1 stxsspx vs32, o0, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif

View File

@ -71,10 +71,10 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
"addi %4, %4, 64 \n\t" "addi %4, %4, 64 \n\t"
"addic. %2, %2, -16 \n\t" "addic. %2, %2, -16 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"xvmulsp 40, 32, 36 \n\t" // c * x "xvmulsp 40, 32, 36 \n\t" // c * x
"xvmulsp 41, 33, 36 \n\t" "xvmulsp 41, 33, 36 \n\t"
@ -138,9 +138,9 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
"addi %4, %4, 128 \n\t" "addi %4, %4, 128 \n\t"
"addic. %2, %2, -16 \n\t" "addic. %2, %2, -16 \n\t"
"bgt 1b \n" "bgt one%= \n"
"2: \n\t" "two%=: \n\t"
"xvmulsp 40, 32, 36 \n\t" // c * x "xvmulsp 40, 32, 36 \n\t" // c * x
"xvmulsp 41, 33, 36 \n\t" "xvmulsp 41, 33, 36 \n\t"

View File

@ -56,10 +56,10 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %1, %1, -32 \n\t" "addic. %1, %1, -32 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"xvmulsp 40, 32, %x3 \n\t" "xvmulsp 40, 32, %x3 \n\t"
"xvmulsp 41, 33, %x3 \n\t" "xvmulsp 41, 33, %x3 \n\t"
@ -92,9 +92,9 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
"addi %2, %2, 256 \n\t" "addi %2, %2, 256 \n\t"
"addic. %1, %1, -32 \n\t" "addic. %1, %1, -32 \n\t"
"bgt 1b \n" "bgt one%= \n"
"2: \n\t" "two%=: \n\t"
"xvmulsp 40, 32, %x3 \n\t" "xvmulsp 40, 32, %x3 \n\t"
"xvmulsp 41, 33, %x3 \n\t" "xvmulsp 41, 33, %x3 \n\t"
@ -147,8 +147,8 @@ static void sscal_kernel_16_zero (long n, float *x)
( (
"xxlxor %x3, %x3, %x3 \n\t" "xxlxor %x3, %x3, %x3 \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"stxvd2x %x3, 0, %2 \n\t" "stxvd2x %x3, 0, %2 \n\t"
"stxvd2x %x3, %4, %2 \n\t" "stxvd2x %x3, %4, %2 \n\t"
@ -162,7 +162,7 @@ static void sscal_kernel_16_zero (long n, float *x)
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %1, %1, -32 \n\t" "addic. %1, %1, -32 \n\t"
"bgt 1b \n" "bgt one%= \n"
"#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10" "#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
: :

View File

@ -39,8 +39,8 @@ static void sswap_kernel_32 (long n, float *x, float *y)
{ {
__asm__ __asm__
( (
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"lxvd2x 32, 0, %4 \n\t" "lxvd2x 32, 0, %4 \n\t"
"lxvd2x 33, %5, %4 \n\t" "lxvd2x 33, %5, %4 \n\t"
@ -83,7 +83,7 @@ static void sswap_kernel_32 (long n, float *x, float *y)
"addi %4, %4, 128 \n\t" "addi %4, %4, 128 \n\t"
"addic. %2, %2, -32 \n\t" "addic. %2, %2, -32 \n\t"
"bgt 1b \n" "bgt one%= \n"
"#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
: :

File diff suppressed because it is too large

View File

@ -68,10 +68,10 @@ static double zasum_kernel_8 (long n, double *x)
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %1, %1, -8 \n\t" "addic. %1, %1, -8 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"xvabsdp 48, 40 \n\t" "xvabsdp 48, 40 \n\t"
"xvabsdp 49, 41 \n\t" "xvabsdp 49, 41 \n\t"
@ -108,9 +108,9 @@ static double zasum_kernel_8 (long n, double *x)
"xvadddp 38, 38, %x5 \n\t" "xvadddp 38, 38, %x5 \n\t"
"xvadddp 39, 39, %x6 \n\t" "xvadddp 39, 39, %x6 \n\t"
"bgt 1b \n" "bgt one%= \n"
"2: \n\t" "two%=: \n\t"
"xvabsdp 48, 40 \n\t" "xvabsdp 48, 40 \n\t"
"xvabsdp 49, 41 \n\t" "xvabsdp 49, 41 \n\t"
@ -140,7 +140,7 @@ static double zasum_kernel_8 (long n, double *x)
"xvadddp 32, 32, 36 \n\t" "xvadddp 32, 32, 36 \n\t"
"xxswapd 33, 32 \n\t" XXSWAPD_S(33,32)
"xsadddp %x0, 32, 33 \n" "xsadddp %x0, 32, 33 \n"
"#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n" "#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"

View File

@ -61,8 +61,8 @@ static void zaxpy_kernel_4 (long n, double *x, double *y,
__asm__ __asm__
( (
"xxspltd 32, %x19, 0 \n\t" // alpha_r XXSPLTD_S(32,%x19,0) // alpha_r
"xxspltd 33, %x20, 0 \n\t" // alpha_i XXSPLTD_S(33,%x20,0) // alpha_i
"lxvd2x 36, 0, %21 \n\t" // mvec "lxvd2x 36, 0, %21 \n\t" // mvec
@ -87,10 +87,10 @@ static void zaxpy_kernel_4 (long n, double *x, double *y,
"lxvd2x 50, %23, %3 \n\t" // y2 "lxvd2x 50, %23, %3 \n\t" // y2
"lxvd2x 51, %24, %3 \n\t" // y3 "lxvd2x 51, %24, %3 \n\t" // y3
"xxswapd %x8, 40 \n\t" // exchange real and imag part XXSWAPD_S(%x8,40) // exchange real and imag part
"xxswapd %x9, 41 \n\t" // exchange real and imag part XXSWAPD_S(%x9,41) // exchange real and imag part
"xxswapd %x10, 42 \n\t" // exchange real and imag part XXSWAPD_S(%x10,42) // exchange real and imag part
"xxswapd %x11, 43 \n\t" // exchange real and imag part XXSWAPD_S(%x11,43) // exchange real and imag part
"addi %2, %2, 64 \n\t" "addi %2, %2, 64 \n\t"
"addi %3, %3, 64 \n\t" "addi %3, %3, 64 \n\t"
@ -105,19 +105,19 @@ static void zaxpy_kernel_4 (long n, double *x, double *y,
"lxvd2x %x6, %23, %3 \n\t" // y6 "lxvd2x %x6, %23, %3 \n\t" // y6
"lxvd2x %x7, %24, %3 \n\t" // y7 "lxvd2x %x7, %24, %3 \n\t" // y7
"xxswapd %x12, 44 \n\t" // exchange real and imag part XXSWAPD_S(%x12,44) // exchange real and imag part
"xxswapd %x13, 45 \n\t" // exchange real and imag part XXSWAPD_S(%x13,45) // exchange real and imag part
"xxswapd %x14, 46 \n\t" // exchange real and imag part XXSWAPD_S(%x14,46) // exchange real and imag part
"xxswapd %x15, 47 \n\t" // exchange real and imag part XXSWAPD_S(%x15,47) // exchange real and imag part
"addi %2, %2, 64 \n\t" "addi %2, %2, 64 \n\t"
"addi %3, %3, 64 \n\t" "addi %3, %3, 64 \n\t"
"addic. %1, %1, -8 \n\t" "addic. %1, %1, -8 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
"xvmaddadp 49, 41, 32 \n\t" "xvmaddadp 49, 41, 32 \n\t"
@ -163,31 +163,31 @@ static void zaxpy_kernel_4 (long n, double *x, double *y,
"addi %16, %16, 64 \n\t" "addi %16, %16, 64 \n\t"
"xxswapd %x8, 40 \n\t" // exchange real and imag part XXSWAPD_S(%x8,40) // exchange real and imag part
"xxswapd %x9, 41 \n\t" // exchange real and imag part XXSWAPD_S(%x9,41) // exchange real and imag part
"lxvd2x 48, 0, %3 \n\t" // y0 "lxvd2x 48, 0, %3 \n\t" // y0
"lxvd2x 49, %22, %3 \n\t" // y1 "lxvd2x 49, %22, %3 \n\t" // y1
"xxswapd %x10, 42 \n\t" // exchange real and imag part XXSWAPD_S(%x10,42) // exchange real and imag part
"xxswapd %x11, 43 \n\t" // exchange real and imag part XXSWAPD_S(%x11,43) // exchange real and imag part
"lxvd2x 50, %23, %3 \n\t" // y2 "lxvd2x 50, %23, %3 \n\t" // y2
"lxvd2x 51, %24, %3 \n\t" // y3 "lxvd2x 51, %24, %3 \n\t" // y3
"xxswapd %x12, 44 \n\t" // exchange real and imag part XXSWAPD_S(%x12,44) // exchange real and imag part
"addi %3, %3, 64 \n\t" "addi %3, %3, 64 \n\t"
"xxswapd %x13, 45 \n\t" // exchange real and imag part XXSWAPD_S(%x13,45) // exchange real and imag part
"lxvd2x %x4, 0, %3 \n\t" // y4 "lxvd2x %x4, 0, %3 \n\t" // y4
"lxvd2x %x5, %22, %3 \n\t" // y5 "lxvd2x %x5, %22, %3 \n\t" // y5
"xxswapd %x14, 46 \n\t" // exchange real and imag part XXSWAPD_S(%x14,46) // exchange real and imag part
"xxswapd %x15, 47 \n\t" // exchange real and imag part XXSWAPD_S(%x15,47) // exchange real and imag part
"lxvd2x %x6, %23, %3 \n\t" // y6 "lxvd2x %x6, %23, %3 \n\t" // y6
"lxvd2x %x7, %24, %3 \n\t" // y7 "lxvd2x %x7, %24, %3 \n\t" // y7
"addi %3, %3, 64 \n\t" "addi %3, %3, 64 \n\t"
"addic. %1, %1, -8 \n\t" "addic. %1, %1, -8 \n\t"
"bgt 1b \n" "bgt one%= \n"
"2: \n\t" "two%=: \n\t"
"xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
"xvmaddadp 49, 41, 32 \n\t" "xvmaddadp 49, 41, 32 \n\t"

View File

@ -62,10 +62,10 @@ static void zcopy_kernel_16 (long n, FLOAT *x, FLOAT *y)
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %1, %1, -16 \n\t" "addic. %1, %1, -16 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"stxvd2x 32, 0, %3 \n\t" "stxvd2x 32, 0, %3 \n\t"
"stxvd2x 33, %5, %3 \n\t" "stxvd2x 33, %5, %3 \n\t"
@ -108,9 +108,9 @@ static void zcopy_kernel_16 (long n, FLOAT *x, FLOAT *y)
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %1, %1, -16 \n\t" "addic. %1, %1, -16 \n\t"
"bgt 1b \n" "bgt one%= \n"
"2: \n\t" "two%=: \n\t"
"stxvd2x 32, 0, %3 \n\t" "stxvd2x 32, 0, %3 \n\t"
"stxvd2x 33, %5, %3 \n\t" "stxvd2x 33, %5, %3 \n\t"

View File

@ -60,10 +60,10 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot)
"lxvd2x 43, %9, %2 \n\t" // x3_r, x3_i "lxvd2x 43, %9, %2 \n\t" // x3_r, x3_i
"lxvd2x 51, %9, %3 \n\t" // y3_r, y3_i "lxvd2x 51, %9, %3 \n\t" // y3_r, y3_i
"xxswapd 0, 48 \n\t" // y0_i, y0_r XXSWAPD_S(0,48) // y0_i, y0_r
"xxswapd 1, 49 \n\t" // y1_i, y1_r XXSWAPD_S(1,49) // y1_i, y1_r
"xxswapd 2, 50 \n\t" // y2_i, y2_r XXSWAPD_S(2,50) // y2_i, y2_r
"xxswapd 3, 51 \n\t" // y3_i, y3_r XXSWAPD_S(3,51) // y3_i, y3_r
"addi %2, %2, 64 \n\t" "addi %2, %2, 64 \n\t"
"addi %3, %3, 64 \n\t" "addi %3, %3, 64 \n\t"
@ -77,19 +77,19 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot)
"lxvd2x 47, %9, %2 \n\t" // x3_r, x3_i "lxvd2x 47, %9, %2 \n\t" // x3_r, x3_i
"lxvd2x 7, %9, %3 \n\t" // y3_r, y3_i "lxvd2x 7, %9, %3 \n\t" // y3_r, y3_i
"xxswapd 8, 4 \n\t" // y0_i, y0_r XXSWAPD_S(8,4) // y0_i, y0_r
"xxswapd 9, 5 \n\t" // y1_i, y1_r XXSWAPD_S(9,5) // y1_i, y1_r
"xxswapd 10, 6 \n\t" // y2_i, y2_r XXSWAPD_S(10,6) // y2_i, y2_r
"xxswapd 11, 7 \n\t" // y3_i, y3_r XXSWAPD_S(11,7) // y3_i, y3_r
"addi %2, %2, 64 \n\t" "addi %2, %2, 64 \n\t"
"addi %3, %3, 64 \n\t" "addi %3, %3, 64 \n\t"
"addic. %1, %1, -8 \n\t" "addic. %1, %1, -8 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i "xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
"lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i "lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i
@ -111,14 +111,14 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot)
"xvmaddadp 39, 43, 3 \n\t" // x3_r * y3_i , x3_i * y3_r "xvmaddadp 39, 43, 3 \n\t" // x3_r * y3_i , x3_i * y3_r
"lxvd2x 43, %9, %2 \n\t" // x3_r, x3_i "lxvd2x 43, %9, %2 \n\t" // x3_r, x3_i
"xxswapd 0,48 \n\t" // y0_i, y0_r XXSWAPD_S(0,48) // y0_i, y0_r
"xxswapd 1,49 \n\t" // y1_i, y1_r XXSWAPD_S(1,49) // y1_i, y1_r
"addi %2, %2, 64 \n\t" "addi %2, %2, 64 \n\t"
"addi %3, %3, 64 \n\t" "addi %3, %3, 64 \n\t"
"xxswapd 2,50 \n\t" // y2_i, y2_r XXSWAPD_S(2,50) // y2_i, y2_r
"xxswapd 3,51 \n\t" // y3_i, y3_r XXSWAPD_S(3,51) // y3_i, y3_r
"xvmaddadp 32, 44, 4 \n\t" // x0_r * y0_r , x0_i * y0_i "xvmaddadp 32, 44, 4 \n\t" // x0_r * y0_r , x0_i * y0_i
"lxvd2x 4, 0, %3 \n\t" // y0_r, y0_i "lxvd2x 4, 0, %3 \n\t" // y0_r, y0_i
@ -138,19 +138,19 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot)
"xvmaddadp 39, 47, 11 \n\t" // x3_r * y3_i , x3_i * y3_r "xvmaddadp 39, 47, 11 \n\t" // x3_r * y3_i , x3_i * y3_r
"lxvd2x 47, %9, %2 \n\t" // x3_r, x3_i "lxvd2x 47, %9, %2 \n\t" // x3_r, x3_i
"xxswapd 8,4 \n\t" // y0_i, y0_r XXSWAPD_S(8,4) // y0_i, y0_r
"xxswapd 9,5 \n\t" // y1_i, y1_r XXSWAPD_S(9,5) // y1_i, y1_r
"addi %2, %2, 64 \n\t" "addi %2, %2, 64 \n\t"
"addi %3, %3, 64 \n\t" "addi %3, %3, 64 \n\t"
"xxswapd 10,6 \n\t" // y2_i, y2_r XXSWAPD_S(10,6) // y2_i, y2_r
"xxswapd 11,7 \n\t" // y3_i, y3_r XXSWAPD_S(11,7) // y3_i, y3_r
"addic. %1, %1, -8 \n\t" "addic. %1, %1, -8 \n\t"
"bgt 1b \n" "bgt one%= \n"
"2: \n\t" "two%=: \n\t"
"xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i "xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
"xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i "xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i

File diff suppressed because it is too large

View File

@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* Macros for N=4 and M=8 * Macros for N=4 and M=8
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x8', `
#else
.macro COPY_4x8 .macro COPY_4x8
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -144,14 +148,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs12, o32, T1 stxvd2x vs12, o32, T1
stxvd2x vs13, o48, T1 stxvd2x vs13, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=4 * Macros for N=4 and M=4
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x4', `
#else
.macro COPY_4x4 .macro COPY_4x4
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -209,14 +221,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs46, o32, T1 stxvd2x vs46, o32, T1
stxvd2x vs47, o48, T1 stxvd2x vs47, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=2 * Macros for N=4 and M=2
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x2', `
#else
.macro COPY_4x2 .macro COPY_4x2
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -254,14 +274,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs38, o32, T1 stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1 stxvd2x vs39, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=1 * Macros for N=4 and M=1
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x1', `
#else
.macro COPY_4x1 .macro COPY_4x1
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
addi A0, A0, 16 addi A0, A0, 16
@ -289,14 +317,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs35, o48, T1 stxvd2x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=8 * Macros for N=2 and M=8
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x8', `
#else
.macro COPY_2x8 .macro COPY_2x8
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -350,14 +386,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs46, o32, T1 stxvd2x vs46, o32, T1
stxvd2x vs47, o48, T1 stxvd2x vs47, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=4 * Macros for N=2 and M=4
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x4', `
#else
.macro COPY_2x4 .macro COPY_2x4
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -387,14 +431,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs38, o32, T1 stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1 stxvd2x vs39, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=2 * Macros for N=2 and M=2
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x2', `
#else
.macro COPY_2x2 .macro COPY_2x2
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -414,14 +466,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs34, o32, T1 stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1 stxvd2x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=1 * Macros for N=2 and M=1
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x1', `
#else
.macro COPY_2x1 .macro COPY_2x1
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
addi A0, A0, 16 addi A0, A0, 16
@ -437,14 +497,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs33, o16, T1 stxvd2x vs33, o16, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=8 * Macros for N=1 and M=8
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x8', `
#else
.macro COPY_1x8 .macro COPY_1x8
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -472,14 +540,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs38, o32, T1 stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1 stxvd2x vs39, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=4 * Macros for N=1 and M=4
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x4', `
#else
.macro COPY_1x4 .macro COPY_1x4
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -495,14 +571,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs34, o32, T1 stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1 stxvd2x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=2 * Macros for N=1 and M=2
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x2', `
#else
.macro COPY_1x2 .macro COPY_1x2
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -514,14 +598,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs32, o0, T1 stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1 stxvd2x vs33, o16, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=1 * Macros for N=1 and M=1
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x1', `
#else
.macro COPY_1x1 .macro COPY_1x1
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
addi A0, A0, 16 addi A0, A0, 16
@ -531,5 +623,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs32, o0, T1 stxvd2x vs32, o0, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif

View File

@ -40,8 +40,8 @@ static void zrot_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT si
__asm__ __asm__
( (
"xxspltd 36, %x[cos], 0 \n\t" // load c to both dwords XXSPLTD_S(36,%x[cos],0) // load c to both dwords
"xxspltd 37, %x[sin], 0 \n\t" // load s to both dwords XXSPLTD_S(37,%x[sin],0) // load s to both dwords
"lxvd2x 32, 0, %[x_ptr] \n\t" // load x "lxvd2x 32, 0, %[x_ptr] \n\t" // load x
"lxvd2x 33, %[i16], %[x_ptr] \n\t" "lxvd2x 33, %[i16], %[x_ptr] \n\t"
@ -57,10 +57,10 @@ static void zrot_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT si
"addi %[y_ptr], %[y_ptr], 64 \n\t" "addi %[y_ptr], %[y_ptr], 64 \n\t"
"addic. %[temp_n], %[temp_n], -4 \n\t" "addic. %[temp_n], %[temp_n], -4 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"xvmuldp 40, 32, 36 \n\t" // c * x "xvmuldp 40, 32, 36 \n\t" // c * x
"xvmuldp 41, 33, 36 \n\t" "xvmuldp 41, 33, 36 \n\t"
@ -124,9 +124,9 @@ static void zrot_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT si
"addi %[y_ptr], %[y_ptr], 128 \n\t" "addi %[y_ptr], %[y_ptr], 128 \n\t"
"addic. %[temp_n], %[temp_n], -4 \n\t" "addic. %[temp_n], %[temp_n], -4 \n\t"
"bgt+ 1b \n" "bgt+ one%= \n"
"2: \n\t" "two%=: \n\t"
"xvmuldp 40, 32, 36 \n\t" // c * x "xvmuldp 40, 32, 36 \n\t" // c * x
"xvmuldp 41, 33, 36 \n\t" "xvmuldp 41, 33, 36 \n\t"

View File

@ -58,8 +58,8 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
"dcbt 0, %2 \n\t" "dcbt 0, %2 \n\t"
"xsnegdp 33, %x16 \n\t" // -alpha_i "xsnegdp 33, %x16 \n\t" // -alpha_i
"xxspltd 32, %x15, 0 \n\t" // alpha_r , alpha_r XXSPLTD_S(32,%x15,0) // alpha_r , alpha_r
"xxmrghd 33, 33, %x16 \n\t" // -alpha_i , alpha_i XXMRGHD_S(33,33,%x16) // -alpha_i , alpha_i
"lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
"lxvd2x 41, %17, %2 \n\t" "lxvd2x 41, %17, %2 \n\t"
@ -73,10 +73,10 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %1, %1, -8 \n\t" "addic. %1, %1, -8 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
"xvmuldp 49, 41, 32 \n\t" "xvmuldp 49, 41, 32 \n\t"
@ -87,14 +87,14 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
"xvmuldp %x5, 46, 32 \n\t" "xvmuldp %x5, 46, 32 \n\t"
"xvmuldp %x6, 47, 32 \n\t" "xvmuldp %x6, 47, 32 \n\t"
"xxswapd %x7, 40 \n\t" XXSWAPD_S(%x7,40)
"xxswapd %x8, 41 \n\t" XXSWAPD_S(%x8,41)
"xxswapd %x9, 42 \n\t" XXSWAPD_S(%x9,42)
"xxswapd %x10, 43 \n\t" XXSWAPD_S(%x10,43)
"xxswapd %x11, 44 \n\t" XXSWAPD_S(%x11,44)
"xxswapd %x12, 45 \n\t" XXSWAPD_S(%x12,45)
"xxswapd %x13, 46 \n\t" XXSWAPD_S(%x13,46)
"xxswapd %x14, 47 \n\t" XXSWAPD_S(%x14,47)
"xvmuldp %x7, %x7, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i "xvmuldp %x7, %x7, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
"xvmuldp %x8, %x8, 33 \n\t" "xvmuldp %x8, %x8, 33 \n\t"
@ -147,9 +147,9 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
"addi %2, %2, 256 \n\t" "addi %2, %2, 256 \n\t"
"addic. %1, %1, -8 \n\t" "addic. %1, %1, -8 \n\t"
"bgt 1b \n" "bgt one%= \n"
"2: \n\t" "two%=: \n\t"
"xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
"xvmuldp 49, 41, 32 \n\t" "xvmuldp 49, 41, 32 \n\t"
@ -160,14 +160,14 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
"xvmuldp %x5, 46, 32 \n\t" "xvmuldp %x5, 46, 32 \n\t"
"xvmuldp %x6, 47, 32 \n\t" "xvmuldp %x6, 47, 32 \n\t"
"xxswapd %x7, 40 \n\t" XXSWAPD_S(%x7,40)
"xxswapd %x8, 41 \n\t" XXSWAPD_S(%x8,41)
"xxswapd %x9, 42 \n\t" XXSWAPD_S(%x9,42)
"xxswapd %x10, 43 \n\t" XXSWAPD_S(%x10,43)
"xxswapd %x11, 44 \n\t" XXSWAPD_S(%x11,44)
"xxswapd %x12, 45 \n\t" XXSWAPD_S(%x12,45)
"xxswapd %x13, 46 \n\t" XXSWAPD_S(%x13,46)
"xxswapd %x14, 47 \n\t" XXSWAPD_S(%x14,47)
"addi %2, %2, -128 \n\t" "addi %2, %2, -128 \n\t"

View File
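The register comments in the zscal hunks above trace an ordinary complex multiply: alpha_r is splat across one register, {-alpha_i, alpha_i} is packed into another, and each element x = (x_r, x_i) is multiplied once as loaded and once with its doublewords swapped. Written out per element (the add that folds the two partial products together is assumed to sit in a part of the kernel the hunks do not show):

    (x_r + i*x_i) * (alpha_r + i*alpha_i)
        = (x_r*alpha_r - x_i*alpha_i) + i*(x_i*alpha_r + x_r*alpha_i)

    {x_r, x_i} * { alpha_r, alpha_r }  = {  x_r*alpha_r,  x_i*alpha_r }
  + {x_i, x_r} * {-alpha_i, alpha_i }  = { -x_i*alpha_i,  x_r*alpha_i }
  ---------------------------------------------------------------------
                                         { x_r*alpha_r - x_i*alpha_i,
                                           x_i*alpha_r + x_r*alpha_i }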

@ -40,8 +40,8 @@ zswap_kernel_16 (long n, double *x, double *y)
{ {
__asm__ __asm__
( (
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"lxvd2x 32, 0, %4 \n\t" "lxvd2x 32, 0, %4 \n\t"
"lxvd2x 33, %5, %4 \n\t" "lxvd2x 33, %5, %4 \n\t"
"lxvd2x 34, %6, %4 \n\t" "lxvd2x 34, %6, %4 \n\t"
@ -130,7 +130,7 @@ zswap_kernel_16 (long n, double *x, double *y)
"addi %4, %4, 128 \n\t" "addi %4, %4, 128 \n\t"
"addic. %2, %2, -16 \n\t" "addic. %2, %2, -16 \n\t"
"bgt 1b \n" "bgt one%= \n"
"#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
: :

File diff suppressed because it is too large