AIX changes for Power8
This commit is contained in:
parent
2a43062de7
commit
3dc6b26eff
|
@ -39,6 +39,35 @@
|
||||||
#ifndef COMMON_POWER
|
#ifndef COMMON_POWER
|
||||||
#define COMMON_POWER
|
#define COMMON_POWER
|
||||||
|
|
||||||
|
#define str(x) #x
|
||||||
|
|
||||||
|
/* AIX branch: each VSX extended mnemonic is written out as its base
 * instruction (xxpermdi with an explicit 2-bit selector, xvcpsgndp with the
 * source repeated). The *_S variants in this branch assemble the same
 * instruction text as a string literal, using str() to stringify operands.
 * NOTE(review): presumably the AIX assembler does not accept the extended
 * mnemonics - confirm. */
#ifdef OS_AIX
|
||||||
|
#define XXSPLTD(T,A,z) xxpermdi T, A, A, 0b##z##z
|
||||||
|
#define XXMRGHD(T,A,B) xxpermdi T, A, B, 0b00
|
||||||
|
#define XXMRGLD(T,A,B) xxpermdi T, A, B, 0b11
|
||||||
|
#define XXSWAPD(T,A) xxpermdi T, A, A, 0b10
|
||||||
|
#define XVMOVDP(T,A) xvcpsgndp T, A, A
|
||||||
|
|
||||||
|
#define XXSPLTD_S(T,A,z) "xxpermdi " str(T) ", " str(A) ", " str(A) ", 0b" str(z ## z) " \n\t"
|
||||||
|
#define XXMRGHD_S(T,A,B) "xxpermdi " str(T) ", " str(A) ", " str(B) ", 0b00 \n\t"
|
||||||
|
#define XXMRGLD_S(T,A,B) "xxpermdi " str(T) ", " str(A) ", " str(B) ", 0b11 \n\t"
|
||||||
|
#define XXSWAPD_S(T,A) "xxpermdi " str(T) ", " str(A) ", " str(A) ", 0b10 \n\t"
|
||||||
|
|
||||||
|
#else
|
||||||
|
#define XXSPLTD(T,A,z) xxspltd T, A, z
|
||||||
|
#define XXMRGHD(T,A,B) xxmrghd T, A, B
|
||||||
|
#define XXMRGLD(T,A,B) xxmrgld T, A, B
|
||||||
|
#define XXSWAPD(T,A) xxswapd T, A
|
||||||
|
#define XVMOVDP(T,A) xvmovdp T, A
|
||||||
|
|
||||||
|
/* String form of XXSPLTD. Operands are stringified with str() so that the
 * caller's T, A and z actually appear in the emitted instruction text;
 * writing them inside the literal would emit the letters "T, A, z". */
#define XXSPLTD_S(T,A,z) "xxspltd " str(T) ", " str(A) ", " str(z) " \n\t"
|
||||||
|
/* String form of XXMRGHD. Operands are stringified with str() so that the
 * caller's T, A and B actually appear in the emitted instruction text;
 * writing them inside the literal would emit the letters "T, A, B". */
#define XXMRGHD_S(T,A,B) "xxmrghd " str(T) ", " str(A) ", " str(B) " \n\t"
|
||||||
|
/* String form of XXMRGLD. Operands are stringified with str() so that the
 * caller's T, A and B actually appear in the emitted instruction text;
 * writing them inside the literal would emit the letters "T, A, B". */
#define XXMRGLD_S(T,A,B) "xxmrgld " str(T) ", " str(A) ", " str(B) " \n\t"
|
||||||
|
/* String form of XXSWAPD. Operands are stringified with str() so that the
 * caller's T and A actually appear in the emitted instruction text, and the
 * text ends with " \n\t" so a following instruction string starts on its
 * own line when the literals are concatenated. */
#define XXSWAPD_S(T,A) "xxswapd " str(T) ", " str(A) " \n\t"
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9)
|
#if defined(POWER8) || defined(POWER9)
|
||||||
#define MB __asm__ __volatile__ ("eieio":::"memory")
|
#define MB __asm__ __volatile__ ("eieio":::"memory")
|
||||||
#define WMB __asm__ __volatile__ ("eieio":::"memory")
|
#define WMB __asm__ __volatile__ ("eieio":::"memory")
|
||||||
|
|
|
@ -57,8 +57,6 @@ USE_TRMM = 1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
SKERNELOBJS += \
|
SKERNELOBJS += \
|
||||||
sgemm_kernel$(TSUFFIX).$(SUFFIX) \
|
sgemm_kernel$(TSUFFIX).$(SUFFIX) \
|
||||||
$(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \
|
$(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \
|
||||||
|
@ -436,7 +434,10 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY)
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
|
||||||
$(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY)
|
$(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY)
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmotcopy.s
|
||||||
|
m4 sgemmotcopy.s > sgemmotcopy_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@
|
||||||
|
rm sgemmotcopy.s sgemmotcopy_nomacros.s
|
||||||
|
|
||||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||||
|
|
||||||
|
@ -444,12 +445,17 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY)
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
|
||||||
$(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY)
|
$(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY)
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmitcopy.s
|
||||||
|
m4 sgemmitcopy.s > sgemmitcopy_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@
|
||||||
|
rm sgemmitcopy.s sgemmitcopy_nomacros.s
|
||||||
endif
|
endif
|
||||||
|
|
||||||
$(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY)
|
$(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY)
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_ncopy.s
|
||||||
|
m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@
|
||||||
|
rm dgemm_ncopy.s dgemm_ncopy_nomacros.s
|
||||||
|
|
||||||
$(KDIR)$(DGEMMOTCOPYOBJ) : $(KERNELDIR)/$(DGEMMOTCOPY)
|
$(KDIR)$(DGEMMOTCOPYOBJ) : $(KERNELDIR)/$(DGEMMOTCOPY)
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
@ -460,7 +466,10 @@ $(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY)
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
|
||||||
$(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY)
|
$(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY)
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_itcopy.s
|
||||||
|
m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@
|
||||||
|
rm dgemm_itcopy.s dgemm_itcopy_nomacros.s
|
||||||
|
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
@ -485,10 +494,16 @@ endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
$(KDIR)$(CGEMMONCOPYOBJ) : $(KERNELDIR)/$(CGEMMONCOPY)
|
$(KDIR)$(CGEMMONCOPYOBJ) : $(KERNELDIR)/$(CGEMMONCOPY)
|
||||||
|
# $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o cgemm_oncopy.s
|
||||||
|
# m4 cgemm_oncopy.s > cgemm_oncopy_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
# rm cgemm_oncopy.s cgemm_oncopy_nomacros.s
|
||||||
|
|
||||||
$(KDIR)$(CGEMMOTCOPYOBJ) : $(KERNELDIR)/$(CGEMMOTCOPY)
|
$(KDIR)$(CGEMMOTCOPYOBJ) : $(KERNELDIR)/$(CGEMMOTCOPY)
|
||||||
|
# $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o cgemm_otcopy.s
|
||||||
|
# m4 cgemm_otcopy.s > cgemm_otcopy_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
# rm cgemm_otcopy.s cgemm_otcopy_nomacros.s
|
||||||
|
|
||||||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
||||||
|
|
||||||
|
@ -496,7 +511,10 @@ $(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY)
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
|
||||||
$(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY)
|
$(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY)
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -E $< -o cgemm_itcopy.s
|
||||||
|
m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@
|
||||||
|
rm cgemm_itcopy.s cgemm_itcopy_nomacros.s
|
||||||
|
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
@ -512,7 +530,10 @@ $(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY)
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
|
||||||
$(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY)
|
$(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY)
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o zgemm_itcopy.s
|
||||||
|
m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@
|
||||||
|
rm zgemm_itcopy.s zgemm_itcopy_nomacros.s
|
||||||
|
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
@ -537,37 +558,67 @@ endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
$(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND)
|
$(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND)
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemm_kernel$(TSUFFIX).s
|
||||||
|
m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@
|
||||||
|
rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s
|
||||||
|
|
||||||
$(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND)
|
$(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND)
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_kernel$(TSUFFIX).s
|
||||||
|
m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@
|
||||||
|
rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s
|
||||||
|
|
||||||
$(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEPEND)
|
$(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEPEND)
|
||||||
$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
|
||||||
$(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
$(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@
|
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNN $< -o cgemm_kernel_n.s
|
||||||
|
m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@
|
||||||
|
rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s
|
||||||
|
|
||||||
$(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
$(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@
|
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCN $< -o cgemm_kernel_l.s
|
||||||
|
m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@
|
||||||
|
rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s
|
||||||
|
|
||||||
$(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
$(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@
|
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s
|
||||||
|
m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
|
||||||
|
rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s
|
||||||
|
|
||||||
$(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
$(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@
|
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCC $< -o cgemm_kernel_b.s
|
||||||
|
m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@
|
||||||
|
rm cgemm_kernel_b.s cgemm_kernel_b_nomacros.s
|
||||||
|
|
||||||
$(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
$(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
|
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNN $< -o zgemm_kernel_n.s
|
||||||
|
m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@
|
||||||
|
rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s
|
||||||
|
|
||||||
$(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
$(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@
|
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCN $< -o zgemm_kernel_l.s
|
||||||
|
m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@
|
||||||
|
rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s
|
||||||
|
|
||||||
$(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
$(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@
|
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNC $< -o zgemm_kernel_r.s
|
||||||
|
m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@
|
||||||
|
rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s
|
||||||
|
|
||||||
$(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
$(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@
|
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCC $< -o zgemm_kernel_b.s
|
||||||
|
m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@
|
||||||
|
rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s
|
||||||
|
|
||||||
$(KDIR)xgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
|
$(KDIR)xgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
|
||||||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@
|
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@
|
||||||
|
@ -584,28 +635,56 @@ $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD
|
||||||
|
|
||||||
ifdef USE_TRMM
|
ifdef USE_TRMM
|
||||||
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o strmmkernel_ln.s
|
||||||
|
m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@
|
||||||
|
rm strmmkernel_ln.s strmmkernel_ln_nomacros.s
|
||||||
|
|
||||||
$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o strmmkernel_lt.s
|
||||||
|
m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@
|
||||||
|
rm strmmkernel_lt.s strmmkernel_lt_nomacros.s
|
||||||
|
|
||||||
$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o strmmkernel_rn.s
|
||||||
|
m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@
|
||||||
|
rm strmmkernel_rn.s strmmkernel_rn_nomacros.s
|
||||||
|
|
||||||
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s
|
||||||
|
m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
|
||||||
|
rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
|
||||||
|
|
||||||
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o dtrmm_kernel_ln.s
|
||||||
|
# $(CC) $(CFLAGS) -E $< -o dtrmm_kernel_ln.s
|
||||||
|
m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@
|
||||||
|
rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s
|
||||||
|
|
||||||
$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o dtrmm_kernel_lt.s
|
||||||
|
# $(CC) $(CFLAGS) -E $< -o dtrmm_kernel_lt.s
|
||||||
|
m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@
|
||||||
|
rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s
|
||||||
|
|
||||||
$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o dtrmm_kernel_rn.s
|
||||||
|
# $(CC) $(CFLAGS) -E $< -o dtrmm_kernel_rn.s
|
||||||
|
m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@
|
||||||
|
rm dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s
|
||||||
|
|
||||||
$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o dtrmm_kernel_rt.s
|
||||||
|
# $(CC) $(CFLAGS) -E $< -o dtrmm_kernel_rt.s
|
||||||
|
m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@
|
||||||
|
rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s
|
||||||
|
|
||||||
$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
|
$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
||||||
|
@ -620,52 +699,100 @@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
|
||||||
|
|
||||||
$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_ln.s
|
||||||
|
m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@
|
||||||
|
rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s
|
||||||
|
|
||||||
$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_lt.s
|
||||||
|
m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@
|
||||||
|
rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s
|
||||||
|
|
||||||
$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lr.s
|
||||||
|
m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ctrmm_kernel_lr_nomacros.s -o $@
|
||||||
|
rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s
|
||||||
|
|
||||||
$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lc.s
|
||||||
|
m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@
|
||||||
|
rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s
|
||||||
|
|
||||||
$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rn.s
|
||||||
|
m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@
|
||||||
|
rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s
|
||||||
|
|
||||||
$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rt.s
|
||||||
|
m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@
|
||||||
|
rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s
|
||||||
|
|
||||||
$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ctrmm_kernel_rr.s
|
||||||
|
m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@
|
||||||
|
rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s
|
||||||
|
|
||||||
$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ctrmm_kernel_RC.s
|
||||||
|
m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@
|
||||||
|
rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s
|
||||||
|
|
||||||
$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_ln.s
|
||||||
|
m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@
|
||||||
|
rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s
|
||||||
|
|
||||||
$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_lt.s
|
||||||
|
m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@
|
||||||
|
rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s
|
||||||
|
|
||||||
$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lr.s
|
||||||
|
m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@
|
||||||
|
rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s
|
||||||
|
|
||||||
$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lc.s
|
||||||
|
m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@
|
||||||
|
rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s
|
||||||
|
|
||||||
$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rn.s
|
||||||
|
m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@
|
||||||
|
rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s
|
||||||
|
|
||||||
$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rt.s
|
||||||
|
m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@
|
||||||
|
rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s
|
||||||
|
|
||||||
$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rr.s
|
||||||
|
m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@
|
||||||
|
rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s
|
||||||
|
|
||||||
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rc.s
|
||||||
|
m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@
|
||||||
|
rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s
|
||||||
else
|
else
|
||||||
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
||||||
|
@ -677,7 +804,10 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
|
||||||
|
|
||||||
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s
|
||||||
|
m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
|
||||||
|
rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
|
||||||
|
|
||||||
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
|
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
||||||
|
@ -801,10 +931,16 @@ $(KDIR)strsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RT) $(ST
|
||||||
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@
|
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@
|
||||||
|
|
||||||
$(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DTRSMDEPEND)
|
$(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DTRSMDEPEND)
|
||||||
|
# $(CC) $(CFLAGS) -E $< -o dtrsm_kernel_ln.s
|
||||||
|
# m4 dtrsm_kernel_ln.s > dtrsm_kernel_ln_nomacros.s
|
||||||
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@
|
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@
|
||||||
|
# rm dtrsm_kernel_ln.s dtrsm_kernel_ln_nomacros.s
|
||||||
|
|
||||||
$(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND)
|
$(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND)
|
||||||
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@
|
$(CC) $(CFLAGS) -E -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o dtrsm_kernel_lt.s
|
||||||
|
m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s
|
||||||
|
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@
|
||||||
|
rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s
|
||||||
|
|
||||||
$(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND)
|
$(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND)
|
||||||
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@
|
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@
|
||||||
|
@ -1940,7 +2076,7 @@ $(SGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMITCOPY)
|
||||||
|
|
||||||
endif
|
endif
|
||||||
|
|
||||||
$(DGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMONCOPY)
|
$(DGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMONCOPY)
|
||||||
$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
|
||||||
$(DGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMOTCOPY)
|
$(DGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMOTCOPY)
|
||||||
|
@ -2044,7 +2180,10 @@ $(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMM
|
||||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@
|
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@
|
||||||
|
|
||||||
$(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
$(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@
|
$(CC) $(PFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s
|
||||||
|
m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
|
||||||
|
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
|
||||||
|
rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s
|
||||||
|
|
||||||
$(KDIR)cgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
$(KDIR)cgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@
|
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@
|
||||||
|
@ -2083,7 +2222,10 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
||||||
$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
|
$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
|
||||||
|
|
||||||
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
||||||
$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
|
$(CC) $(PFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s
|
||||||
|
m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
|
||||||
|
$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
|
||||||
|
rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
|
||||||
|
|
||||||
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
|
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
|
||||||
$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
||||||
|
|
|
@ -68,10 +68,10 @@ static float casum_kernel_16 (long n, float *x)
|
||||||
"addi %2, %2, 128 \n\t"
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -16 \n\t"
|
"addic. %1, %1, -16 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"xvabssp 48, 40 \n\t"
|
"xvabssp 48, 40 \n\t"
|
||||||
"xvabssp 49, 41 \n\t"
|
"xvabssp 49, 41 \n\t"
|
||||||
|
@ -108,9 +108,9 @@ static float casum_kernel_16 (long n, float *x)
|
||||||
"xvaddsp 38, 38, %x5 \n\t"
|
"xvaddsp 38, 38, %x5 \n\t"
|
||||||
"xvaddsp 39, 39, %x6 \n\t"
|
"xvaddsp 39, 39, %x6 \n\t"
|
||||||
|
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"xvabssp 48, 40 \n\t"
|
"xvabssp 48, 40 \n\t"
|
||||||
"xvabssp 49, 41 \n\t"
|
"xvabssp 49, 41 \n\t"
|
||||||
|
|
|
@ -62,10 +62,10 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
|
||||||
"addi %2, %2, 128 \n\t"
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -32 \n\t"
|
"addic. %1, %1, -32 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"stxvd2x 32, 0, %3 \n\t"
|
"stxvd2x 32, 0, %3 \n\t"
|
||||||
"stxvd2x 33, %5, %3 \n\t"
|
"stxvd2x 33, %5, %3 \n\t"
|
||||||
|
@ -108,9 +108,9 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
|
||||||
"addi %2, %2, 128 \n\t"
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -32 \n\t"
|
"addic. %1, %1, -32 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"stxvd2x 32, 0, %3 \n\t"
|
"stxvd2x 32, 0, %3 \n\t"
|
||||||
"stxvd2x 33, %5, %3 \n\t"
|
"stxvd2x 33, %5, %3 \n\t"
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
* Macros for N=4 and M=8
|
* Macros for N=4 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x8', `
|
||||||
|
#else
|
||||||
.macro COPY_4x8
|
.macro COPY_4x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
lxvw4x vs33, o16, A0
|
lxvw4x vs33, o16, A0
|
||||||
|
@ -93,13 +97,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvw4x vs46, o32, T1
|
stxvw4x vs46, o32, T1
|
||||||
stxvw4x vs47, o48, T1
|
stxvw4x vs47, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=4
|
* Macros for N=4 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x4', `
|
||||||
|
#else
|
||||||
.macro COPY_4x4
|
.macro COPY_4x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
lxvw4x vs33, o16, A0
|
lxvw4x vs33, o16, A0
|
||||||
|
@ -133,13 +145,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvw4x vs38, o32, T1
|
stxvw4x vs38, o32, T1
|
||||||
stxvw4x vs39, o48, T1
|
stxvw4x vs39, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=2
|
* Macros for N=4 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x2', `
|
||||||
|
#else
|
||||||
.macro COPY_4x2
|
.macro COPY_4x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
addi A0, A0, 16
|
addi A0, A0, 16
|
||||||
|
@ -163,13 +183,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxvw4x vs35, o48, T1
|
stxvw4x vs35, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=1
|
* Macros for N=4 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x1', `
|
||||||
|
#else
|
||||||
.macro COPY_4x1
|
.macro COPY_4x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsspx vs32, o0, A0
|
lxsspx vs32, o0, A0
|
||||||
lxsspx vs33, o4, A0
|
lxsspx vs33, o4, A0
|
||||||
|
@ -207,13 +235,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxsspx vs38, o0, T1
|
stxsspx vs38, o0, T1
|
||||||
stxsspx vs39, o4, T1
|
stxsspx vs39, o4, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=8
|
* Macros for N=2 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x8', `
|
||||||
|
#else
|
||||||
.macro COPY_2x8
|
.macro COPY_2x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
lxvw4x vs33, o16, A0
|
lxvw4x vs33, o16, A0
|
||||||
|
@ -241,13 +277,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvw4x vs38, o32, T1
|
stxvw4x vs38, o32, T1
|
||||||
stxvw4x vs39, o48, T1
|
stxvw4x vs39, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=4
|
* Macros for N=2 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x4', `
|
||||||
|
#else
|
||||||
.macro COPY_2x4
|
.macro COPY_2x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
lxvw4x vs33, o16, A0
|
lxvw4x vs33, o16, A0
|
||||||
|
@ -265,13 +309,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvw4x vs34, o32, T1
|
stxvw4x vs34, o32, T1
|
||||||
stxvw4x vs35, o48, T1
|
stxvw4x vs35, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=2
|
* Macros for N=2 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x2', `
|
||||||
|
#else
|
||||||
.macro COPY_2x2
|
.macro COPY_2x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
addi A0, A0, 16
|
addi A0, A0, 16
|
||||||
|
@ -285,13 +337,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxvw4x vs33, o16, T1
|
stxvw4x vs33, o16, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=1
|
* Macros for N=2 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x1', `
|
||||||
|
#else
|
||||||
.macro COPY_2x1
|
.macro COPY_2x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsspx vs32, o0, A0
|
lxsspx vs32, o0, A0
|
||||||
lxsspx vs33, o4, A0
|
lxsspx vs33, o4, A0
|
||||||
|
@ -311,13 +371,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxsspx vs34, o0, T1
|
stxsspx vs34, o0, T1
|
||||||
stxsspx vs35, o4, T1
|
stxsspx vs35, o4, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=8
|
* Macros for N=1 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x8', `
|
||||||
|
#else
|
||||||
.macro COPY_1x8
|
.macro COPY_1x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
lxvw4x vs33, o16, A0
|
lxvw4x vs33, o16, A0
|
||||||
|
@ -332,13 +400,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvw4x vs34, o32, T1
|
stxvw4x vs34, o32, T1
|
||||||
stxvw4x vs35, o48, T1
|
stxvw4x vs35, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=4
|
* Macros for N=1 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x4', `
|
||||||
|
#else
|
||||||
.macro COPY_1x4
|
.macro COPY_1x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
lxvw4x vs33, o16, A0
|
lxvw4x vs33, o16, A0
|
||||||
|
@ -349,13 +425,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvw4x vs32, o0, T1
|
stxvw4x vs32, o0, T1
|
||||||
stxvw4x vs33, o16, T1
|
stxvw4x vs33, o16, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=2
|
* Macros for N=1 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x2', `
|
||||||
|
#else
|
||||||
.macro COPY_1x2
|
.macro COPY_1x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
addi A0, A0, 16
|
addi A0, A0, 16
|
||||||
|
@ -364,13 +448,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxvw4x vs32, o0, T1
|
stxvw4x vs32, o0, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=1
|
* Macros for N=1 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x1', `
|
||||||
|
#else
|
||||||
.macro COPY_1x1
|
.macro COPY_1x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsspx vs32, o0, A0
|
lxsspx vs32, o0, A0
|
||||||
lxsspx vs33, o4, A0
|
lxsspx vs33, o4, A0
|
||||||
|
@ -381,5 +473,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxsspx vs32, o0, T1
|
stxsspx vs32, o0, T1
|
||||||
stxsspx vs33, o4, T1
|
stxsspx vs33, o4, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -56,9 +56,9 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s)
|
||||||
"addi %[x_ptr], %[x_ptr], 64 \n\t"
|
"addi %[x_ptr], %[x_ptr], 64 \n\t"
|
||||||
"addi %[y_ptr], %[y_ptr], 64 \n\t"
|
"addi %[y_ptr], %[y_ptr], 64 \n\t"
|
||||||
"addic. %[temp_n], %[temp_n], -8 \n\t"
|
"addic. %[temp_n], %[temp_n], -8 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
".p2align 5 \n\t"
|
".align 5 \n\t"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
"xvmulsp 40, 32, 36 \n\t" // c * x
|
"xvmulsp 40, 32, 36 \n\t" // c * x
|
||||||
"xvmulsp 41, 33, 36 \n\t"
|
"xvmulsp 41, 33, 36 \n\t"
|
||||||
"xvmulsp 42, 34, 36 \n\t"
|
"xvmulsp 42, 34, 36 \n\t"
|
||||||
|
@ -104,8 +104,8 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s)
|
||||||
"addi %[x_ptr], %[x_ptr], 128 \n\t"
|
"addi %[x_ptr], %[x_ptr], 128 \n\t"
|
||||||
"addi %[y_ptr], %[y_ptr], 128 \n\t"
|
"addi %[y_ptr], %[y_ptr], 128 \n\t"
|
||||||
"addic. %[temp_n], %[temp_n], -8 \n\t"
|
"addic. %[temp_n], %[temp_n], -8 \n\t"
|
||||||
"bgt 1b \n\t"
|
"bgt one%= \n\t"
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
"xvmulsp 40, 32, 36 \n\t" // c * x
|
"xvmulsp 40, 32, 36 \n\t" // c * x
|
||||||
"xvmulsp 41, 33, 36 \n\t"
|
"xvmulsp 41, 33, 36 \n\t"
|
||||||
"xvmulsp 42, 34, 36 \n\t"
|
"xvmulsp 42, 34, 36 \n\t"
|
||||||
|
|
|
@ -39,8 +39,8 @@ static void cswap_kernel_32 (long n, float *x, float *y)
|
||||||
{
|
{
|
||||||
__asm__
|
__asm__
|
||||||
(
|
(
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"lxvd2x 32, 0, %4 \n\t"
|
"lxvd2x 32, 0, %4 \n\t"
|
||||||
"lxvd2x 33, %5, %4 \n\t"
|
"lxvd2x 33, %5, %4 \n\t"
|
||||||
|
@ -131,7 +131,7 @@ static void cswap_kernel_32 (long n, float *x, float *y)
|
||||||
"addi %4, %4, 128 \n\t"
|
"addi %4, %4, 128 \n\t"
|
||||||
|
|
||||||
"addic. %2, %2, -32 \n\t"
|
"addic. %2, %2, -32 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
|
"#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
|
||||||
:
|
:
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -68,10 +68,10 @@ static double dasum_kernel_16 (long n, double *x)
|
||||||
"addi %2, %2, 128 \n\t"
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -16 \n\t"
|
"addic. %1, %1, -16 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"xvabsdp 48, 40 \n\t"
|
"xvabsdp 48, 40 \n\t"
|
||||||
"xvabsdp 49, 41 \n\t"
|
"xvabsdp 49, 41 \n\t"
|
||||||
|
@ -108,9 +108,9 @@ static double dasum_kernel_16 (long n, double *x)
|
||||||
"xvadddp 38, 38, %x5 \n\t"
|
"xvadddp 38, 38, %x5 \n\t"
|
||||||
"xvadddp 39, 39, %x6 \n\t"
|
"xvadddp 39, 39, %x6 \n\t"
|
||||||
|
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"xvabsdp 48, 40 \n\t"
|
"xvabsdp 48, 40 \n\t"
|
||||||
"xvabsdp 49, 41 \n\t"
|
"xvabsdp 49, 41 \n\t"
|
||||||
|
@ -140,7 +140,7 @@ static double dasum_kernel_16 (long n, double *x)
|
||||||
|
|
||||||
"xvadddp 32, 32, 36 \n\t"
|
"xvadddp 32, 32, 36 \n\t"
|
||||||
|
|
||||||
"xxswapd 33, 32 \n\t"
|
XXSWAPD_S(33,32)
|
||||||
"xsadddp %x0, 32, 33 \n"
|
"xsadddp %x0, 32, 33 \n"
|
||||||
|
|
||||||
"#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"
|
"#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"
|
||||||
|
|
|
@ -58,7 +58,7 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha)
|
||||||
|
|
||||||
__asm__
|
__asm__
|
||||||
(
|
(
|
||||||
"xxspltd %x4, %x22, 0 \n\t"
|
XXSPLTD_S(%x4,%x22,0)
|
||||||
|
|
||||||
"dcbt 0, %2 \n\t"
|
"dcbt 0, %2 \n\t"
|
||||||
"dcbt 0, %3 \n\t"
|
"dcbt 0, %3 \n\t"
|
||||||
|
@ -90,10 +90,10 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha)
|
||||||
"addi %3, %3, -64 \n\t"
|
"addi %3, %3, -64 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -16 \n\t"
|
"addic. %1, %1, -16 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"xvmaddadp %x13, %x5, %x4 \n\t"
|
"xvmaddadp %x13, %x5, %x4 \n\t"
|
||||||
"xvmaddadp %x14, %x6, %x4 \n\t"
|
"xvmaddadp %x14, %x6, %x4 \n\t"
|
||||||
|
@ -152,9 +152,9 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha)
|
||||||
"addi %3, %3, -64 \n\t"
|
"addi %3, %3, -64 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -16 \n\t"
|
"addic. %1, %1, -16 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"xvmaddadp %x13, %x5, %x4 \n\t"
|
"xvmaddadp %x13, %x5, %x4 \n\t"
|
||||||
"xvmaddadp %x14, %x6, %x4 \n\t"
|
"xvmaddadp %x14, %x6, %x4 \n\t"
|
||||||
|
|
|
@ -62,10 +62,10 @@ static void dcopy_kernel_32 (long n, double *x, double *y)
|
||||||
"addi %2, %2, 128 \n\t"
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -32 \n\t"
|
"addic. %1, %1, -32 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"stxvd2x 32, 0, %3 \n\t"
|
"stxvd2x 32, 0, %3 \n\t"
|
||||||
"stxvd2x 33, %5, %3 \n\t"
|
"stxvd2x 33, %5, %3 \n\t"
|
||||||
|
@ -108,9 +108,9 @@ static void dcopy_kernel_32 (long n, double *x, double *y)
|
||||||
"addi %2, %2, 128 \n\t"
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -32 \n\t"
|
"addic. %1, %1, -32 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"stxvd2x 32, 0, %3 \n\t"
|
"stxvd2x 32, 0, %3 \n\t"
|
||||||
"stxvd2x 33, %5, %3 \n\t"
|
"stxvd2x 33, %5, %3 \n\t"
|
||||||
|
|
|
@ -78,10 +78,10 @@ static double ddot_kernel_8 (long n, double *x, double *y)
|
||||||
"addi %3, %3, 128 \n\t"
|
"addi %3, %3, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -16 \n\t"
|
"addic. %1, %1, -16 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"xvmaddadp 32, 40, 48 \n\t"
|
"xvmaddadp 32, 40, 48 \n\t"
|
||||||
"lxvd2x 40, 0, %2 \n\t"
|
"lxvd2x 40, 0, %2 \n\t"
|
||||||
|
@ -112,9 +112,9 @@ static double ddot_kernel_8 (long n, double *x, double *y)
|
||||||
"addi %3, %3, 128 \n\t"
|
"addi %3, %3, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -16 \n\t"
|
"addic. %1, %1, -16 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"xvmaddadp 32, 40, 48 \n\t"
|
"xvmaddadp 32, 40, 48 \n\t"
|
||||||
"xvmaddadp 33, 41, 49 \n\t"
|
"xvmaddadp 33, 41, 49 \n\t"
|
||||||
|
@ -135,7 +135,7 @@ static double ddot_kernel_8 (long n, double *x, double *y)
|
||||||
|
|
||||||
"xvadddp 32, 32, 36 \n\t"
|
"xvadddp 32, 32, 36 \n\t"
|
||||||
|
|
||||||
"xxswapd 33, 32 \n\t"
|
XXSWAPD_S(33,32)
|
||||||
|
|
||||||
"xsadddp %x0, 32, 33 \n"
|
"xsadddp %x0, 32, 33 \n"
|
||||||
|
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
* Macros for N=4 and M=16
|
* Macros for N=4 and M=16
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x16', `
|
||||||
|
#else
|
||||||
.macro COPY_4x16
|
.macro COPY_4x16
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs0, o0, A0
|
lxvd2x vs0, o0, A0
|
||||||
lxvd2x vs1, o0, A1
|
lxvd2x vs1, o0, A1
|
||||||
|
@ -180,14 +184,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi BO, BO, 128
|
addi BO, BO, 128
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=8
|
* Macros for N=4 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x8', `
|
||||||
|
#else
|
||||||
.macro COPY_4x8
|
.macro COPY_4x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs0, o0, A0
|
lxvd2x vs0, o0, A0
|
||||||
lxvd2x vs1, o16, A0
|
lxvd2x vs1, o16, A0
|
||||||
|
@ -259,14 +271,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi BO, BO, 128
|
addi BO, BO, 128
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=4
|
* Macros for N=4 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x4', `
|
||||||
|
#else
|
||||||
.macro COPY_4x4
|
.macro COPY_4x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs0, o0, A0
|
lxvd2x vs0, o0, A0
|
||||||
lxvd2x vs1, o16, A0
|
lxvd2x vs1, o16, A0
|
||||||
|
@ -310,14 +330,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi BO, BO, 128
|
addi BO, BO, 128
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=2
|
* Macros for N=4 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x2', `
|
||||||
|
#else
|
||||||
.macro COPY_4x2
|
.macro COPY_4x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs0, o0, A0
|
lxvd2x vs0, o0, A0
|
||||||
addi A0, A0, 16
|
addi A0, A0, 16
|
||||||
|
@ -348,14 +376,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi BO, BO, 64
|
addi BO, BO, 64
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=1
|
* Macros for N=4 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x1', `
|
||||||
|
#else
|
||||||
.macro COPY_4x1
|
.macro COPY_4x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsdx vs0, o0, A0
|
lxsdx vs0, o0, A0
|
||||||
addi A0, A0, 8
|
addi A0, A0, 8
|
||||||
|
@ -382,14 +418,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi BO, BO, 32
|
addi BO, BO, 32
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=16
|
* Macros for N=2 and M=16
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x16', `
|
||||||
|
#else
|
||||||
.macro COPY_2x16
|
.macro COPY_2x16
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs0, o0, A0
|
lxvd2x vs0, o0, A0
|
||||||
lxvd2x vs1, o16, A0
|
lxvd2x vs1, o16, A0
|
||||||
|
@ -459,14 +503,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi BO, BO, 128
|
addi BO, BO, 128
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=8
|
* Macros for N=2 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x8', `
|
||||||
|
#else
|
||||||
.macro COPY_2x8
|
.macro COPY_2x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs0, o0, A0
|
lxvd2x vs0, o0, A0
|
||||||
lxvd2x vs1, o16, A0
|
lxvd2x vs1, o16, A0
|
||||||
|
@ -506,14 +558,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi BO, BO, 128
|
addi BO, BO, 128
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=4
|
* Macros for N=2 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x4', `
|
||||||
|
#else
|
||||||
.macro COPY_2x4
|
.macro COPY_2x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs0, o0, A0
|
lxvd2x vs0, o0, A0
|
||||||
lxvd2x vs1, o16, A0
|
lxvd2x vs1, o16, A0
|
||||||
|
@ -539,14 +599,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi BO, BO, 64
|
addi BO, BO, 64
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=2
|
* Macros for N=2 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x2', `
|
||||||
|
#else
|
||||||
.macro COPY_2x2
|
.macro COPY_2x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs0, o0, A0
|
lxvd2x vs0, o0, A0
|
||||||
addi A0, A0, 16
|
addi A0, A0, 16
|
||||||
|
@ -565,14 +633,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi BO, BO, 32
|
addi BO, BO, 32
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=1
|
* Macros for N=2 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x1', `
|
||||||
|
#else
|
||||||
.macro COPY_2x1
|
.macro COPY_2x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsdx vs0, o0, A0
|
lxsdx vs0, o0, A0
|
||||||
addi A0, A0, 8
|
addi A0, A0, 8
|
||||||
|
@ -589,14 +665,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi BO, BO, 16
|
addi BO, BO, 16
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=16
|
* Macros for N=1 and M=16
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x16', `
|
||||||
|
#else
|
||||||
.macro COPY_1x16
|
.macro COPY_1x16
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs0, o0, A0
|
lxvd2x vs0, o0, A0
|
||||||
lxvd2x vs1, o16, A0
|
lxvd2x vs1, o16, A0
|
||||||
|
@ -622,14 +706,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi BO, BO, 64
|
addi BO, BO, 64
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=8
|
* Macros for N=1 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x8', `
|
||||||
|
#else
|
||||||
.macro COPY_1x8
|
.macro COPY_1x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs0, o0, A0
|
lxvd2x vs0, o0, A0
|
||||||
lxvd2x vs1, o16, A0
|
lxvd2x vs1, o16, A0
|
||||||
|
@ -645,14 +737,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi BO, BO, 64
|
addi BO, BO, 64
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=4
|
* Macros for N=1 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x4', `
|
||||||
|
#else
|
||||||
.macro COPY_1x4
|
.macro COPY_1x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs0, o0, A0
|
lxvd2x vs0, o0, A0
|
||||||
lxvd2x vs1, o16, A0
|
lxvd2x vs1, o16, A0
|
||||||
|
@ -664,14 +764,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi BO, BO, 32
|
addi BO, BO, 32
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=2
|
* Macros for N=1 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x2', `
|
||||||
|
#else
|
||||||
.macro COPY_1x2
|
.macro COPY_1x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs0, o0, A0
|
lxvd2x vs0, o0, A0
|
||||||
addi A0, A0, 16
|
addi A0, A0, 16
|
||||||
|
@ -681,14 +789,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi BO, BO, 16
|
addi BO, BO, 16
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=1
|
* Macros for N=1 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x1', `
|
||||||
|
#else
|
||||||
.macro COPY_1x1
|
.macro COPY_1x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsdx vs0, o0, A0
|
lxsdx vs0, o0, A0
|
||||||
addi A0, A0, 8
|
addi A0, A0, 8
|
||||||
|
@ -698,5 +814,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi BO, BO, 8
|
addi BO, BO, 8
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
* Macros for N=4 and M=16
|
* Macros for N=4 and M=16
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x16', `
|
||||||
|
#else
|
||||||
.macro COPY_4x16
|
.macro COPY_4x16
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -140,14 +144,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs10, o32, T1
|
stxvd2x vs10, o32, T1
|
||||||
stxvd2x vs11, o48, T1
|
stxvd2x vs11, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=8
|
* Macros for N=4 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x8', `
|
||||||
|
#else
|
||||||
.macro COPY_4x8
|
.macro COPY_4x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -205,14 +217,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs46, o32, T1
|
stxvd2x vs46, o32, T1
|
||||||
stxvd2x vs47, o48, T1
|
stxvd2x vs47, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=4
|
* Macros for N=4 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x4', `
|
||||||
|
#else
|
||||||
.macro COPY_4x4
|
.macro COPY_4x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -250,14 +270,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs38, o32, T1
|
stxvd2x vs38, o32, T1
|
||||||
stxvd2x vs39, o48, T1
|
stxvd2x vs39, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=2
|
* Macros for N=4 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x2', `
|
||||||
|
#else
|
||||||
.macro COPY_4x2
|
.macro COPY_4x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
addi A0, A0, 16
|
addi A0, A0, 16
|
||||||
|
@ -285,14 +313,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxvd2x vs35, o48, T1
|
stxvd2x vs35, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=1
|
* Macros for N=4 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x1', `
|
||||||
|
#else
|
||||||
.macro COPY_4x1
|
.macro COPY_4x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsdx vs32, o0, A0
|
lxsdx vs32, o0, A0
|
||||||
addi A0, A0, 8
|
addi A0, A0, 8
|
||||||
|
@ -322,14 +358,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxsdx vs35, o8, T1
|
stxsdx vs35, o8, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=16
|
* Macros for N=2 and M=16
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x16', `
|
||||||
|
#else
|
||||||
.macro COPY_2x16
|
.macro COPY_2x16
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -383,14 +427,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs46, o32, T1
|
stxvd2x vs46, o32, T1
|
||||||
stxvd2x vs47, o48, T1
|
stxvd2x vs47, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=8
|
* Macros for N=2 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x8', `
|
||||||
|
#else
|
||||||
.macro COPY_2x8
|
.macro COPY_2x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -420,14 +472,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs38, o32, T1
|
stxvd2x vs38, o32, T1
|
||||||
stxvd2x vs39, o48, T1
|
stxvd2x vs39, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=4
|
* Macros for N=2 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x4', `
|
||||||
|
#else
|
||||||
.macro COPY_2x4
|
.macro COPY_2x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -447,14 +507,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs34, o32, T1
|
stxvd2x vs34, o32, T1
|
||||||
stxvd2x vs35, o48, T1
|
stxvd2x vs35, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=2
|
* Macros for N=2 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x2', `
|
||||||
|
#else
|
||||||
.macro COPY_2x2
|
.macro COPY_2x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
addi A0, A0, 16
|
addi A0, A0, 16
|
||||||
|
@ -470,14 +538,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxvd2x vs33, o16, T1
|
stxvd2x vs33, o16, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=1
|
* Macros for N=2 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x1', `
|
||||||
|
#else
|
||||||
.macro COPY_2x1
|
.macro COPY_2x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsdx vs32, o0, A0
|
lxsdx vs32, o0, A0
|
||||||
addi A0, A0, 8
|
addi A0, A0, 8
|
||||||
|
@ -493,14 +569,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxsdx vs33, o8, T1
|
stxsdx vs33, o8, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=16
|
* Macros for N=1 and M=16
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x16', `
|
||||||
|
#else
|
||||||
.macro COPY_1x16
|
.macro COPY_1x16
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -528,14 +612,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs38, o32, T1
|
stxvd2x vs38, o32, T1
|
||||||
stxvd2x vs39, o48, T1
|
stxvd2x vs39, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=8
|
* Macros for N=1 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x8', `
|
||||||
|
#else
|
||||||
.macro COPY_1x8
|
.macro COPY_1x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -551,14 +643,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs34, o32, T1
|
stxvd2x vs34, o32, T1
|
||||||
stxvd2x vs35, o48, T1
|
stxvd2x vs35, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=4
|
* Macros for N=1 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x4', `
|
||||||
|
#else
|
||||||
.macro COPY_1x4
|
.macro COPY_1x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -570,14 +670,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs32, o0, T1
|
stxvd2x vs32, o0, T1
|
||||||
stxvd2x vs33, o16, T1
|
stxvd2x vs33, o16, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=2
|
* Macros for N=1 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x2', `
|
||||||
|
#else
|
||||||
.macro COPY_1x2
|
.macro COPY_1x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
addi A0, A0, 16
|
addi A0, A0, 16
|
||||||
|
@ -587,14 +695,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxvd2x vs32, o0, T1
|
stxvd2x vs32, o0, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=1
|
* Macros for N=1 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x1', `
|
||||||
|
#else
|
||||||
.macro COPY_1x1
|
.macro COPY_1x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsdx vs32, o0, A0
|
lxsdx vs32, o0, A0
|
||||||
addi A0, A0, 8
|
addi A0, A0, 8
|
||||||
|
@ -604,5 +720,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxsdx vs32, o0, T1
|
stxsdx vs32, o0, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -46,7 +46,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
|
||||||
(
|
(
|
||||||
"lxvd2x 34, 0, %10 \n\t" // x0, x1
|
"lxvd2x 34, 0, %10 \n\t" // x0, x1
|
||||||
"lxvd2x 35, %11, %10 \n\t" // x2, x3
|
"lxvd2x 35, %11, %10 \n\t" // x2, x3
|
||||||
"xxspltd 32, %x9, 0 \n\t" // alpha, alpha
|
XXSPLTD_S(32,%x9,0) // alpha, alpha
|
||||||
|
|
||||||
"sldi %6, %13, 3 \n\t" // lda * sizeof (double)
|
"sldi %6, %13, 3 \n\t" // lda * sizeof (double)
|
||||||
|
|
||||||
|
@ -56,10 +56,10 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
|
||||||
"add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda
|
"add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda
|
||||||
"add %6, %6, %6 \n\t" // 2 * lda
|
"add %6, %6, %6 \n\t" // 2 * lda
|
||||||
|
|
||||||
"xxspltd 32, 34, 0 \n\t" // x0 * alpha, x0 * alpha
|
XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha
|
||||||
"xxspltd 33, 34, 1 \n\t" // x1 * alpha, x1 * alpha
|
XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha
|
||||||
"xxspltd 34, 35, 0 \n\t" // x2 * alpha, x2 * alpha
|
XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha
|
||||||
"xxspltd 35, 35, 1 \n\t" // x3 * alpha, x3 * alpha
|
XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha
|
||||||
|
|
||||||
"add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda
|
"add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda
|
||||||
"add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda
|
"add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda
|
||||||
|
@ -89,10 +89,10 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
|
||||||
"addi %6, %6, 32 \n\t"
|
"addi %6, %6, 32 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -4 \n\t"
|
"addic. %1, %1, -4 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"lxvd2x 36, 0, %2 \n\t" // y0, y1
|
"lxvd2x 36, 0, %2 \n\t" // y0, y1
|
||||||
"lxvd2x 37, %11, %2 \n\t" // y2, y3
|
"lxvd2x 37, %11, %2 \n\t" // y2, y3
|
||||||
|
@ -131,7 +131,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
|
||||||
"addi %2, %2, 32 \n\t"
|
"addi %2, %2, 32 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -4 \n\t"
|
"addic. %1, %1, -4 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
|
|
||||||
"lxvd2x 36, 0, %2 \n\t" // y0, y1
|
"lxvd2x 36, 0, %2 \n\t" // y0, y1
|
||||||
|
@ -171,7 +171,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
|
||||||
"addi %2, %2, 32 \n\t"
|
"addi %2, %2, 32 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -4 \n\t"
|
"addic. %1, %1, -4 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
|
|
||||||
"lxvd2x 36, 0, %2 \n\t" // y0, y1
|
"lxvd2x 36, 0, %2 \n\t" // y0, y1
|
||||||
|
@ -211,7 +211,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
|
||||||
"addi %2, %2, 32 \n\t"
|
"addi %2, %2, 32 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -4 \n\t"
|
"addic. %1, %1, -4 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
|
|
||||||
"lxvd2x 36, 0, %2 \n\t" // y0, y1
|
"lxvd2x 36, 0, %2 \n\t" // y0, y1
|
||||||
|
@ -251,9 +251,9 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
|
||||||
"addi %2, %2, 32 \n\t"
|
"addi %2, %2, 32 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -4 \n\t"
|
"addic. %1, %1, -4 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"lxvd2x 36, 0, %2 \n\t" // y0, y1
|
"lxvd2x 36, 0, %2 \n\t" // y0, y1
|
||||||
"lxvd2x 37, %11, %2 \n\t" // y2, y3
|
"lxvd2x 37, %11, %2 \n\t" // y2, y3
|
||||||
|
|
|
@ -93,11 +93,11 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
|
||||||
"li %[off],32 \n\t"
|
"li %[off],32 \n\t"
|
||||||
|
|
||||||
|
|
||||||
"ble- 2f \n\t"
|
"ble- two%= \n\t"
|
||||||
|
|
||||||
//--------------------------------------------------
|
//--------------------------------------------------
|
||||||
".p2align 5 \n\t"
|
".align 5 \n\t"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
"xvmaddadp 34,36,32 \n\t"
|
"xvmaddadp 34,36,32 \n\t"
|
||||||
"xvmaddadp 35,38,32 \n\t"
|
"xvmaddadp 35,38,32 \n\t"
|
||||||
"addi %[off2], %[off2],32 \n\t"
|
"addi %[off2], %[off2],32 \n\t"
|
||||||
|
@ -137,7 +137,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
|
||||||
"lxvd2x 49, %[a6], %[off2] \n\t"
|
"lxvd2x 49, %[a6], %[off2] \n\t"
|
||||||
"lxvd2x 51, %[a7], %[off2] \n\t"
|
"lxvd2x 51, %[a7], %[off2] \n\t"
|
||||||
"lxvd2x 33, %[x], %[off2] \n\t"
|
"lxvd2x 33, %[x], %[off2] \n\t"
|
||||||
"ble- 2f \n\t"
|
"ble- two%= \n\t"
|
||||||
"xvmaddadp 34,36,32 \n\t"
|
"xvmaddadp 34,36,32 \n\t"
|
||||||
"xvmaddadp 35,38,32 \n\t"
|
"xvmaddadp 35,38,32 \n\t"
|
||||||
"addi %[off2], %[off2],32 \n\t"
|
"addi %[off2], %[off2],32 \n\t"
|
||||||
|
@ -177,7 +177,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
|
||||||
"lxvd2x 49, %[a6], %[off2] \n\t"
|
"lxvd2x 49, %[a6], %[off2] \n\t"
|
||||||
"lxvd2x 51, %[a7], %[off2] \n\t"
|
"lxvd2x 51, %[a7], %[off2] \n\t"
|
||||||
"lxvd2x 33, %[x], %[off2] \n\t"
|
"lxvd2x 33, %[x], %[off2] \n\t"
|
||||||
"ble- 2f \n\t"
|
"ble- two%= \n\t"
|
||||||
"xvmaddadp 34,36,32 \n\t"
|
"xvmaddadp 34,36,32 \n\t"
|
||||||
"xvmaddadp 35,38,32 \n\t"
|
"xvmaddadp 35,38,32 \n\t"
|
||||||
#if defined(PREFETCH)
|
#if defined(PREFETCH)
|
||||||
|
@ -229,7 +229,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
|
||||||
|
|
||||||
"lxvd2x 33, %[x], %[off2] \n\t"
|
"lxvd2x 33, %[x], %[off2] \n\t"
|
||||||
"addic. %[n],%[n],-4 \n\t"
|
"addic. %[n],%[n],-4 \n\t"
|
||||||
"ble- 2f \n\t"
|
"ble- two%= \n\t"
|
||||||
|
|
||||||
"addi %[off2], %[off2],32 \n\t"
|
"addi %[off2], %[off2],32 \n\t"
|
||||||
#if defined(PREFETCH)
|
#if defined(PREFETCH)
|
||||||
|
@ -288,9 +288,9 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
|
||||||
#if defined(PREFETCH)
|
#if defined(PREFETCH)
|
||||||
"dcbt %[temp],%[x] \n\t"
|
"dcbt %[temp],%[x] \n\t"
|
||||||
#endif
|
#endif
|
||||||
"bgt+ 1b \n\t"
|
"bgt+ one%= \n\t"
|
||||||
".p2align 5 \n\t"
|
".align 5 \n\t"
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
//--------------------------------------------
|
//--------------------------------------------
|
||||||
|
|
||||||
"xvmaddadp 34,36,32 \n\t"
|
"xvmaddadp 34,36,32 \n\t"
|
||||||
|
@ -301,7 +301,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
|
||||||
"xvmaddadp 7,46,32 \n\t"
|
"xvmaddadp 7,46,32 \n\t"
|
||||||
"xvmaddadp 8,48,32 \n\t"
|
"xvmaddadp 8,48,32 \n\t"
|
||||||
"xvmaddadp 9,50,32 \n\t"
|
"xvmaddadp 9,50,32 \n\t"
|
||||||
"xxspltd 36, %x[alpha], 0 \n\t"
|
XXSPLTD_S(36,%x[alpha],0)
|
||||||
"xvmaddadp 34,37,33 \n\t"
|
"xvmaddadp 34,37,33 \n\t"
|
||||||
"xvmaddadp 35,39,33 \n\t"
|
"xvmaddadp 35,39,33 \n\t"
|
||||||
"xvmaddadp 4,41,33 \n\t"
|
"xvmaddadp 4,41,33 \n\t"
|
||||||
|
@ -322,21 +322,21 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
"xxmrgld 42,34,35 \n\t"
|
XXMRGLD_S(42,34,35)
|
||||||
"xxmrghd 43,34,35 \n\t"
|
XXMRGHD_S(43,34,35)
|
||||||
|
|
||||||
"xxmrgld 44,4,5 \n\t"
|
XXMRGLD_S(44,4,5)
|
||||||
"xxmrghd 45,4,5 \n\t"
|
XXMRGHD_S(45,4,5)
|
||||||
|
|
||||||
"xvadddp 42,42,43 \n\t"
|
"xvadddp 42,42,43 \n\t"
|
||||||
|
|
||||||
"xxmrgld 46,6,7 \n\t"
|
XXMRGLD_S(46,6,7)
|
||||||
"xxmrghd 47,6,7 \n\t"
|
XXMRGHD_S(47,6,7)
|
||||||
|
|
||||||
"xvadddp 44,44,45 \n\t"
|
"xvadddp 44,44,45 \n\t"
|
||||||
|
|
||||||
"xxmrgld 48,8,9 \n\t"
|
XXMRGLD_S(48,8,9)
|
||||||
"xxmrghd 49,8,9 \n\t"
|
XXMRGHD_S(49,8,9)
|
||||||
|
|
||||||
"xvadddp 46,46,47 \n\t"
|
"xvadddp 46,46,47 \n\t"
|
||||||
|
|
||||||
|
|
|
@ -51,8 +51,8 @@ static void drot_kernel_16 (long n, double *x, double *y, double c, double s)
|
||||||
|
|
||||||
__asm__
|
__asm__
|
||||||
(
|
(
|
||||||
"xxspltd 36, %x13, 0 \n\t" // load c to both dwords
|
XXSPLTD_S(36,%x13,0) // load c to both dwords
|
||||||
"xxspltd 37, %x14, 0 \n\t" // load s to both dwords
|
XXSPLTD_S(37,%x14,0) // load s to both dwords
|
||||||
|
|
||||||
"lxvd2x 32, 0, %3 \n\t" // load x
|
"lxvd2x 32, 0, %3 \n\t" // load x
|
||||||
"lxvd2x 33, %15, %3 \n\t"
|
"lxvd2x 33, %15, %3 \n\t"
|
||||||
|
@ -68,10 +68,10 @@ static void drot_kernel_16 (long n, double *x, double *y, double c, double s)
|
||||||
"addi %4, %4, 64 \n\t"
|
"addi %4, %4, 64 \n\t"
|
||||||
|
|
||||||
"addic. %2, %2, -8 \n\t"
|
"addic. %2, %2, -8 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"xvmuldp 40, 32, 36 \n\t" // c * x
|
"xvmuldp 40, 32, 36 \n\t" // c * x
|
||||||
"xvmuldp 41, 33, 36 \n\t"
|
"xvmuldp 41, 33, 36 \n\t"
|
||||||
|
@ -135,9 +135,9 @@ static void drot_kernel_16 (long n, double *x, double *y, double c, double s)
|
||||||
"addi %4, %4, 128 \n\t"
|
"addi %4, %4, 128 \n\t"
|
||||||
|
|
||||||
"addic. %2, %2, -8 \n\t"
|
"addic. %2, %2, -8 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"xvmuldp 40, 32, 36 \n\t" // c * x
|
"xvmuldp 40, 32, 36 \n\t" // c * x
|
||||||
"xvmuldp 41, 33, 36 \n\t"
|
"xvmuldp 41, 33, 36 \n\t"
|
||||||
|
|
|
@ -41,7 +41,7 @@ static void dscal_kernel_8 (long n, double *x, double alpha)
|
||||||
(
|
(
|
||||||
"dcbt 0, %2 \n\t"
|
"dcbt 0, %2 \n\t"
|
||||||
|
|
||||||
"xxspltd %x3, %x3, 0 \n\t"
|
XXSPLTD_S(%x3,%x3,0)
|
||||||
|
|
||||||
"lxvd2x 32, 0, %2 \n\t"
|
"lxvd2x 32, 0, %2 \n\t"
|
||||||
"lxvd2x 33, %4, %2 \n\t"
|
"lxvd2x 33, %4, %2 \n\t"
|
||||||
|
@ -55,10 +55,10 @@ static void dscal_kernel_8 (long n, double *x, double alpha)
|
||||||
"addi %2, %2, 128 \n\t"
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -16 \n\t"
|
"addic. %1, %1, -16 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"xvmuldp 40, 32, %x3 \n\t"
|
"xvmuldp 40, 32, %x3 \n\t"
|
||||||
"xvmuldp 41, 33, %x3 \n\t"
|
"xvmuldp 41, 33, %x3 \n\t"
|
||||||
|
@ -91,9 +91,9 @@ static void dscal_kernel_8 (long n, double *x, double alpha)
|
||||||
"addi %2, %2, 256 \n\t"
|
"addi %2, %2, 256 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -16 \n\t"
|
"addic. %1, %1, -16 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"xvmuldp 40, 32, %x3 \n\t"
|
"xvmuldp 40, 32, %x3 \n\t"
|
||||||
"xvmuldp 41, 33, %x3 \n\t"
|
"xvmuldp 41, 33, %x3 \n\t"
|
||||||
|
@ -146,8 +146,8 @@ static void dscal_kernel_8_zero (long n, double *x)
|
||||||
(
|
(
|
||||||
"xxlxor %x3, %x3, %x3 \n\t"
|
"xxlxor %x3, %x3, %x3 \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"stxvd2x %x3, 0, %2 \n\t"
|
"stxvd2x %x3, 0, %2 \n\t"
|
||||||
"stxvd2x %x3, %4, %2 \n\t"
|
"stxvd2x %x3, %4, %2 \n\t"
|
||||||
|
@ -161,7 +161,7 @@ static void dscal_kernel_8_zero (long n, double *x)
|
||||||
"addi %2, %2, 128 \n\t"
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -16 \n\t"
|
"addic. %1, %1, -16 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
|
"#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
|
||||||
:
|
:
|
||||||
|
|
|
@ -39,8 +39,8 @@ static void dswap_kernel_32 (long n, double *x, double *y)
|
||||||
{
|
{
|
||||||
__asm__
|
__asm__
|
||||||
(
|
(
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"lxvd2x 32, 0, %4 \n\t"
|
"lxvd2x 32, 0, %4 \n\t"
|
||||||
"lxvd2x 33, %5, %4 \n\t"
|
"lxvd2x 33, %5, %4 \n\t"
|
||||||
|
@ -131,7 +131,7 @@ static void dswap_kernel_32 (long n, double *x, double *y)
|
||||||
"addi %4, %4, 128 \n\t"
|
"addi %4, %4, 128 \n\t"
|
||||||
|
|
||||||
"addic. %2, %2, -32 \n\t"
|
"addic. %2, %2, -32 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
|
"#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
|
||||||
:
|
:
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -58,8 +58,8 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
|
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
|
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
|
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
|
||||||
|
|
||||||
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
|
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
|
||||||
"vaddudm 9,8,%[adder] \n\t" //{3,2} vs41
|
"vaddudm 9,8,%[adder] \n\t" //{3,2} vs41
|
||||||
|
@ -69,7 +69,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
|
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
|
||||||
"xxlxor 39,39,39 \n\t" // vs39 vec_max_value
|
"xxlxor 39,39,39 \n\t" // vs39 vec_max_value
|
||||||
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
|
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
|
||||||
"xxspltd 36,36,0 \n\t"
|
XXSPLTD_S(36,36,0)
|
||||||
|
|
||||||
"xvabsdp 44, 44 \n\t"
|
"xvabsdp 44, 44 \n\t"
|
||||||
"xvabsdp 45, 45 \n\t"
|
"xvabsdp 45, 45 \n\t"
|
||||||
|
@ -77,21 +77,21 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
"xvabsdp 47, 47 \n\t"
|
"xvabsdp 47, 47 \n\t"
|
||||||
"xvabsdp 48, 48 \n\t"
|
"xvabsdp 48, 48 \n\t"
|
||||||
"xvabsdp 49, 49 \n\t"
|
"xvabsdp 49, 49 \n\t"
|
||||||
"xvabsdp 50, 50 \n\t"
|
"xvabsdp 6, 6 \n\t"
|
||||||
"xvabsdp 51, 51 \n\t"
|
"xvabsdp 7, 7 \n\t"
|
||||||
|
|
||||||
//jump first half forward
|
//jump first half forward
|
||||||
"b 2f \n\t"
|
"b two%= \n\t"
|
||||||
|
|
||||||
//===================================================================
|
//===================================================================
|
||||||
|
|
||||||
".p2align 5 \n\t"
|
".align 5 \n\t"
|
||||||
|
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
"xvcmpgtdp 2,45,44 \n\t "
|
"xvcmpgtdp 2,45,44 \n\t "
|
||||||
"xvcmpgtdp 3,47,46 \n\t "
|
"xvcmpgtdp 3,47,46 \n\t "
|
||||||
"xvcmpgtdp 4,49,48 \n\t "
|
"xvcmpgtdp 4,49,48 \n\t "
|
||||||
"xvcmpgtdp 5,51,50 \n\t"
|
"xvcmpgtdp 5,7,6 \n\t"
|
||||||
|
|
||||||
"xxsel 32,40,41,2 \n\t"
|
"xxsel 32,40,41,2 \n\t"
|
||||||
"xxsel 0,44,45,2 \n\t"
|
"xxsel 0,44,45,2 \n\t"
|
||||||
|
@ -100,7 +100,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
"xxsel 34,40,41,4 \n\t"
|
"xxsel 34,40,41,4 \n\t"
|
||||||
"xxsel 45,48,49,4 \n\t"
|
"xxsel 45,48,49,4 \n\t"
|
||||||
"xxsel 35,42,43,5 \n\t"
|
"xxsel 35,42,43,5 \n\t"
|
||||||
"xxsel 47,50,51,5 \n\t"
|
"xxsel 47,6,7,5 \n\t"
|
||||||
|
|
||||||
"xvcmpgtdp 2, 1,0 \n\t"
|
"xvcmpgtdp 2, 1,0 \n\t"
|
||||||
"xvcmpgtdp 3,47, 45 \n\t"
|
"xvcmpgtdp 3,47, 45 \n\t"
|
||||||
|
@ -134,8 +134,8 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
|
|
||||||
"vaddudm 1,1,5 \n\t" // get real index for first bigger
|
"vaddudm 1,1,5 \n\t" // get real index for first bigger
|
||||||
|
|
||||||
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
|
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
|
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
|
||||||
|
|
||||||
//compare with previous to get vec_max_index(v6 | vs38 ) and vec_max_value (vs39)
|
//compare with previous to get vec_max_index(v6 | vs38 ) and vec_max_value (vs39)
|
||||||
"xvcmpgtdp 2, 3,39 \n\t"
|
"xvcmpgtdp 2, 3,39 \n\t"
|
||||||
|
@ -155,16 +155,16 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
|
|
||||||
"xvabsdp 48, 48 \n\t"
|
"xvabsdp 48, 48 \n\t"
|
||||||
"xvabsdp 49, 49 \n\t"
|
"xvabsdp 49, 49 \n\t"
|
||||||
"xvabsdp 50, 50 \n\t"
|
"xvabsdp 6, 6 \n\t"
|
||||||
"xvabsdp 51, 51 \n\t"
|
"xvabsdp 7, 7 \n\t"
|
||||||
|
|
||||||
//<-----------jump here from first load
|
//<-----------jump here from first load
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"xvcmpgtdp 2,45,44 \n\t "
|
"xvcmpgtdp 2,45,44 \n\t "
|
||||||
"xvcmpgtdp 3,47,46 \n\t "
|
"xvcmpgtdp 3,47,46 \n\t "
|
||||||
"xvcmpgtdp 4,49,48 \n\t "
|
"xvcmpgtdp 4,49,48 \n\t "
|
||||||
"xvcmpgtdp 5,51,50 \n\t"
|
"xvcmpgtdp 5,7,6 \n\t"
|
||||||
|
|
||||||
"xxsel 32,40,41,2 \n\t"
|
"xxsel 32,40,41,2 \n\t"
|
||||||
"xxsel 0,44,45,2 \n\t"
|
"xxsel 0,44,45,2 \n\t"
|
||||||
|
@ -173,7 +173,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
"xxsel 34,40,41,4 \n\t"
|
"xxsel 34,40,41,4 \n\t"
|
||||||
"xxsel 45,48,49,4 \n\t"
|
"xxsel 45,48,49,4 \n\t"
|
||||||
"xxsel 35,42,43,5 \n\t"
|
"xxsel 35,42,43,5 \n\t"
|
||||||
"xxsel 47,50,51,5 \n\t"
|
"xxsel 47,6,7,5 \n\t"
|
||||||
|
|
||||||
"xvcmpgtdp 2, 1,0 \n\t"
|
"xvcmpgtdp 2, 1,0 \n\t"
|
||||||
"xvcmpgtdp 3,47, 45 \n\t"
|
"xvcmpgtdp 3,47, 45 \n\t"
|
||||||
|
@ -203,8 +203,8 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
|
|
||||||
"vaddudm 1,1,5 \n\t" // get real index for first bigger
|
"vaddudm 1,1,5 \n\t" // get real index for first bigger
|
||||||
|
|
||||||
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
|
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
|
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -226,21 +226,21 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
|
|
||||||
"xvabsdp 48, 48 \n\t"
|
"xvabsdp 48, 48 \n\t"
|
||||||
"xvabsdp 49, 49 \n\t"
|
"xvabsdp 49, 49 \n\t"
|
||||||
"xvabsdp 50, 50 \n\t"
|
"xvabsdp 6, 6 \n\t"
|
||||||
"xvabsdp 51, 51 \n\t"
|
"xvabsdp 7, 7 \n\t"
|
||||||
|
|
||||||
//decrement n
|
//decrement n
|
||||||
"addic. %[n], %[n], -32 \n\t"
|
"addic. %[n], %[n], -32 \n\t"
|
||||||
|
|
||||||
//Loop back if >0
|
//Loop back if >0
|
||||||
"bgt+ 1b \n\t"
|
"bgt+ one%= \n\t"
|
||||||
|
|
||||||
//==============================================================================
|
//==============================================================================
|
||||||
|
|
||||||
"xvcmpgtdp 2,45,44 \n\t "
|
"xvcmpgtdp 2,45,44 \n\t "
|
||||||
"xvcmpgtdp 3,47,46 \n\t "
|
"xvcmpgtdp 3,47,46 \n\t "
|
||||||
"xvcmpgtdp 4,49,48 \n\t "
|
"xvcmpgtdp 4,49,48 \n\t "
|
||||||
"xvcmpgtdp 5,51,50 \n\t"
|
"xvcmpgtdp 5,7,6 \n\t"
|
||||||
|
|
||||||
"xxsel 32,40,41,2 \n\t"
|
"xxsel 32,40,41,2 \n\t"
|
||||||
"xxsel 0,44,45,2 \n\t"
|
"xxsel 0,44,45,2 \n\t"
|
||||||
|
@ -249,7 +249,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
"xxsel 34,40,41,4 \n\t"
|
"xxsel 34,40,41,4 \n\t"
|
||||||
"xxsel 45,48,49,4 \n\t"
|
"xxsel 45,48,49,4 \n\t"
|
||||||
"xxsel 35,42,43,5 \n\t"
|
"xxsel 35,42,43,5 \n\t"
|
||||||
"xxsel 47,50,51,5 \n\t"
|
"xxsel 47,6,7,5 \n\t"
|
||||||
|
|
||||||
"xvcmpgtdp 2, 1,0 \n\t"
|
"xvcmpgtdp 2, 1,0 \n\t"
|
||||||
"xvcmpgtdp 3,47, 45 \n\t"
|
"xvcmpgtdp 3,47, 45 \n\t"
|
||||||
|
@ -276,28 +276,28 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
|
|
||||||
///////extract max value and max index from vector
|
///////extract max value and max index from vector
|
||||||
|
|
||||||
"xxspltd 32,38,1 \n\t"
|
XXSPLTD_S(32,38,1)
|
||||||
"xxspltd 40,39,1 \n\t"
|
XXSPLTD_S(40,39,1)
|
||||||
"xvcmpeqdp. 2, 40,39 \n\t"
|
"xvcmpeqdp. 2, 40,39 \n\t"
|
||||||
|
|
||||||
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
|
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
|
||||||
//0b001110=14
|
//0b001110=14
|
||||||
"bc 14,24, 3f \n\t"
|
"bc 14,24, three%= \n\t"
|
||||||
"xvcmpgtdp 4, 40,39 \n\t"
|
"xvcmpgtdp 4, 40,39 \n\t"
|
||||||
"xxsel 0,39,40,4 \n\t"
|
"xxsel 0,39,40,4 \n\t"
|
||||||
"xxsel 1,38,32,4 \n\t"
|
"xxsel 1,38,32,4 \n\t"
|
||||||
"stxsdx 0,0,%[ptr_maxf] \n\t"
|
"stxsdx 0,0,%[ptr_maxf] \n\t"
|
||||||
"b 4f \n\t"
|
"b four%= \n\t"
|
||||||
|
|
||||||
"3: \n\t"
|
"three%=: \n\t"
|
||||||
//if elements value are equal then choose minimum index
|
//if elements value are equal then choose minimum index
|
||||||
"xxspltd 0,40,0 \n\t"
|
XXSPLTD_S(0,40,0)
|
||||||
"vminud 0,0,6 \n\t" //vs32 vs38
|
"vminud 0,0,6 \n\t" //vs32 vs38
|
||||||
"xxlor 1,32,32 \n\t"
|
"xxlor 1,32,32 \n\t"
|
||||||
"stxsdx 0,0,%[ptr_maxf] \n\t"
|
"stxsdx 0,0,%[ptr_maxf] \n\t"
|
||||||
|
|
||||||
|
|
||||||
"4: \n\t"
|
"four%=: \n\t"
|
||||||
"mfvsrd %[index],1 \n\t"
|
"mfvsrd %[index],1 \n\t"
|
||||||
|
|
||||||
: [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
|
: [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
|
||||||
|
@ -306,7 +306,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
|
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
|
||||||
[start] "v"(start), [adder] "v"(temp_add_index)
|
[start] "v"(start), [adder] "v"(temp_add_index)
|
||||||
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
|
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
|
||||||
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
|
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7"
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -58,8 +58,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
|
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
|
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
|
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
|
||||||
|
|
||||||
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
|
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
|
||||||
"vaddudm 9,8, %[adder] \n\t" //{3,2} vs41
|
"vaddudm 9,8, %[adder] \n\t" //{3,2} vs41
|
||||||
|
@ -69,7 +69,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
|
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
|
||||||
"lxvdsx 39,0,%[ptr_minf] \n\t" // vs39 vec_min_value
|
"lxvdsx 39,0,%[ptr_minf] \n\t" // vs39 vec_min_value
|
||||||
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
|
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
|
||||||
"xxspltd 36,36,0 \n\t"
|
XXSPLTD_S(36,36,0)
|
||||||
"xvabsdp 39, 39 \n\t"
|
"xvabsdp 39, 39 \n\t"
|
||||||
|
|
||||||
"xvabsdp 44, 44 \n\t"
|
"xvabsdp 44, 44 \n\t"
|
||||||
|
@ -78,21 +78,21 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
"xvabsdp 47, 47 \n\t"
|
"xvabsdp 47, 47 \n\t"
|
||||||
"xvabsdp 48, 48 \n\t"
|
"xvabsdp 48, 48 \n\t"
|
||||||
"xvabsdp 49, 49 \n\t"
|
"xvabsdp 49, 49 \n\t"
|
||||||
"xvabsdp 50, 50 \n\t"
|
"xvabsdp 6, 6 \n\t"
|
||||||
"xvabsdp 51, 51 \n\t"
|
"xvabsdp 7, 7 \n\t"
|
||||||
|
|
||||||
//jump first half forward
|
//jump first half forward
|
||||||
"b 2f \n\t"
|
"b two%= \n\t"
|
||||||
|
|
||||||
//===================================================================
|
//===================================================================
|
||||||
|
|
||||||
".p2align 5 \n\t"
|
".align 5 \n\t"
|
||||||
|
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
"xvcmpgtdp 2,44,45 \n\t "
|
"xvcmpgtdp 2,44,45 \n\t "
|
||||||
"xvcmpgtdp 3,46,47 \n\t "
|
"xvcmpgtdp 3,46,47 \n\t "
|
||||||
"xvcmpgtdp 4,48,49 \n\t "
|
"xvcmpgtdp 4,48,49 \n\t "
|
||||||
"xvcmpgtdp 5,50,51 \n\t"
|
"xvcmpgtdp 5,6,7 \n\t"
|
||||||
|
|
||||||
"xxsel 32,40,41,2 \n\t"
|
"xxsel 32,40,41,2 \n\t"
|
||||||
"xxsel 0,44,45,2 \n\t"
|
"xxsel 0,44,45,2 \n\t"
|
||||||
|
@ -101,7 +101,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
"xxsel 34,40,41,4 \n\t"
|
"xxsel 34,40,41,4 \n\t"
|
||||||
"xxsel 45,48,49,4 \n\t"
|
"xxsel 45,48,49,4 \n\t"
|
||||||
"xxsel 35,42,43,5 \n\t"
|
"xxsel 35,42,43,5 \n\t"
|
||||||
"xxsel 47,50,51,5 \n\t"
|
"xxsel 47,6,7,5 \n\t"
|
||||||
|
|
||||||
"xvcmpgtdp 2,0, 1 \n\t"
|
"xvcmpgtdp 2,0, 1 \n\t"
|
||||||
"xvcmpgtdp 3, 45,47 \n\t"
|
"xvcmpgtdp 3, 45,47 \n\t"
|
||||||
|
@ -135,8 +135,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
|
|
||||||
"vaddudm 1,1,5 \n\t" // get real index for first smaller
|
"vaddudm 1,1,5 \n\t" // get real index for first smaller
|
||||||
|
|
||||||
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
|
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
|
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
|
||||||
|
|
||||||
//compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39)
|
//compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39)
|
||||||
"xvcmpgtdp 2,39, 3 \n\t"
|
"xvcmpgtdp 2,39, 3 \n\t"
|
||||||
|
@ -156,16 +156,16 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
|
|
||||||
"xvabsdp 48, 48 \n\t"
|
"xvabsdp 48, 48 \n\t"
|
||||||
"xvabsdp 49, 49 \n\t"
|
"xvabsdp 49, 49 \n\t"
|
||||||
"xvabsdp 50, 50 \n\t"
|
"xvabsdp 6, 6 \n\t"
|
||||||
"xvabsdp 51, 51 \n\t"
|
"xvabsdp 7, 7 \n\t"
|
||||||
|
|
||||||
//<-----------jump here from first load
|
//<-----------jump here from first load
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"xvcmpgtdp 2,44,45 \n\t "
|
"xvcmpgtdp 2,44,45 \n\t "
|
||||||
"xvcmpgtdp 3,46,47 \n\t "
|
"xvcmpgtdp 3,46,47 \n\t "
|
||||||
"xvcmpgtdp 4,48,49 \n\t "
|
"xvcmpgtdp 4,48,49 \n\t "
|
||||||
"xvcmpgtdp 5,50,51 \n\t"
|
"xvcmpgtdp 5,6,7 \n\t"
|
||||||
|
|
||||||
"xxsel 32,40,41,2 \n\t"
|
"xxsel 32,40,41,2 \n\t"
|
||||||
"xxsel 0,44,45,2 \n\t"
|
"xxsel 0,44,45,2 \n\t"
|
||||||
|
@ -174,7 +174,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
"xxsel 34,40,41,4 \n\t"
|
"xxsel 34,40,41,4 \n\t"
|
||||||
"xxsel 45,48,49,4 \n\t"
|
"xxsel 45,48,49,4 \n\t"
|
||||||
"xxsel 35,42,43,5 \n\t"
|
"xxsel 35,42,43,5 \n\t"
|
||||||
"xxsel 47,50,51,5 \n\t"
|
"xxsel 47,6,7,5 \n\t"
|
||||||
|
|
||||||
"xvcmpgtdp 2,0, 1 \n\t"
|
"xvcmpgtdp 2,0, 1 \n\t"
|
||||||
"xvcmpgtdp 3, 45,47 \n\t"
|
"xvcmpgtdp 3, 45,47 \n\t"
|
||||||
|
@ -204,8 +204,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
|
|
||||||
"vaddudm 1,1,5 \n\t" // get real index for first smaller
|
"vaddudm 1,1,5 \n\t" // get real index for first smaller
|
||||||
|
|
||||||
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
|
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
|
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -227,21 +227,21 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
|
|
||||||
"xvabsdp 48, 48 \n\t"
|
"xvabsdp 48, 48 \n\t"
|
||||||
"xvabsdp 49, 49 \n\t"
|
"xvabsdp 49, 49 \n\t"
|
||||||
"xvabsdp 50, 50 \n\t"
|
"xvabsdp 6, 6 \n\t"
|
||||||
"xvabsdp 51, 51 \n\t"
|
"xvabsdp 7, 7 \n\t"
|
||||||
|
|
||||||
//decrement n
|
//decrement n
|
||||||
"addic. %[n], %[n], -32 \n\t"
|
"addic. %[n], %[n], -32 \n\t"
|
||||||
|
|
||||||
//Loop back if >0
|
//Loop back if >0
|
||||||
"bgt+ 1b \n\t"
|
"bgt+ one%= \n\t"
|
||||||
|
|
||||||
//==============================================================================
|
//==============================================================================
|
||||||
|
|
||||||
"xvcmpgtdp 2,44,45 \n\t "
|
"xvcmpgtdp 2,44,45 \n\t "
|
||||||
"xvcmpgtdp 3,46,47 \n\t "
|
"xvcmpgtdp 3,46,47 \n\t "
|
||||||
"xvcmpgtdp 4,48,49 \n\t "
|
"xvcmpgtdp 4,48,49 \n\t "
|
||||||
"xvcmpgtdp 5,50,51 \n\t"
|
"xvcmpgtdp 5,6,7 \n\t"
|
||||||
|
|
||||||
"xxsel 32,40,41,2 \n\t"
|
"xxsel 32,40,41,2 \n\t"
|
||||||
"xxsel 0,44,45,2 \n\t"
|
"xxsel 0,44,45,2 \n\t"
|
||||||
|
@ -250,7 +250,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
"xxsel 34,40,41,4 \n\t"
|
"xxsel 34,40,41,4 \n\t"
|
||||||
"xxsel 45,48,49,4 \n\t"
|
"xxsel 45,48,49,4 \n\t"
|
||||||
"xxsel 35,42,43,5 \n\t"
|
"xxsel 35,42,43,5 \n\t"
|
||||||
"xxsel 47,50,51,5 \n\t"
|
"xxsel 47,6,7,5 \n\t"
|
||||||
|
|
||||||
"xvcmpgtdp 2,0, 1 \n\t"
|
"xvcmpgtdp 2,0, 1 \n\t"
|
||||||
"xvcmpgtdp 3, 45,47 \n\t"
|
"xvcmpgtdp 3, 45,47 \n\t"
|
||||||
|
@ -277,28 +277,28 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
|
|
||||||
///////extract min value and min index from vector
|
///////extract min value and min index from vector
|
||||||
|
|
||||||
"xxspltd 32,38,1 \n\t"
|
XXSPLTD_S(32,38,1)
|
||||||
"xxspltd 40,39,1 \n\t"
|
XXSPLTD_S(40,39,1)
|
||||||
"xvcmpeqdp. 2, 40,39 \n\t"
|
"xvcmpeqdp. 2, 40,39 \n\t"
|
||||||
|
|
||||||
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
|
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
|
||||||
//0b001110=14
|
//0b001110=14
|
||||||
"bc 14,24, 3f \n\t"
|
"bc 14,24, three%= \n\t"
|
||||||
"xvcmpgtdp 4,39, 40 \n\t"
|
"xvcmpgtdp 4,39, 40 \n\t"
|
||||||
"xxsel 0,39,40,4 \n\t"
|
"xxsel 0,39,40,4 \n\t"
|
||||||
"xxsel 1,38,32,4 \n\t"
|
"xxsel 1,38,32,4 \n\t"
|
||||||
"stxsdx 0,0,%[ptr_minf] \n\t"
|
"stxsdx 0,0,%[ptr_minf] \n\t"
|
||||||
"b 4f \n\t"
|
"b four%= \n\t"
|
||||||
|
|
||||||
"3: \n\t"
|
"three%=: \n\t"
|
||||||
//if elements value are equal then choose minimum index
|
//if elements value are equal then choose minimum index
|
||||||
"xxspltd 0,40,0 \n\t"
|
XXSPLTD_S(0,40,0)
|
||||||
"vminud 0,0,6 \n\t" //vs32 vs38
|
"vminud 0,0,6 \n\t" //vs32 vs38
|
||||||
"xxlor 1,32,32 \n\t"
|
"xxlor 1,32,32 \n\t"
|
||||||
"stxsdx 0,0,%[ptr_minf] \n\t"
|
"stxsdx 0,0,%[ptr_minf] \n\t"
|
||||||
|
|
||||||
|
|
||||||
"4: \n\t"
|
"four%=: \n\t"
|
||||||
"mfvsrd %[index],1 \n\t"
|
"mfvsrd %[index],1 \n\t"
|
||||||
|
|
||||||
: [minf] "=m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
|
: [minf] "=m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
|
||||||
|
@ -307,7 +307,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
|
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
|
||||||
[start] "v"(start), [adder] "v"(temp_add_index)
|
[start] "v"(start), [adder] "v"(temp_add_index)
|
||||||
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
|
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
|
||||||
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
|
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7"
|
||||||
);
|
);
|
||||||
|
|
||||||
return index;
|
return index;
|
||||||
|
|
|
@ -56,8 +56,8 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
|
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
|
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
|
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
|
||||||
|
|
||||||
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
|
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
|
||||||
"vaddudm 9,8,%[adder] \n\t" //{3,2} vs41
|
"vaddudm 9,8,%[adder] \n\t" //{3,2} vs41
|
||||||
|
@ -67,7 +67,7 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
|
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
|
||||||
"xxlxor 39,39,39 \n\t" // vs39 vec_max_value is zero
|
"xxlxor 39,39,39 \n\t" // vs39 vec_max_value is zero
|
||||||
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
|
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
|
||||||
"xxspltd 36,36,0 \n\t"
|
XXSPLTD_S(36,36,0)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -77,24 +77,24 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
"xvabsdp 47, 47 \n\t"
|
"xvabsdp 47, 47 \n\t"
|
||||||
"xvabsdp 48, 48 \n\t"
|
"xvabsdp 48, 48 \n\t"
|
||||||
"xvabsdp 49, 49 \n\t"
|
"xvabsdp 49, 49 \n\t"
|
||||||
"xvabsdp 50, 50 \n\t"
|
"xvabsdp 6, 6 \n\t"
|
||||||
"xvabsdp 51, 51 \n\t"
|
"xvabsdp 7, 7 \n\t"
|
||||||
|
|
||||||
//jump first half forward
|
//jump first half forward
|
||||||
"b 2f \n\t"
|
"b two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n\t"
|
".align 5 \n\t"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
|
|
||||||
"xxmrghd 0,44,45 \n\t"
|
XXMRGHD_S(0,44,45)
|
||||||
"xxmrgld 1,44,45 \n\t"
|
XXMRGLD_S(1,44,45)
|
||||||
"xxmrghd 2,46,47 \n\t"
|
XXMRGHD_S(2,46,47)
|
||||||
"xxmrgld 3,46,47 \n\t"
|
XXMRGLD_S(3,46,47)
|
||||||
"xxmrghd 4,48,49 \n\t"
|
XXMRGHD_S(4,48,49)
|
||||||
"xxmrgld 5,48,49 \n\t"
|
XXMRGLD_S(5,48,49)
|
||||||
"xxmrghd 44,50,51 \n\t"
|
XXMRGHD_S(44,6,7)
|
||||||
"xxmrgld 45,50,51 \n\t"
|
XXMRGLD_S(45,6,7)
|
||||||
|
|
||||||
"xvadddp 46, 0,1 \n\t"
|
"xvadddp 46, 0,1 \n\t"
|
||||||
"xvadddp 47, 2,3 \n\t"
|
"xvadddp 47, 2,3 \n\t"
|
||||||
|
@ -103,15 +103,15 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
"xvcmpgtdp 50,47,46 \n\t "
|
"xvcmpgtdp 6,47,46 \n\t "
|
||||||
"xvcmpgtdp 51,49,48 \n\t "
|
"xvcmpgtdp 7,49,48 \n\t "
|
||||||
|
|
||||||
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
|
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
|
||||||
|
|
||||||
"xxsel 32,40,41,50 \n\t"
|
"xxsel 32,40,41,6 \n\t"
|
||||||
"xxsel 0,46,47,50 \n\t"
|
"xxsel 0,46,47,6 \n\t"
|
||||||
"xxsel 33,42,43,51 \n\t"
|
"xxsel 33,42,43,7 \n\t"
|
||||||
"xxsel 1,48,49,51 \n\t"
|
"xxsel 1,48,49,7 \n\t"
|
||||||
|
|
||||||
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
|
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
|
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
|
||||||
|
@ -133,8 +133,8 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
|
|
||||||
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
|
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
|
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
|
||||||
//select with previous
|
//select with previous
|
||||||
"xxsel 38,38,32,4 \n\t"
|
"xxsel 38,38,32,4 \n\t"
|
||||||
"xxsel 39,39,3,4 \n\t"
|
"xxsel 39,39,3,4 \n\t"
|
||||||
|
@ -148,35 +148,35 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
"xvabsdp 47, 47 \n\t"
|
"xvabsdp 47, 47 \n\t"
|
||||||
"xvabsdp 48, 48 \n\t"
|
"xvabsdp 48, 48 \n\t"
|
||||||
"xvabsdp 49, 49 \n\t"
|
"xvabsdp 49, 49 \n\t"
|
||||||
"xvabsdp 50, 50 \n\t"
|
"xvabsdp 6, 6 \n\t"
|
||||||
"xvabsdp 51, 51 \n\t"
|
"xvabsdp 7, 7 \n\t"
|
||||||
|
|
||||||
|
|
||||||
//>>/////////////////////////////// half start
|
//>>/////////////////////////////// half start
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
"xxmrghd 0,44,45 \n\t"
|
XXMRGHD_S(0,44,45)
|
||||||
"xxmrgld 1,44,45 \n\t"
|
XXMRGLD_S(1,44,45)
|
||||||
"xxmrghd 2,46,47 \n\t"
|
XXMRGHD_S(2,46,47)
|
||||||
"xxmrgld 3,46,47 \n\t"
|
XXMRGLD_S(3,46,47)
|
||||||
"xxmrghd 4,48,49 \n\t"
|
XXMRGHD_S(4,48,49)
|
||||||
"xxmrgld 5,48,49 \n\t"
|
XXMRGLD_S(5,48,49)
|
||||||
"xxmrghd 44,50,51 \n\t"
|
XXMRGHD_S(44,6,7)
|
||||||
"xxmrgld 45,50,51 \n\t"
|
XXMRGLD_S(45,6,7)
|
||||||
|
|
||||||
"xvadddp 46, 0,1 \n\t"
|
"xvadddp 46, 0,1 \n\t"
|
||||||
"xvadddp 47, 2,3 \n\t"
|
"xvadddp 47, 2,3 \n\t"
|
||||||
"xvadddp 48, 4,5 \n\t"
|
"xvadddp 48, 4,5 \n\t"
|
||||||
"xvadddp 49, 44,45 \n\t"
|
"xvadddp 49, 44,45 \n\t"
|
||||||
|
|
||||||
"xvcmpgtdp 50,47,46 \n\t "
|
"xvcmpgtdp 6,47,46 \n\t "
|
||||||
"xvcmpgtdp 51,49,48 \n\t "
|
"xvcmpgtdp 7,49,48 \n\t "
|
||||||
|
|
||||||
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
|
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
|
||||||
|
|
||||||
"xxsel 32,40,41,50 \n\t"
|
"xxsel 32,40,41,6 \n\t"
|
||||||
"xxsel 0,46,47,50 \n\t"
|
"xxsel 0,46,47,6 \n\t"
|
||||||
"xxsel 33,42,43,51 \n\t"
|
"xxsel 33,42,43,7 \n\t"
|
||||||
"xxsel 1,48,49,51 \n\t"
|
"xxsel 1,48,49,7 \n\t"
|
||||||
|
|
||||||
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
|
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
|
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
|
||||||
|
@ -198,8 +198,8 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
|
|
||||||
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
|
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
|
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
|
||||||
//select with previous
|
//select with previous
|
||||||
"xxsel 38,38,32,4 \n\t"
|
"xxsel 38,38,32,4 \n\t"
|
||||||
"xxsel 39,39,3,4 \n\t"
|
"xxsel 39,39,3,4 \n\t"
|
||||||
|
@ -211,24 +211,24 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
"xvabsdp 47, 47 \n\t"
|
"xvabsdp 47, 47 \n\t"
|
||||||
"xvabsdp 48, 48 \n\t"
|
"xvabsdp 48, 48 \n\t"
|
||||||
"xvabsdp 49, 49 \n\t"
|
"xvabsdp 49, 49 \n\t"
|
||||||
"xvabsdp 50, 50 \n\t"
|
"xvabsdp 6, 6 \n\t"
|
||||||
"xvabsdp 51, 51 \n\t"
|
"xvabsdp 7, 7 \n\t"
|
||||||
|
|
||||||
|
|
||||||
//decrement n
|
//decrement n
|
||||||
"addic. %[n], %[n], -16 \n\t"
|
"addic. %[n], %[n], -16 \n\t"
|
||||||
//Loop back if >0
|
//Loop back if >0
|
||||||
"bgt+ 1b \n\t"
|
"bgt+ one%= \n\t"
|
||||||
|
|
||||||
|
|
||||||
"xxmrghd 0,44,45 \n\t"
|
XXMRGHD_S(0,44,45)
|
||||||
"xxmrgld 1,44,45 \n\t"
|
XXMRGLD_S(1,44,45)
|
||||||
"xxmrghd 2,46,47 \n\t"
|
XXMRGHD_S(2,46,47)
|
||||||
"xxmrgld 3,46,47 \n\t"
|
XXMRGLD_S(3,46,47)
|
||||||
"xxmrghd 4,48,49 \n\t"
|
XXMRGHD_S(4,48,49)
|
||||||
"xxmrgld 5,48,49 \n\t"
|
XXMRGLD_S(5,48,49)
|
||||||
"xxmrghd 44,50,51 \n\t"
|
XXMRGHD_S(44,6,7)
|
||||||
"xxmrgld 45,50,51 \n\t"
|
XXMRGLD_S(45,6,7)
|
||||||
|
|
||||||
"xvadddp 46, 0,1 \n\t"
|
"xvadddp 46, 0,1 \n\t"
|
||||||
"xvadddp 47, 2,3 \n\t"
|
"xvadddp 47, 2,3 \n\t"
|
||||||
|
@ -237,13 +237,13 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
"xvcmpgtdp 50,47,46 \n\t "
|
"xvcmpgtdp 6,47,46 \n\t "
|
||||||
"xvcmpgtdp 51,49,48 \n\t "
|
"xvcmpgtdp 7,49,48 \n\t "
|
||||||
|
|
||||||
"xxsel 32,40,41,50 \n\t"
|
"xxsel 32,40,41,6 \n\t"
|
||||||
"xxsel 0,46,47,50 \n\t"
|
"xxsel 0,46,47,6 \n\t"
|
||||||
"xxsel 33,42,43,51 \n\t"
|
"xxsel 33,42,43,7 \n\t"
|
||||||
"xxsel 1,48,49,51 \n\t"
|
"xxsel 1,48,49,7 \n\t"
|
||||||
|
|
||||||
"xvcmpgtdp 2,1,0 \n\t "
|
"xvcmpgtdp 2,1,0 \n\t "
|
||||||
"xxsel 32,32,33,2 \n\t"
|
"xxsel 32,32,33,2 \n\t"
|
||||||
|
@ -262,28 +262,28 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
|
|
||||||
///////extract max value and max index from vector
|
///////extract max value and max index from vector
|
||||||
|
|
||||||
"xxspltd 32,38,1 \n\t"
|
XXSPLTD_S(32,38,1)
|
||||||
"xxspltd 40,39,1 \n\t"
|
XXSPLTD_S(40,39,1)
|
||||||
"xvcmpeqdp. 2, 40,39 \n\t"
|
"xvcmpeqdp. 2, 40,39 \n\t"
|
||||||
|
|
||||||
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
|
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
|
||||||
//0b001110=14
|
//0b001110=14
|
||||||
"bc 14,24, 3f \n\t"
|
"bc 14,24, three%= \n\t"
|
||||||
"xvcmpgtdp 4, 40,39 \n\t"
|
"xvcmpgtdp 4, 40,39 \n\t"
|
||||||
"xxsel 0,39,40,4 \n\t"
|
"xxsel 0,39,40,4 \n\t"
|
||||||
"xxsel 1,38,32,4 \n\t"
|
"xxsel 1,38,32,4 \n\t"
|
||||||
"stxsdx 0,0,%[ptr_maxf] \n\t"
|
"stxsdx 0,0,%[ptr_maxf] \n\t"
|
||||||
"b 4f \n\t"
|
"b four%= \n\t"
|
||||||
|
|
||||||
"3: \n\t"
|
"three%=: \n\t"
|
||||||
//if elements value are equal then choose minimum index
|
//if elements value are equal then choose minimum index
|
||||||
"xxspltd 0,40,0 \n\t"
|
XXSPLTD_S(0,40,0)
|
||||||
"vminud 0,0,6 \n\t" //vs32 vs38
|
"vminud 0,0,6 \n\t" //vs32 vs38
|
||||||
"xxlor 1,32,32 \n\t"
|
"xxlor 1,32,32 \n\t"
|
||||||
"stxsdx 0,0,%[ptr_maxf] \n\t"
|
"stxsdx 0,0,%[ptr_maxf] \n\t"
|
||||||
|
|
||||||
|
|
||||||
"4: \n\t"
|
"four%=: \n\t"
|
||||||
"mfvsrd %[index],1 \n\t"
|
"mfvsrd %[index],1 \n\t"
|
||||||
|
|
||||||
: [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
|
: [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
|
||||||
|
@ -292,7 +292,7 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
|
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
|
||||||
[start] "v"(start), [adder] "v"(temp_add_index)
|
[start] "v"(start), [adder] "v"(temp_add_index)
|
||||||
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
|
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
|
||||||
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
|
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7"
|
||||||
);
|
);
|
||||||
|
|
||||||
return index;
|
return index;
|
||||||
|
|
|
@ -54,8 +54,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
|
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
|
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
|
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
|
||||||
|
|
||||||
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
|
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
|
||||||
"vaddudm 9,8,%[adder] \n\t" //{3,2} vs41
|
"vaddudm 9,8,%[adder] \n\t" //{3,2} vs41
|
||||||
|
@ -65,7 +65,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
|
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
|
||||||
"lxvdsx 39,0,%[ptr_minf] \n\t" // vs39 vec_min_value
|
"lxvdsx 39,0,%[ptr_minf] \n\t" // vs39 vec_min_value
|
||||||
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
|
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
|
||||||
"xxspltd 36,36,0 \n\t"
|
XXSPLTD_S(36,36,0)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -75,24 +75,24 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
"xvabsdp 47, 47 \n\t"
|
"xvabsdp 47, 47 \n\t"
|
||||||
"xvabsdp 48, 48 \n\t"
|
"xvabsdp 48, 48 \n\t"
|
||||||
"xvabsdp 49, 49 \n\t"
|
"xvabsdp 49, 49 \n\t"
|
||||||
"xvabsdp 50, 50 \n\t"
|
"xvabsdp 6, 6 \n\t"
|
||||||
"xvabsdp 51, 51 \n\t"
|
"xvabsdp 7, 7 \n\t"
|
||||||
|
|
||||||
//jump first half forward
|
//jump first half forward
|
||||||
"b 2f \n\t"
|
"b two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n\t"
|
".align 5 \n\t"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
|
|
||||||
"xxmrghd 0,44,45 \n\t"
|
XXMRGHD_S(0,44,45)
|
||||||
"xxmrgld 1,44,45 \n\t"
|
XXMRGLD_S(1,44,45)
|
||||||
"xxmrghd 2,46,47 \n\t"
|
XXMRGHD_S(2,46,47)
|
||||||
"xxmrgld 3,46,47 \n\t"
|
XXMRGLD_S(3,46,47)
|
||||||
"xxmrghd 4,48,49 \n\t"
|
XXMRGHD_S(4,48,49)
|
||||||
"xxmrgld 5,48,49 \n\t"
|
XXMRGLD_S(5,48,49)
|
||||||
"xxmrghd 44,50,51 \n\t"
|
XXMRGHD_S(44,6,7)
|
||||||
"xxmrgld 45,50,51 \n\t"
|
XXMRGLD_S(45,6,7)
|
||||||
|
|
||||||
"xvadddp 46, 0,1 \n\t"
|
"xvadddp 46, 0,1 \n\t"
|
||||||
"xvadddp 47, 2,3 \n\t"
|
"xvadddp 47, 2,3 \n\t"
|
||||||
|
@ -101,15 +101,15 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
"xvcmpgtdp 50,46,47 \n\t "
|
"xvcmpgtdp 6,46,47 \n\t "
|
||||||
"xvcmpgtdp 51,48,49 \n\t "
|
"xvcmpgtdp 7,48,49 \n\t "
|
||||||
|
|
||||||
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
|
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
|
||||||
|
|
||||||
"xxsel 32,40,41,50 \n\t"
|
"xxsel 32,40,41,6 \n\t"
|
||||||
"xxsel 0,46,47,50 \n\t"
|
"xxsel 0,46,47,6 \n\t"
|
||||||
"xxsel 33,42,43,51 \n\t"
|
"xxsel 33,42,43,7 \n\t"
|
||||||
"xxsel 1,48,49,51 \n\t"
|
"xxsel 1,48,49,7 \n\t"
|
||||||
|
|
||||||
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
|
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
|
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
|
||||||
|
@ -131,8 +131,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
|
|
||||||
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
|
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
|
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
|
||||||
//select with previous
|
//select with previous
|
||||||
"xxsel 38,38,32,4 \n\t"
|
"xxsel 38,38,32,4 \n\t"
|
||||||
"xxsel 39,39,3,4 \n\t"
|
"xxsel 39,39,3,4 \n\t"
|
||||||
|
@ -146,35 +146,35 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
"xvabsdp 47, 47 \n\t"
|
"xvabsdp 47, 47 \n\t"
|
||||||
"xvabsdp 48, 48 \n\t"
|
"xvabsdp 48, 48 \n\t"
|
||||||
"xvabsdp 49, 49 \n\t"
|
"xvabsdp 49, 49 \n\t"
|
||||||
"xvabsdp 50, 50 \n\t"
|
"xvabsdp 6, 6 \n\t"
|
||||||
"xvabsdp 51, 51 \n\t"
|
"xvabsdp 7, 7 \n\t"
|
||||||
|
|
||||||
|
|
||||||
//>>/////////////////////////////// half start
|
//>>/////////////////////////////// half start
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
"xxmrghd 0,44,45 \n\t"
|
XXMRGHD_S(0,44,45)
|
||||||
"xxmrgld 1,44,45 \n\t"
|
XXMRGLD_S(1,44,45)
|
||||||
"xxmrghd 2,46,47 \n\t"
|
XXMRGHD_S(2,46,47)
|
||||||
"xxmrgld 3,46,47 \n\t"
|
XXMRGLD_S(3,46,47)
|
||||||
"xxmrghd 4,48,49 \n\t"
|
XXMRGHD_S(4,48,49)
|
||||||
"xxmrgld 5,48,49 \n\t"
|
XXMRGLD_S(5,48,49)
|
||||||
"xxmrghd 44,50,51 \n\t"
|
XXMRGHD_S(44,6,7)
|
||||||
"xxmrgld 45,50,51 \n\t"
|
XXMRGLD_S(45,6,7)
|
||||||
|
|
||||||
"xvadddp 46, 0,1 \n\t"
|
"xvadddp 46, 0,1 \n\t"
|
||||||
"xvadddp 47, 2,3 \n\t"
|
"xvadddp 47, 2,3 \n\t"
|
||||||
"xvadddp 48, 4,5 \n\t"
|
"xvadddp 48, 4,5 \n\t"
|
||||||
"xvadddp 49, 44,45 \n\t"
|
"xvadddp 49, 44,45 \n\t"
|
||||||
|
|
||||||
"xvcmpgtdp 50,46,47 \n\t "
|
"xvcmpgtdp 6,46,47 \n\t "
|
||||||
"xvcmpgtdp 51,48,49 \n\t "
|
"xvcmpgtdp 7,48,49 \n\t "
|
||||||
|
|
||||||
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
|
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
|
||||||
|
|
||||||
"xxsel 32,40,41,50 \n\t"
|
"xxsel 32,40,41,6 \n\t"
|
||||||
"xxsel 0,46,47,50 \n\t"
|
"xxsel 0,46,47,6 \n\t"
|
||||||
"xxsel 33,42,43,51 \n\t"
|
"xxsel 33,42,43,7 \n\t"
|
||||||
"xxsel 1,48,49,51 \n\t"
|
"xxsel 1,48,49,7 \n\t"
|
||||||
|
|
||||||
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
|
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
|
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
|
||||||
|
@ -196,8 +196,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
|
|
||||||
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
|
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
|
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
|
||||||
//select with previous
|
//select with previous
|
||||||
"xxsel 38,38,32,4 \n\t"
|
"xxsel 38,38,32,4 \n\t"
|
||||||
"xxsel 39,39,3,4 \n\t"
|
"xxsel 39,39,3,4 \n\t"
|
||||||
|
@ -209,24 +209,24 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
"xvabsdp 47, 47 \n\t"
|
"xvabsdp 47, 47 \n\t"
|
||||||
"xvabsdp 48, 48 \n\t"
|
"xvabsdp 48, 48 \n\t"
|
||||||
"xvabsdp 49, 49 \n\t"
|
"xvabsdp 49, 49 \n\t"
|
||||||
"xvabsdp 50, 50 \n\t"
|
"xvabsdp 6, 6 \n\t"
|
||||||
"xvabsdp 51, 51 \n\t"
|
"xvabsdp 7, 7 \n\t"
|
||||||
|
|
||||||
|
|
||||||
//decrement n
|
//decrement n
|
||||||
"addic. %[n], %[n], -16 \n\t"
|
"addic. %[n], %[n], -16 \n\t"
|
||||||
//Loop back if >0
|
//Loop back if >0
|
||||||
"bgt+ 1b \n\t"
|
"bgt+ one%= \n\t"
|
||||||
|
|
||||||
|
|
||||||
"xxmrghd 0,44,45 \n\t"
|
XXMRGHD_S(0,44,45)
|
||||||
"xxmrgld 1,44,45 \n\t"
|
XXMRGLD_S(1,44,45)
|
||||||
"xxmrghd 2,46,47 \n\t"
|
XXMRGHD_S(2,46,47)
|
||||||
"xxmrgld 3,46,47 \n\t"
|
XXMRGLD_S(3,46,47)
|
||||||
"xxmrghd 4,48,49 \n\t"
|
XXMRGHD_S(4,48,49)
|
||||||
"xxmrgld 5,48,49 \n\t"
|
XXMRGLD_S(5,48,49)
|
||||||
"xxmrghd 44,50,51 \n\t"
|
XXMRGHD_S(44,6,7)
|
||||||
"xxmrgld 45,50,51 \n\t"
|
XXMRGLD_S(45,6,7)
|
||||||
|
|
||||||
"xvadddp 46, 0,1 \n\t"
|
"xvadddp 46, 0,1 \n\t"
|
||||||
"xvadddp 47, 2,3 \n\t"
|
"xvadddp 47, 2,3 \n\t"
|
||||||
|
@ -235,13 +235,13 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
"xvcmpgtdp 50,46,47 \n\t "
|
"xvcmpgtdp 6,46,47 \n\t "
|
||||||
"xvcmpgtdp 51,48,49 \n\t "
|
"xvcmpgtdp 7,48,49 \n\t "
|
||||||
|
|
||||||
"xxsel 32,40,41,50 \n\t"
|
"xxsel 32,40,41,6 \n\t"
|
||||||
"xxsel 0,46,47,50 \n\t"
|
"xxsel 0,46,47,6 \n\t"
|
||||||
"xxsel 33,42,43,51 \n\t"
|
"xxsel 33,42,43,7 \n\t"
|
||||||
"xxsel 1,48,49,51 \n\t"
|
"xxsel 1,48,49,7 \n\t"
|
||||||
|
|
||||||
"xvcmpgtdp 2,0,1 \n\t "
|
"xvcmpgtdp 2,0,1 \n\t "
|
||||||
"xxsel 32,32,33,2 \n\t"
|
"xxsel 32,32,33,2 \n\t"
|
||||||
|
@ -260,28 +260,28 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
|
|
||||||
///////extract min value and min index from vector
|
///////extract min value and min index from vector
|
||||||
|
|
||||||
"xxspltd 32,38,1 \n\t"
|
XXSPLTD_S(32,38,1)
|
||||||
"xxspltd 40,39,1 \n\t"
|
XXSPLTD_S(40,39,1)
|
||||||
"xvcmpeqdp. 2, 40,39 \n\t"
|
"xvcmpeqdp. 2, 40,39 \n\t"
|
||||||
|
|
||||||
// CR6 bit 0 is set if all lanes compared true; CR bit index BI = 4*6 + bit_ind = 24. BO = 0b0011at branches when CR(BI)==1; at=10 hints that the branch is rarely taken.
|
// CR6 bit 0 is set if all lanes compared true; CR bit index BI = 4*6 + bit_ind = 24. BO = 0b0011at branches when CR(BI)==1; at=10 hints that the branch is rarely taken.
|
||||||
//0b001110=14
|
//0b001110=14
|
||||||
"bc 14,24, 3f \n\t"
|
"bc 14,24, three%= \n\t"
|
||||||
"xvcmpgtdp 4,39, 40 \n\t"
|
"xvcmpgtdp 4,39, 40 \n\t"
|
||||||
"xxsel 0,39,40,4 \n\t"
|
"xxsel 0,39,40,4 \n\t"
|
||||||
"xxsel 1,38,32,4 \n\t"
|
"xxsel 1,38,32,4 \n\t"
|
||||||
"stxsdx 0,0,%[ptr_minf] \n\t"
|
"stxsdx 0,0,%[ptr_minf] \n\t"
|
||||||
"b 4f \n\t"
|
"b four%= \n\t"
|
||||||
|
|
||||||
"3: \n\t"
|
"three%=: \n\t"
|
||||||
// if the element values are equal, choose the minimum index
|
// if the element values are equal, choose the minimum index
|
||||||
"xxspltd 0,40,0 \n\t"
|
XXSPLTD_S(0,40,0)
|
||||||
"vminud 0,0,6 \n\t" //vs32 vs38
|
"vminud 0,0,6 \n\t" //vs32 vs38
|
||||||
"xxlor 1,32,32 \n\t"
|
"xxlor 1,32,32 \n\t"
|
||||||
"stxsdx 0,0,%[ptr_minf] \n\t"
|
"stxsdx 0,0,%[ptr_minf] \n\t"
|
||||||
|
|
||||||
|
|
||||||
"4: \n\t"
|
"four%=: \n\t"
|
||||||
"mfvsrd %[index],1 \n\t"
|
"mfvsrd %[index],1 \n\t"
|
||||||
|
|
||||||
: [minf] "=m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
|
: [minf] "=m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
|
||||||
|
@ -290,7 +290,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
|
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
|
||||||
[start] "v"(start), [adder] "v"(temp_add_index)
|
[start] "v"(start), [adder] "v"(temp_add_index)
|
||||||
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
|
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
|
||||||
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
|
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7"
|
||||||
);
|
);
|
||||||
|
|
||||||
return index;
|
return index;
|
||||||
|
|
|
@ -46,10 +46,10 @@ static void __inline blas_lock(volatile BLASULONG *address){
|
||||||
" .machine \"any\" ;"
|
" .machine \"any\" ;"
|
||||||
"0: lwarx %0,0, %1 ;"
|
"0: lwarx %0,0, %1 ;"
|
||||||
" cmpwi 0,%0,0;"
|
" cmpwi 0,%0,0;"
|
||||||
" bne 1f;"
|
" bne one%=;"
|
||||||
" stwcx. %2,0, %1 ;"
|
" stwcx. %2,0, %1 ;"
|
||||||
" bne- 0b;"
|
" bne- 0b;"
|
||||||
"1: "
|
"one%=: "
|
||||||
: "=&r"(ret)
|
: "=&r"(ret)
|
||||||
: "r"(address), "r" (val)
|
: "r"(address), "r" (val)
|
||||||
: "cr0", "memory");
|
: "cr0", "memory");
|
||||||
|
|
|
@ -68,10 +68,10 @@ static float sasum_kernel_32 (long n, float *x)
|
||||||
"addi %2, %2, 128 \n\t"
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -32 \n\t"
|
"addic. %1, %1, -32 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"xvabssp 48, 40 \n\t"
|
"xvabssp 48, 40 \n\t"
|
||||||
"xvabssp 49, 41 \n\t"
|
"xvabssp 49, 41 \n\t"
|
||||||
|
@ -108,9 +108,9 @@ static float sasum_kernel_32 (long n, float *x)
|
||||||
"xvaddsp 38, 38, %x5 \n\t"
|
"xvaddsp 38, 38, %x5 \n\t"
|
||||||
"xvaddsp 39, 39, %x6 \n\t"
|
"xvaddsp 39, 39, %x6 \n\t"
|
||||||
|
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"xvabssp 48, 40 \n\t"
|
"xvabssp 48, 40 \n\t"
|
||||||
"xvabssp 49, 41 \n\t"
|
"xvabssp 49, 41 \n\t"
|
||||||
|
|
|
@ -51,10 +51,10 @@ static void scopy_kernel_32 (long n, float *x, float *y)
|
||||||
"addi %2, %2, 128 \n\t"
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -32 \n\t"
|
"addic. %1, %1, -32 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"stxvd2x 40, 0, %3 \n\t"
|
"stxvd2x 40, 0, %3 \n\t"
|
||||||
"stxvd2x 41, %5, %3 \n\t"
|
"stxvd2x 41, %5, %3 \n\t"
|
||||||
|
@ -77,9 +77,9 @@ static void scopy_kernel_32 (long n, float *x, float *y)
|
||||||
"addi %2, %2, 128 \n\t"
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -32 \n\t"
|
"addic. %1, %1, -32 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"stxvd2x 40, 0, %3 \n\t"
|
"stxvd2x 40, 0, %3 \n\t"
|
||||||
"stxvd2x 41, %5, %3 \n\t"
|
"stxvd2x 41, %5, %3 \n\t"
|
||||||
|
|
|
@ -78,10 +78,10 @@ static float sdot_kernel_16 (long n, float *x, float *y)
|
||||||
"addi %3, %3, 128 \n\t"
|
"addi %3, %3, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -32 \n\t"
|
"addic. %1, %1, -32 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"xvmaddasp 32, 40, 48 \n\t"
|
"xvmaddasp 32, 40, 48 \n\t"
|
||||||
"lxvd2x 40, 0, %2 \n\t"
|
"lxvd2x 40, 0, %2 \n\t"
|
||||||
|
@ -112,9 +112,9 @@ static float sdot_kernel_16 (long n, float *x, float *y)
|
||||||
"addi %3, %3, 128 \n\t"
|
"addi %3, %3, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -32 \n\t"
|
"addic. %1, %1, -32 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"xvmaddasp 32, 40, 48 \n\t"
|
"xvmaddasp 32, 40, 48 \n\t"
|
||||||
"xvmaddasp 33, 41, 49 \n\t"
|
"xvmaddasp 33, 41, 49 \n\t"
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
* Macros for N=4 and M=16
|
* Macros for N=4 and M=16
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x16', `
|
||||||
|
#else
|
||||||
.macro COPY_4x16
|
.macro COPY_4x16
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
lxvw4x vs33, o16, A0
|
lxvw4x vs33, o16, A0
|
||||||
|
@ -88,13 +92,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvw4x vs46, o32, T1
|
stxvw4x vs46, o32, T1
|
||||||
stxvw4x vs47, o48, T1
|
stxvw4x vs47, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=8
|
* Macros for N=4 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x8', `
|
||||||
|
#else
|
||||||
.macro COPY_4x8
|
.macro COPY_4x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
lxvw4x vs33, o16, A0
|
lxvw4x vs33, o16, A0
|
||||||
|
@ -124,13 +136,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvw4x vs38, o32, T1
|
stxvw4x vs38, o32, T1
|
||||||
stxvw4x vs39, o48, T1
|
stxvw4x vs39, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=4
|
* Macros for N=4 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x4', `
|
||||||
|
#else
|
||||||
.macro COPY_4x4
|
.macro COPY_4x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
|
|
||||||
|
@ -150,13 +170,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxvw4x vs35, o48, T1
|
stxvw4x vs35, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=2
|
* Macros for N=4 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x2', `
|
||||||
|
#else
|
||||||
.macro COPY_4x2
|
.macro COPY_4x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsspx vs32, o0, A0
|
lxsspx vs32, o0, A0
|
||||||
lxsspx vs33, o4, A0
|
lxsspx vs33, o4, A0
|
||||||
|
@ -190,13 +218,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxsspx vs38, o0, T1
|
stxsspx vs38, o0, T1
|
||||||
stxsspx vs39, o4, T1
|
stxsspx vs39, o4, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=1
|
* Macros for N=4 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x1', `
|
||||||
|
#else
|
||||||
.macro COPY_4x1
|
.macro COPY_4x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsspx vs32, o0, A0
|
lxsspx vs32, o0, A0
|
||||||
|
|
||||||
|
@ -218,13 +254,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxsspx vs35, o4, T1
|
stxsspx vs35, o4, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=16
|
* Macros for N=2 and M=16
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x16', `
|
||||||
|
#else
|
||||||
.macro COPY_2x16
|
.macro COPY_2x16
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
lxvw4x vs33, o16, A0
|
lxvw4x vs33, o16, A0
|
||||||
|
@ -250,13 +294,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvw4x vs38, o32, T1
|
stxvw4x vs38, o32, T1
|
||||||
stxvw4x vs39, o48, T1
|
stxvw4x vs39, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=8
|
* Macros for N=2 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x8', `
|
||||||
|
#else
|
||||||
.macro COPY_2x8
|
.macro COPY_2x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
lxvw4x vs33, o16, A0
|
lxvw4x vs33, o16, A0
|
||||||
|
@ -272,13 +324,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvw4x vs34, o32, T1
|
stxvw4x vs34, o32, T1
|
||||||
stxvw4x vs35, o48, T1
|
stxvw4x vs35, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=4
|
* Macros for N=2 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x4', `
|
||||||
|
#else
|
||||||
.macro COPY_2x4
|
.macro COPY_2x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
|
|
||||||
|
@ -290,13 +350,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxvw4x vs33, o16, T1
|
stxvw4x vs33, o16, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=2
|
* Macros for N=2 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x2', `
|
||||||
|
#else
|
||||||
.macro COPY_2x2
|
.macro COPY_2x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsspx vs32, o0, A0
|
lxsspx vs32, o0, A0
|
||||||
lxsspx vs33, o4, A0
|
lxsspx vs33, o4, A0
|
||||||
|
@ -314,13 +382,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxsspx vs34, o0, T1
|
stxsspx vs34, o0, T1
|
||||||
stxsspx vs35, o4, T1
|
stxsspx vs35, o4, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=1
|
* Macros for N=2 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x1', `
|
||||||
|
#else
|
||||||
.macro COPY_2x1
|
.macro COPY_2x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsspx vs32, o0, A0
|
lxsspx vs32, o0, A0
|
||||||
|
|
||||||
|
@ -332,13 +408,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxsspx vs33, o4, T1
|
stxsspx vs33, o4, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=16
|
* Macros for N=1 and M=16
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x16', `
|
||||||
|
#else
|
||||||
.macro COPY_1x16
|
.macro COPY_1x16
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
lxvw4x vs33, o16, A0
|
lxvw4x vs33, o16, A0
|
||||||
|
@ -352,13 +436,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvw4x vs34, o32, T1
|
stxvw4x vs34, o32, T1
|
||||||
stxvw4x vs35, o48, T1
|
stxvw4x vs35, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=8
|
* Macros for N=1 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x8', `
|
||||||
|
#else
|
||||||
.macro COPY_1x8
|
.macro COPY_1x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
lxvw4x vs33, o16, A0
|
lxvw4x vs33, o16, A0
|
||||||
|
@ -368,13 +460,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvw4x vs32, o0, T1
|
stxvw4x vs32, o0, T1
|
||||||
stxvw4x vs33, o16, T1
|
stxvw4x vs33, o16, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=4
|
* Macros for N=1 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x4', `
|
||||||
|
#else
|
||||||
.macro COPY_1x4
|
.macro COPY_1x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
|
|
||||||
|
@ -382,13 +482,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxvw4x vs32, o0, T1
|
stxvw4x vs32, o0, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=2
|
* Macros for N=1 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x2', `
|
||||||
|
#else
|
||||||
.macro COPY_1x2
|
.macro COPY_1x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsspx vs32, o0, A0
|
lxsspx vs32, o0, A0
|
||||||
lxsspx vs33, o4, A0
|
lxsspx vs33, o4, A0
|
||||||
|
@ -398,13 +506,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxsspx vs32, o0, T1
|
stxsspx vs32, o0, T1
|
||||||
stxsspx vs33, o4, T1
|
stxsspx vs33, o4, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=1
|
* Macros for N=1 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x1', `
|
||||||
|
#else
|
||||||
.macro COPY_1x1
|
.macro COPY_1x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsspx vs32, o0, A0
|
lxsspx vs32, o0, A0
|
||||||
|
|
||||||
|
@ -412,5 +528,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxsspx vs32, o0, T1
|
stxsspx vs32, o0, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
* Macros for N=4 and M=8
|
* Macros for N=4 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x8', `
|
||||||
|
#else
|
||||||
.macro COPY_4x8
|
.macro COPY_4x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
lxvw4x vs33, o16, A0
|
lxvw4x vs33, o16, A0
|
||||||
|
@ -68,13 +72,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvw4x vs38, o32, T1
|
stxvw4x vs38, o32, T1
|
||||||
stxvw4x vs39, o48, T1
|
stxvw4x vs39, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=4
|
* Macros for N=4 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x4', `
|
||||||
|
#else
|
||||||
.macro COPY_4x4
|
.macro COPY_4x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
|
|
||||||
|
@ -94,13 +106,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxvw4x vs35, o48, T1
|
stxvw4x vs35, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=2
|
* Macros for N=4 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x2', `
|
||||||
|
#else
|
||||||
.macro COPY_4x2
|
.macro COPY_4x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsspx vs32, o0, A0
|
lxsspx vs32, o0, A0
|
||||||
lxsspx vs33, o4, A0
|
lxsspx vs33, o4, A0
|
||||||
|
@ -134,13 +154,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxsspx vs38, o0, T1
|
stxsspx vs38, o0, T1
|
||||||
stxsspx vs39, o4, T1
|
stxsspx vs39, o4, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=1
|
* Macros for N=4 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x1', `
|
||||||
|
#else
|
||||||
.macro COPY_4x1
|
.macro COPY_4x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsspx vs32, o0, A0
|
lxsspx vs32, o0, A0
|
||||||
|
|
||||||
|
@ -162,13 +190,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxsspx vs35, o4, T1
|
stxsspx vs35, o4, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=8
|
* Macros for N=2 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x8', `
|
||||||
|
#else
|
||||||
.macro COPY_2x8
|
.macro COPY_2x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
lxvw4x vs33, o16, A0
|
lxvw4x vs33, o16, A0
|
||||||
|
@ -184,13 +220,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvw4x vs34, o32, T1
|
stxvw4x vs34, o32, T1
|
||||||
stxvw4x vs35, o48, T1
|
stxvw4x vs35, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=4
|
* Macros for N=2 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x4', `
|
||||||
|
#else
|
||||||
.macro COPY_2x4
|
.macro COPY_2x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
|
|
||||||
|
@ -202,13 +246,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxvw4x vs33, o16, T1
|
stxvw4x vs33, o16, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=2
|
* Macros for N=2 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x2', `
|
||||||
|
#else
|
||||||
.macro COPY_2x2
|
.macro COPY_2x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsspx vs32, o0, A0
|
lxsspx vs32, o0, A0
|
||||||
lxsspx vs33, o4, A0
|
lxsspx vs33, o4, A0
|
||||||
|
@ -226,13 +278,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxsspx vs34, o0, T1
|
stxsspx vs34, o0, T1
|
||||||
stxsspx vs35, o4, T1
|
stxsspx vs35, o4, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=1
|
* Macros for N=2 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x1', `
|
||||||
|
#else
|
||||||
.macro COPY_2x1
|
.macro COPY_2x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsspx vs32, o0, A0
|
lxsspx vs32, o0, A0
|
||||||
|
|
||||||
|
@ -244,13 +304,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxsspx vs33, o4, T1
|
stxsspx vs33, o4, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=8
|
* Macros for N=1 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x8', `
|
||||||
|
#else
|
||||||
.macro COPY_1x8
|
.macro COPY_1x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
lxvw4x vs33, o16, A0
|
lxvw4x vs33, o16, A0
|
||||||
|
@ -260,13 +328,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvw4x vs32, o0, T1
|
stxvw4x vs32, o0, T1
|
||||||
stxvw4x vs33, o16, T1
|
stxvw4x vs33, o16, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=4
|
* Macros for N=1 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x4', `
|
||||||
|
#else
|
||||||
.macro COPY_1x4
|
.macro COPY_1x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
|
|
||||||
|
@ -274,13 +350,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxvw4x vs32, o0, T1
|
stxvw4x vs32, o0, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=2
|
* Macros for N=1 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x2', `
|
||||||
|
#else
|
||||||
.macro COPY_1x2
|
.macro COPY_1x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsspx vs32, o0, A0
|
lxsspx vs32, o0, A0
|
||||||
lxsspx vs33, o4, A0
|
lxsspx vs33, o4, A0
|
||||||
|
@ -290,13 +374,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxsspx vs32, o0, T1
|
stxsspx vs32, o0, T1
|
||||||
stxsspx vs33, o4, T1
|
stxsspx vs33, o4, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=1
|
* Macros for N=1 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x1', `
|
||||||
|
#else
|
||||||
.macro COPY_1x1
|
.macro COPY_1x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsspx vs32, o0, A0
|
lxsspx vs32, o0, A0
|
||||||
|
|
||||||
|
@ -304,5 +396,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxsspx vs32, o0, T1
|
stxsspx vs32, o0, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -71,10 +71,10 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
|
||||||
"addi %4, %4, 64 \n\t"
|
"addi %4, %4, 64 \n\t"
|
||||||
|
|
||||||
"addic. %2, %2, -16 \n\t"
|
"addic. %2, %2, -16 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"xvmulsp 40, 32, 36 \n\t" // c * x
|
"xvmulsp 40, 32, 36 \n\t" // c * x
|
||||||
"xvmulsp 41, 33, 36 \n\t"
|
"xvmulsp 41, 33, 36 \n\t"
|
||||||
|
@ -138,9 +138,9 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
|
||||||
"addi %4, %4, 128 \n\t"
|
"addi %4, %4, 128 \n\t"
|
||||||
|
|
||||||
"addic. %2, %2, -16 \n\t"
|
"addic. %2, %2, -16 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"xvmulsp 40, 32, 36 \n\t" // c * x
|
"xvmulsp 40, 32, 36 \n\t" // c * x
|
||||||
"xvmulsp 41, 33, 36 \n\t"
|
"xvmulsp 41, 33, 36 \n\t"
|
||||||
|
|
|
@ -56,10 +56,10 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
|
||||||
"addi %2, %2, 128 \n\t"
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -32 \n\t"
|
"addic. %1, %1, -32 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"xvmulsp 40, 32, %x3 \n\t"
|
"xvmulsp 40, 32, %x3 \n\t"
|
||||||
"xvmulsp 41, 33, %x3 \n\t"
|
"xvmulsp 41, 33, %x3 \n\t"
|
||||||
|
@ -92,9 +92,9 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
|
||||||
"addi %2, %2, 256 \n\t"
|
"addi %2, %2, 256 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -32 \n\t"
|
"addic. %1, %1, -32 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"xvmulsp 40, 32, %x3 \n\t"
|
"xvmulsp 40, 32, %x3 \n\t"
|
||||||
"xvmulsp 41, 33, %x3 \n\t"
|
"xvmulsp 41, 33, %x3 \n\t"
|
||||||
|
@ -147,8 +147,8 @@ static void sscal_kernel_16_zero (long n, float *x)
|
||||||
(
|
(
|
||||||
"xxlxor %x3, %x3, %x3 \n\t"
|
"xxlxor %x3, %x3, %x3 \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"stxvd2x %x3, 0, %2 \n\t"
|
"stxvd2x %x3, 0, %2 \n\t"
|
||||||
"stxvd2x %x3, %4, %2 \n\t"
|
"stxvd2x %x3, %4, %2 \n\t"
|
||||||
|
@ -162,7 +162,7 @@ static void sscal_kernel_16_zero (long n, float *x)
|
||||||
"addi %2, %2, 128 \n\t"
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -32 \n\t"
|
"addic. %1, %1, -32 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
|
"#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
|
||||||
:
|
:
|
||||||
|
|
|
@ -39,8 +39,8 @@ static void sswap_kernel_32 (long n, float *x, float *y)
|
||||||
{
|
{
|
||||||
__asm__
|
__asm__
|
||||||
(
|
(
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"lxvd2x 32, 0, %4 \n\t"
|
"lxvd2x 32, 0, %4 \n\t"
|
||||||
"lxvd2x 33, %5, %4 \n\t"
|
"lxvd2x 33, %5, %4 \n\t"
|
||||||
|
@ -83,7 +83,7 @@ static void sswap_kernel_32 (long n, float *x, float *y)
|
||||||
"addi %4, %4, 128 \n\t"
|
"addi %4, %4, 128 \n\t"
|
||||||
|
|
||||||
"addic. %2, %2, -32 \n\t"
|
"addic. %2, %2, -32 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
|
"#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
|
||||||
:
|
:
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -68,10 +68,10 @@ static double zasum_kernel_8 (long n, double *x)
|
||||||
"addi %2, %2, 128 \n\t"
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -8 \n\t"
|
"addic. %1, %1, -8 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"xvabsdp 48, 40 \n\t"
|
"xvabsdp 48, 40 \n\t"
|
||||||
"xvabsdp 49, 41 \n\t"
|
"xvabsdp 49, 41 \n\t"
|
||||||
|
@ -108,9 +108,9 @@ static double zasum_kernel_8 (long n, double *x)
|
||||||
"xvadddp 38, 38, %x5 \n\t"
|
"xvadddp 38, 38, %x5 \n\t"
|
||||||
"xvadddp 39, 39, %x6 \n\t"
|
"xvadddp 39, 39, %x6 \n\t"
|
||||||
|
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"xvabsdp 48, 40 \n\t"
|
"xvabsdp 48, 40 \n\t"
|
||||||
"xvabsdp 49, 41 \n\t"
|
"xvabsdp 49, 41 \n\t"
|
||||||
|
@ -140,7 +140,7 @@ static double zasum_kernel_8 (long n, double *x)
|
||||||
|
|
||||||
"xvadddp 32, 32, 36 \n\t"
|
"xvadddp 32, 32, 36 \n\t"
|
||||||
|
|
||||||
"xxswapd 33, 32 \n\t"
|
XXSWAPD_S(33,32)
|
||||||
"xsadddp %x0, 32, 33 \n"
|
"xsadddp %x0, 32, 33 \n"
|
||||||
|
|
||||||
"#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"
|
"#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"
|
||||||
|
|
|
@ -61,8 +61,8 @@ static void zaxpy_kernel_4 (long n, double *x, double *y,
|
||||||
|
|
||||||
__asm__
|
__asm__
|
||||||
(
|
(
|
||||||
"xxspltd 32, %x19, 0 \n\t" // alpha_r
|
XXSPLTD_S(32,%x19,0) // alpha_r
|
||||||
"xxspltd 33, %x20, 0 \n\t" // alpha_i
|
XXSPLTD_S(33,%x20,0) // alpha_i
|
||||||
|
|
||||||
"lxvd2x 36, 0, %21 \n\t" // mvec
|
"lxvd2x 36, 0, %21 \n\t" // mvec
|
||||||
|
|
||||||
|
@ -87,10 +87,10 @@ static void zaxpy_kernel_4 (long n, double *x, double *y,
|
||||||
"lxvd2x 50, %23, %3 \n\t" // y2
|
"lxvd2x 50, %23, %3 \n\t" // y2
|
||||||
"lxvd2x 51, %24, %3 \n\t" // y3
|
"lxvd2x 51, %24, %3 \n\t" // y3
|
||||||
|
|
||||||
"xxswapd %x8, 40 \n\t" // exchange real and imag part
|
XXSWAPD_S(%x8,40) // exchange real and imag part
|
||||||
"xxswapd %x9, 41 \n\t" // exchange real and imag part
|
XXSWAPD_S(%x9,41) // exchange real and imag part
|
||||||
"xxswapd %x10, 42 \n\t" // exchange real and imag part
|
XXSWAPD_S(%x10,42) // exchange real and imag part
|
||||||
"xxswapd %x11, 43 \n\t" // exchange real and imag part
|
XXSWAPD_S(%x11,43) // exchange real and imag part
|
||||||
|
|
||||||
"addi %2, %2, 64 \n\t"
|
"addi %2, %2, 64 \n\t"
|
||||||
"addi %3, %3, 64 \n\t"
|
"addi %3, %3, 64 \n\t"
|
||||||
|
@ -105,19 +105,19 @@ static void zaxpy_kernel_4 (long n, double *x, double *y,
|
||||||
"lxvd2x %x6, %23, %3 \n\t" // y6
|
"lxvd2x %x6, %23, %3 \n\t" // y6
|
||||||
"lxvd2x %x7, %24, %3 \n\t" // y7
|
"lxvd2x %x7, %24, %3 \n\t" // y7
|
||||||
|
|
||||||
"xxswapd %x12, 44 \n\t" // exchange real and imag part
|
XXSWAPD_S(%x12,44) // exchange real and imag part
|
||||||
"xxswapd %x13, 45 \n\t" // exchange real and imag part
|
XXSWAPD_S(%x13,45) // exchange real and imag part
|
||||||
"xxswapd %x14, 46 \n\t" // exchange real and imag part
|
XXSWAPD_S(%x14,46) // exchange real and imag part
|
||||||
"xxswapd %x15, 47 \n\t" // exchange real and imag part
|
XXSWAPD_S(%x15,47) // exchange real and imag part
|
||||||
|
|
||||||
"addi %2, %2, 64 \n\t"
|
"addi %2, %2, 64 \n\t"
|
||||||
"addi %3, %3, 64 \n\t"
|
"addi %3, %3, 64 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -8 \n\t"
|
"addic. %1, %1, -8 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
|
"xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
|
||||||
"xvmaddadp 49, 41, 32 \n\t"
|
"xvmaddadp 49, 41, 32 \n\t"
|
||||||
|
@ -163,31 +163,31 @@ static void zaxpy_kernel_4 (long n, double *x, double *y,
|
||||||
|
|
||||||
"addi %16, %16, 64 \n\t"
|
"addi %16, %16, 64 \n\t"
|
||||||
|
|
||||||
"xxswapd %x8, 40 \n\t" // exchange real and imag part
|
XXSWAPD_S(%x8,40) // exchange real and imag part
|
||||||
"xxswapd %x9, 41 \n\t" // exchange real and imag part
|
XXSWAPD_S(%x9,41) // exchange real and imag part
|
||||||
"lxvd2x 48, 0, %3 \n\t" // y0
|
"lxvd2x 48, 0, %3 \n\t" // y0
|
||||||
"lxvd2x 49, %22, %3 \n\t" // y1
|
"lxvd2x 49, %22, %3 \n\t" // y1
|
||||||
"xxswapd %x10, 42 \n\t" // exchange real and imag part
|
XXSWAPD_S(%x10,42) // exchange real and imag part
|
||||||
"xxswapd %x11, 43 \n\t" // exchange real and imag part
|
XXSWAPD_S(%x11,43) // exchange real and imag part
|
||||||
"lxvd2x 50, %23, %3 \n\t" // y2
|
"lxvd2x 50, %23, %3 \n\t" // y2
|
||||||
"lxvd2x 51, %24, %3 \n\t" // y3
|
"lxvd2x 51, %24, %3 \n\t" // y3
|
||||||
|
|
||||||
"xxswapd %x12, 44 \n\t" // exchange real and imag part
|
XXSWAPD_S(%x12,44) // exchange real and imag part
|
||||||
"addi %3, %3, 64 \n\t"
|
"addi %3, %3, 64 \n\t"
|
||||||
"xxswapd %x13, 45 \n\t" // exchange real and imag part
|
XXSWAPD_S(%x13,45) // exchange real and imag part
|
||||||
"lxvd2x %x4, 0, %3 \n\t" // y4
|
"lxvd2x %x4, 0, %3 \n\t" // y4
|
||||||
"lxvd2x %x5, %22, %3 \n\t" // y5
|
"lxvd2x %x5, %22, %3 \n\t" // y5
|
||||||
"xxswapd %x14, 46 \n\t" // exchange real and imag part
|
XXSWAPD_S(%x14,46) // exchange real and imag part
|
||||||
"xxswapd %x15, 47 \n\t" // exchange real and imag part
|
XXSWAPD_S(%x15,47) // exchange real and imag part
|
||||||
"lxvd2x %x6, %23, %3 \n\t" // y6
|
"lxvd2x %x6, %23, %3 \n\t" // y6
|
||||||
"lxvd2x %x7, %24, %3 \n\t" // y7
|
"lxvd2x %x7, %24, %3 \n\t" // y7
|
||||||
|
|
||||||
"addi %3, %3, 64 \n\t"
|
"addi %3, %3, 64 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -8 \n\t"
|
"addic. %1, %1, -8 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
|
"xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
|
||||||
"xvmaddadp 49, 41, 32 \n\t"
|
"xvmaddadp 49, 41, 32 \n\t"
|
||||||
|
|
|
@ -62,10 +62,10 @@ static void zcopy_kernel_16 (long n, FLOAT *x, FLOAT *y)
|
||||||
"addi %2, %2, 128 \n\t"
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -16 \n\t"
|
"addic. %1, %1, -16 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"stxvd2x 32, 0, %3 \n\t"
|
"stxvd2x 32, 0, %3 \n\t"
|
||||||
"stxvd2x 33, %5, %3 \n\t"
|
"stxvd2x 33, %5, %3 \n\t"
|
||||||
|
@ -108,9 +108,9 @@ static void zcopy_kernel_16 (long n, FLOAT *x, FLOAT *y)
|
||||||
"addi %2, %2, 128 \n\t"
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -16 \n\t"
|
"addic. %1, %1, -16 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"stxvd2x 32, 0, %3 \n\t"
|
"stxvd2x 32, 0, %3 \n\t"
|
||||||
"stxvd2x 33, %5, %3 \n\t"
|
"stxvd2x 33, %5, %3 \n\t"
|
||||||
|
|
|
@ -60,10 +60,10 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot)
|
||||||
"lxvd2x 43, %9, %2 \n\t" // x3_r, x3_i
|
"lxvd2x 43, %9, %2 \n\t" // x3_r, x3_i
|
||||||
"lxvd2x 51, %9, %3 \n\t" // y3_r, y3_i
|
"lxvd2x 51, %9, %3 \n\t" // y3_r, y3_i
|
||||||
|
|
||||||
"xxswapd 0, 48 \n\t" // y0_i, y0_r
|
XXSWAPD_S(0,48) // y0_i, y0_r
|
||||||
"xxswapd 1, 49 \n\t" // y1_i, y1_r
|
XXSWAPD_S(1,49) // y1_i, y1_r
|
||||||
"xxswapd 2, 50 \n\t" // y2_i, y2_r
|
XXSWAPD_S(2,50) // y2_i, y2_r
|
||||||
"xxswapd 3, 51 \n\t" // y3_i, y3_r
|
XXSWAPD_S(3,51) // y3_i, y3_r
|
||||||
|
|
||||||
"addi %2, %2, 64 \n\t"
|
"addi %2, %2, 64 \n\t"
|
||||||
"addi %3, %3, 64 \n\t"
|
"addi %3, %3, 64 \n\t"
|
||||||
|
@ -77,19 +77,19 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot)
|
||||||
"lxvd2x 47, %9, %2 \n\t" // x3_r, x3_i
|
"lxvd2x 47, %9, %2 \n\t" // x3_r, x3_i
|
||||||
"lxvd2x 7, %9, %3 \n\t" // y3_r, y3_i
|
"lxvd2x 7, %9, %3 \n\t" // y3_r, y3_i
|
||||||
|
|
||||||
"xxswapd 8, 4 \n\t" // y0_i, y0_r
|
XXSWAPD_S(8,4) // y0_i, y0_r
|
||||||
"xxswapd 9, 5 \n\t" // y1_i, y1_r
|
XXSWAPD_S(9,5) // y1_i, y1_r
|
||||||
"xxswapd 10, 6 \n\t" // y2_i, y2_r
|
XXSWAPD_S(10,6) // y2_i, y2_r
|
||||||
"xxswapd 11, 7 \n\t" // y3_i, y3_r
|
XXSWAPD_S(11,7) // y3_i, y3_r
|
||||||
|
|
||||||
"addi %2, %2, 64 \n\t"
|
"addi %2, %2, 64 \n\t"
|
||||||
"addi %3, %3, 64 \n\t"
|
"addi %3, %3, 64 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -8 \n\t"
|
"addic. %1, %1, -8 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
|
"xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
|
||||||
"lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i
|
"lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i
|
||||||
|
@ -111,14 +111,14 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot)
|
||||||
"xvmaddadp 39, 43, 3 \n\t" // x3_r * y3_i , x3_i * y3_r
|
"xvmaddadp 39, 43, 3 \n\t" // x3_r * y3_i , x3_i * y3_r
|
||||||
"lxvd2x 43, %9, %2 \n\t" // x3_r, x3_i
|
"lxvd2x 43, %9, %2 \n\t" // x3_r, x3_i
|
||||||
|
|
||||||
"xxswapd 0,48 \n\t" // y0_i, y0_r
|
XXSWAPD_S(0,48) // y0_i, y0_r
|
||||||
"xxswapd 1,49 \n\t" // y1_i, y1_r
|
XXSWAPD_S(1,49) // y1_i, y1_r
|
||||||
|
|
||||||
"addi %2, %2, 64 \n\t"
|
"addi %2, %2, 64 \n\t"
|
||||||
"addi %3, %3, 64 \n\t"
|
"addi %3, %3, 64 \n\t"
|
||||||
|
|
||||||
"xxswapd 2,50 \n\t" // y2_i, y2_r
|
XXSWAPD_S(2,50) // y2_i, y2_r
|
||||||
"xxswapd 3,51 \n\t" // y3_i, y3_r
|
XXSWAPD_S(3,51) // y3_i, y3_r
|
||||||
|
|
||||||
"xvmaddadp 32, 44, 4 \n\t" // x0_r * y0_r , x0_i * y0_i
|
"xvmaddadp 32, 44, 4 \n\t" // x0_r * y0_r , x0_i * y0_i
|
||||||
"lxvd2x 4, 0, %3 \n\t" // y0_r, y0_i
|
"lxvd2x 4, 0, %3 \n\t" // y0_r, y0_i
|
||||||
|
@ -138,19 +138,19 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot)
|
||||||
"xvmaddadp 39, 47, 11 \n\t" // x3_r * y3_i , x3_i * y3_r
|
"xvmaddadp 39, 47, 11 \n\t" // x3_r * y3_i , x3_i * y3_r
|
||||||
"lxvd2x 47, %9, %2 \n\t" // x3_r, x3_i
|
"lxvd2x 47, %9, %2 \n\t" // x3_r, x3_i
|
||||||
|
|
||||||
"xxswapd 8,4 \n\t" // y0_i, y0_r
|
XXSWAPD_S(8,4) // y0_i, y0_r
|
||||||
"xxswapd 9,5 \n\t" // y1_i, y1_r
|
XXSWAPD_S(9,5) // y1_i, y1_r
|
||||||
|
|
||||||
"addi %2, %2, 64 \n\t"
|
"addi %2, %2, 64 \n\t"
|
||||||
"addi %3, %3, 64 \n\t"
|
"addi %3, %3, 64 \n\t"
|
||||||
|
|
||||||
"xxswapd 10,6 \n\t" // y2_i, y2_r
|
XXSWAPD_S(10,6) // y2_i, y2_r
|
||||||
"xxswapd 11,7 \n\t" // y3_i, y3_r
|
XXSWAPD_S(11,7) // y3_i, y3_r
|
||||||
|
|
||||||
"addic. %1, %1, -8 \n\t"
|
"addic. %1, %1, -8 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
|
"xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
|
||||||
"xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i
|
"xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
* Macros for N=4 and M=8
|
* Macros for N=4 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x8', `
|
||||||
|
#else
|
||||||
.macro COPY_4x8
|
.macro COPY_4x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -144,14 +148,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs12, o32, T1
|
stxvd2x vs12, o32, T1
|
||||||
stxvd2x vs13, o48, T1
|
stxvd2x vs13, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=4
|
* Macros for N=4 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x4', `
|
||||||
|
#else
|
||||||
.macro COPY_4x4
|
.macro COPY_4x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -209,14 +221,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs46, o32, T1
|
stxvd2x vs46, o32, T1
|
||||||
stxvd2x vs47, o48, T1
|
stxvd2x vs47, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=2
|
* Macros for N=4 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x2', `
|
||||||
|
#else
|
||||||
.macro COPY_4x2
|
.macro COPY_4x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -254,14 +274,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs38, o32, T1
|
stxvd2x vs38, o32, T1
|
||||||
stxvd2x vs39, o48, T1
|
stxvd2x vs39, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=1
|
* Macros for N=4 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x1', `
|
||||||
|
#else
|
||||||
.macro COPY_4x1
|
.macro COPY_4x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
addi A0, A0, 16
|
addi A0, A0, 16
|
||||||
|
@ -289,14 +317,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxvd2x vs35, o48, T1
|
stxvd2x vs35, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=8
|
* Macros for N=2 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x8', `
|
||||||
|
#else
|
||||||
.macro COPY_2x8
|
.macro COPY_2x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -350,14 +386,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs46, o32, T1
|
stxvd2x vs46, o32, T1
|
||||||
stxvd2x vs47, o48, T1
|
stxvd2x vs47, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=4
|
* Macros for N=2 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x4', `
|
||||||
|
#else
|
||||||
.macro COPY_2x4
|
.macro COPY_2x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -387,14 +431,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs38, o32, T1
|
stxvd2x vs38, o32, T1
|
||||||
stxvd2x vs39, o48, T1
|
stxvd2x vs39, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=2
|
* Macros for N=2 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x2', `
|
||||||
|
#else
|
||||||
.macro COPY_2x2
|
.macro COPY_2x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -414,14 +466,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs34, o32, T1
|
stxvd2x vs34, o32, T1
|
||||||
stxvd2x vs35, o48, T1
|
stxvd2x vs35, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=1
|
* Macros for N=2 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x1', `
|
||||||
|
#else
|
||||||
.macro COPY_2x1
|
.macro COPY_2x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
addi A0, A0, 16
|
addi A0, A0, 16
|
||||||
|
@ -437,14 +497,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxvd2x vs33, o16, T1
|
stxvd2x vs33, o16, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=8
|
* Macros for N=1 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x8', `
|
||||||
|
#else
|
||||||
.macro COPY_1x8
|
.macro COPY_1x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -472,14 +540,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs38, o32, T1
|
stxvd2x vs38, o32, T1
|
||||||
stxvd2x vs39, o48, T1
|
stxvd2x vs39, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=4
|
* Macros for N=1 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x4', `
|
||||||
|
#else
|
||||||
.macro COPY_1x4
|
.macro COPY_1x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -495,14 +571,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs34, o32, T1
|
stxvd2x vs34, o32, T1
|
||||||
stxvd2x vs35, o48, T1
|
stxvd2x vs35, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=2
|
* Macros for N=1 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x2', `
|
||||||
|
#else
|
||||||
.macro COPY_1x2
|
.macro COPY_1x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -514,14 +598,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs32, o0, T1
|
stxvd2x vs32, o0, T1
|
||||||
stxvd2x vs33, o16, T1
|
stxvd2x vs33, o16, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=1
|
* Macros for N=1 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x1', `
|
||||||
|
#else
|
||||||
.macro COPY_1x1
|
.macro COPY_1x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
addi A0, A0, 16
|
addi A0, A0, 16
|
||||||
|
@ -531,5 +623,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxvd2x vs32, o0, T1
|
stxvd2x vs32, o0, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -40,8 +40,8 @@ static void zrot_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT si
|
||||||
|
|
||||||
__asm__
|
__asm__
|
||||||
(
|
(
|
||||||
"xxspltd 36, %x[cos], 0 \n\t" // load c to both dwords
|
XXSPLTD_S(36,%x[cos],0) // load c to both dwords
|
||||||
"xxspltd 37, %x[sin], 0 \n\t" // load s to both dwords
|
XXSPLTD_S(37,%x[sin],0) // load s to both dwords
|
||||||
|
|
||||||
"lxvd2x 32, 0, %[x_ptr] \n\t" // load x
|
"lxvd2x 32, 0, %[x_ptr] \n\t" // load x
|
||||||
"lxvd2x 33, %[i16], %[x_ptr] \n\t"
|
"lxvd2x 33, %[i16], %[x_ptr] \n\t"
|
||||||
|
@ -57,10 +57,10 @@ static void zrot_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT si
|
||||||
"addi %[y_ptr], %[y_ptr], 64 \n\t"
|
"addi %[y_ptr], %[y_ptr], 64 \n\t"
|
||||||
|
|
||||||
"addic. %[temp_n], %[temp_n], -4 \n\t"
|
"addic. %[temp_n], %[temp_n], -4 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"xvmuldp 40, 32, 36 \n\t" // c * x
|
"xvmuldp 40, 32, 36 \n\t" // c * x
|
||||||
"xvmuldp 41, 33, 36 \n\t"
|
"xvmuldp 41, 33, 36 \n\t"
|
||||||
|
@ -124,9 +124,9 @@ static void zrot_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT si
|
||||||
"addi %[y_ptr], %[y_ptr], 128 \n\t"
|
"addi %[y_ptr], %[y_ptr], 128 \n\t"
|
||||||
|
|
||||||
"addic. %[temp_n], %[temp_n], -4 \n\t"
|
"addic. %[temp_n], %[temp_n], -4 \n\t"
|
||||||
"bgt+ 1b \n"
|
"bgt+ one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"xvmuldp 40, 32, 36 \n\t" // c * x
|
"xvmuldp 40, 32, 36 \n\t" // c * x
|
||||||
"xvmuldp 41, 33, 36 \n\t"
|
"xvmuldp 41, 33, 36 \n\t"
|
||||||
|
|
|
@ -58,8 +58,8 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
|
||||||
"dcbt 0, %2 \n\t"
|
"dcbt 0, %2 \n\t"
|
||||||
|
|
||||||
"xsnegdp 33, %x16 \n\t" // -alpha_i
|
"xsnegdp 33, %x16 \n\t" // -alpha_i
|
||||||
"xxspltd 32, %x15, 0 \n\t" // alpha_r , alpha_r
|
XXSPLTD_S(32,%x15,0) // alpha_r , alpha_r
|
||||||
"xxmrghd 33, 33, %x16 \n\t" // -alpha_i , alpha_i
|
XXMRGHD_S(33,33,%x16) // -alpha_i , alpha_i
|
||||||
|
|
||||||
"lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
|
"lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
|
||||||
"lxvd2x 41, %17, %2 \n\t"
|
"lxvd2x 41, %17, %2 \n\t"
|
||||||
|
@ -73,10 +73,10 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
|
||||||
"addi %2, %2, 128 \n\t"
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -8 \n\t"
|
"addic. %1, %1, -8 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
|
"xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
|
||||||
"xvmuldp 49, 41, 32 \n\t"
|
"xvmuldp 49, 41, 32 \n\t"
|
||||||
|
@ -87,14 +87,14 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
|
||||||
"xvmuldp %x5, 46, 32 \n\t"
|
"xvmuldp %x5, 46, 32 \n\t"
|
||||||
"xvmuldp %x6, 47, 32 \n\t"
|
"xvmuldp %x6, 47, 32 \n\t"
|
||||||
|
|
||||||
"xxswapd %x7, 40 \n\t"
|
XXSWAPD_S(%x7,40)
|
||||||
"xxswapd %x8, 41 \n\t"
|
XXSWAPD_S(%x8,41)
|
||||||
"xxswapd %x9, 42 \n\t"
|
XXSWAPD_S(%x9,42)
|
||||||
"xxswapd %x10, 43 \n\t"
|
XXSWAPD_S(%x10,43)
|
||||||
"xxswapd %x11, 44 \n\t"
|
XXSWAPD_S(%x11,44)
|
||||||
"xxswapd %x12, 45 \n\t"
|
XXSWAPD_S(%x12,45)
|
||||||
"xxswapd %x13, 46 \n\t"
|
XXSWAPD_S(%x13,46)
|
||||||
"xxswapd %x14, 47 \n\t"
|
XXSWAPD_S(%x14,47)
|
||||||
|
|
||||||
"xvmuldp %x7, %x7, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
|
"xvmuldp %x7, %x7, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
|
||||||
"xvmuldp %x8, %x8, 33 \n\t"
|
"xvmuldp %x8, %x8, 33 \n\t"
|
||||||
|
@ -147,9 +147,9 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
|
||||||
"addi %2, %2, 256 \n\t"
|
"addi %2, %2, 256 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -8 \n\t"
|
"addic. %1, %1, -8 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
|
"xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
|
||||||
"xvmuldp 49, 41, 32 \n\t"
|
"xvmuldp 49, 41, 32 \n\t"
|
||||||
|
@ -160,14 +160,14 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
|
||||||
"xvmuldp %x5, 46, 32 \n\t"
|
"xvmuldp %x5, 46, 32 \n\t"
|
||||||
"xvmuldp %x6, 47, 32 \n\t"
|
"xvmuldp %x6, 47, 32 \n\t"
|
||||||
|
|
||||||
"xxswapd %x7, 40 \n\t"
|
XXSWAPD_S(%x7,40)
|
||||||
"xxswapd %x8, 41 \n\t"
|
XXSWAPD_S(%x8,41)
|
||||||
"xxswapd %x9, 42 \n\t"
|
XXSWAPD_S(%x9,42)
|
||||||
"xxswapd %x10, 43 \n\t"
|
XXSWAPD_S(%x10,43)
|
||||||
"xxswapd %x11, 44 \n\t"
|
XXSWAPD_S(%x11,44)
|
||||||
"xxswapd %x12, 45 \n\t"
|
XXSWAPD_S(%x12,45)
|
||||||
"xxswapd %x13, 46 \n\t"
|
XXSWAPD_S(%x13,46)
|
||||||
"xxswapd %x14, 47 \n\t"
|
XXSWAPD_S(%x14,47)
|
||||||
|
|
||||||
"addi %2, %2, -128 \n\t"
|
"addi %2, %2, -128 \n\t"
|
||||||
|
|
||||||
|
|
|
@ -40,8 +40,8 @@ zswap_kernel_16 (long n, double *x, double *y)
|
||||||
{
|
{
|
||||||
__asm__
|
__asm__
|
||||||
(
|
(
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
"lxvd2x 32, 0, %4 \n\t"
|
"lxvd2x 32, 0, %4 \n\t"
|
||||||
"lxvd2x 33, %5, %4 \n\t"
|
"lxvd2x 33, %5, %4 \n\t"
|
||||||
"lxvd2x 34, %6, %4 \n\t"
|
"lxvd2x 34, %6, %4 \n\t"
|
||||||
|
@ -130,7 +130,7 @@ zswap_kernel_16 (long n, double *x, double *y)
|
||||||
|
|
||||||
"addi %4, %4, 128 \n\t"
|
"addi %4, %4, 128 \n\t"
|
||||||
"addic. %2, %2, -16 \n\t"
|
"addic. %2, %2, -16 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
|
"#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
|
||||||
:
|
:
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue