From 3dc6b26eff770dc3a74bb370500f5fb99ce540d1 Mon Sep 17 00:00:00 2001
From: Kavana Bhat
Date: Tue, 20 Aug 2019 06:51:35 -0500
Subject: [PATCH 01/27] AIX changes for Power8

---
 common_power.h | 29 +
 kernel/Makefile.L3 | 238 +++-
 kernel/power/casum_microk_power8.c | 10 +-
 kernel/power/ccopy_microk_power8.c | 10 +-
 kernel/power/cgemm_macros_8x4_power8.S | 768 +++++++++++
 kernel/power/cgemm_tcopy_macros_8_power8.S | 96 ++
 kernel/power/crot.c | 10 +-
 kernel/power/cswap_microk_power8.c | 6 +-
 kernel/power/ctrmm_macros_8x4_power8.S | 768 +++++++++++
 kernel/power/dasum_microk_power8.c | 12 +-
 kernel/power/daxpy_microk_power8.c | 10 +-
 kernel/power/dcopy_microk_power8.c | 10 +-
 kernel/power/ddot_microk_power8.c | 12 +-
 kernel/power/dgemm_macros_16x4_power8.S | 976 ++++++++++++++
 kernel/power/dgemm_ncopy_macros_4_power8.S | 120 ++
 kernel/power/dgemm_tcopy_macros_16_power8.S | 120 ++
 kernel/power/dgemv_n_microk_power8.c | 26 +-
 kernel/power/dgemv_t.c | 36 +-
 kernel/power/drot_microk_power8.c | 14 +-
 kernel/power/dscal_microk_power8.c | 18 +-
 kernel/power/dswap_microk_power8.c | 6 +-
 kernel/power/dtrmm_macros_16x4_power8.S | 960 ++++++++++++++
 kernel/power/dtrsm_macros_LT_16x4_power8.S | 862 ++++++++----
 kernel/power/idamax.c | 64 +-
 kernel/power/idamin.c | 64 +-
 kernel/power/izamax.c | 136 +-
 kernel/power/izamin.c | 136 +-
 kernel/power/lock.c | 4 +-
 kernel/power/sasum_microk_power8.c | 10 +-
 kernel/power/scopy_microk_power8.c | 10 +-
 kernel/power/sdot_microk_power8.c | 10 +-
 kernel/power/sgemm_macros_16x8_power8.S | 1296 +++++++++++++++++++
 kernel/power/sgemm_tcopy_macros_16_power8.S | 120 ++
 kernel/power/sgemm_tcopy_macros_8_power8.S | 96 ++
 kernel/power/srot_microk_power8.c | 10 +-
 kernel/power/sscal_microk_power8.c | 16 +-
 kernel/power/sswap_microk_power8.c | 6 +-
 kernel/power/strmm_macros_16x8_power8.S | 1280 ++++++++++++++++++
 kernel/power/zasum_microk_power8.c | 12 +-
 kernel/power/zaxpy_microk_power8.c | 46 +-
 kernel/power/zcopy_microk_power8.c | 10 +-
 kernel/power/zdot_microk_power8.c | 42 +-
 kernel/power/zgemm_macros_8x2_power8.S | 830 +++++++++---
 kernel/power/zgemm_tcopy_macros_8_power8.S | 96 ++
 kernel/power/zrot.c | 14 +-
 kernel/power/zscal_microk_power8.c | 46 +-
 kernel/power/zswap_microk_power8.c | 6 +-
 kernel/power/ztrmm_macros_8x2_power8.S | 782 +++++++++--
 48 files changed, 9263 insertions(+), 996 deletions(-)

diff --git a/common_power.h b/common_power.h
index 889205c75..76b9f0f32 100644
--- a/common_power.h
+++ b/common_power.h
@@ -39,6 +39,35 @@
 #ifndef COMMON_POWER
 #define COMMON_POWER
 
+#define str(x) #x
+
+#ifdef OS_AIX
+#define XXSPLTD(T,A,z) xxpermdi T, A, A, 0b##z##z
+#define XXMRGHD(T,A,B) xxpermdi T, A, B, 0b00
+#define XXMRGLD(T,A,B) xxpermdi T, A, B, 0b11
+#define XXSWAPD(T,A) xxpermdi T, A, A, 0b10
+#define XVMOVDP(T,A) xvcpsgndp T, A, A
+
+#define XXSPLTD_S(T,A,z) "xxpermdi " str(T) ", " str(A) ", " str(A) ", 0b" str(z ## z) " \n\t"
+#define XXMRGHD_S(T,A,B) "xxpermdi " str(T) ", " str(A) ", " str(B) ", 0b00 \n\t"
+#define XXMRGLD_S(T,A,B) "xxpermdi " str(T) ", " str(A) ", " str(B) ", 0b11 \n\t"
+#define XXSWAPD_S(T,A) "xxpermdi " str(T) ", " str(A) ", " str(A) ", 0b10 \n\t"
+
+#else
+#define XXSPLTD(T,A,z) xxspltd T, A, z
+#define XXMRGHD(T,A,B) xxmrghd T, A, B
+#define XXMRGLD(T,A,B) xxmrgld T, A, B
+#define XXSWAPD(T,A) xxswapd T, A
+#define XVMOVDP(T,A) xvmovdp T, A
+
+#define XXSPLTD_S(T,A,z) "xxspltd T, A, z \n\t"
+#define XXMRGHD_S(T,A,B) "xxmrghd T, A, B \n\t"
+#define XXMRGLD_S(T,A,B) "xxmrgld T, A, B \n\t"
+#define XXSWAPD_S(T,A) "xxswapd T, A"
+
+#endif
+
+
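[Editor's note, not part of the patch: the _S variants above are meant to be pasted into GNU extended inline asm in the C micro-kernels, with str() stringizing the operands so that AIX builds emit a plain xxpermdi encoding. A minimal, self-contained sketch of that intended usage follows; the function name, the choice of vs40 and the surrounding load/store are illustrative only and do not appear in this patch. Note also that on the non-AIX branch the operands sit inside the string literal ("xxswapd T, A"), so they are not substituted by the preprocessor there; only the OS_AIX definitions shown in the sketch perform real argument substitution.]

/* Illustrative sketch only: mirrors the OS_AIX definitions above so the
 * snippet is self-contained; in the tree these come from common_power.h. */
#define str(x) #x
#define XXSWAPD_S(T,A) "xxpermdi " str(T) ", " str(A) ", " str(A) ", 0b10 \n\t"

/* Swap the two doublewords of the VSX register holding x[0] and x[1].
 * vs40 and the lxvd2x/stxvd2x pair are arbitrary choices for the demo. */
static void xxswapd_demo(double *x)
{
    __asm__ __volatile__ (
        "lxvd2x  40, 0, %0            \n\t"  /* vs40 <- the two doubles at x */
        XXSWAPD_S(40, 40)                    /* vs40 <- doublewords swapped  */
        "stxvd2x 40, 0, %0            \n\t"  /* write the swapped pair back  */
        :
        : "r" (x)
        : "vs40", "memory");
}
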
#if defined(POWER8) || defined(POWER9) #define MB __asm__ __volatile__ ("eieio":::"memory") #define WMB __asm__ __volatile__ ("eieio":::"memory") diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index f83def47b..ed8ae406f 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -57,8 +57,6 @@ USE_TRMM = 1 endif - - SKERNELOBJS += \ sgemm_kernel$(TSUFFIX).$(SUFFIX) \ $(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \ @@ -436,7 +434,10 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmotcopy.s + m4 sgemmotcopy.s > sgemmotcopy_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@ + rm sgemmotcopy.s sgemmotcopy_nomacros.s ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) @@ -444,12 +445,17 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - + $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmitcopy.s + m4 sgemmitcopy.s > sgemmitcopy_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@ + rm sgemmitcopy.s sgemmitcopy_nomacros.s endif $(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_ncopy.s + m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@ + rm dgemm_ncopy.s dgemm_ncopy_nomacros.s $(KDIR)$(DGEMMOTCOPYOBJ) : $(KERNELDIR)/$(DGEMMOTCOPY) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ @@ -460,7 +466,10 @@ $(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_itcopy.s + m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@ + rm dgemm_itcopy.s dgemm_itcopy_nomacros.s endif @@ -485,10 +494,16 @@ endif endif $(KDIR)$(CGEMMONCOPYOBJ) : $(KERNELDIR)/$(CGEMMONCOPY) +# $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o cgemm_oncopy.s +# m4 cgemm_oncopy.s > cgemm_oncopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +# rm cgemm_oncopy.s cgemm_oncopy_nomacros.s $(KDIR)$(CGEMMOTCOPYOBJ) : $(KERNELDIR)/$(CGEMMOTCOPY) +# $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o cgemm_otcopy.s +# m4 cgemm_otcopy.s > cgemm_otcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +# rm cgemm_otcopy.s cgemm_otcopy_nomacros.s ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) @@ -496,7 +511,10 @@ $(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -E $< -o cgemm_itcopy.s + m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@ + rm cgemm_itcopy.s cgemm_itcopy_nomacros.s endif @@ -512,7 +530,10 @@ $(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o 
zgemm_itcopy.s + m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@ + rm zgemm_itcopy.s zgemm_itcopy_nomacros.s endif @@ -537,37 +558,67 @@ endif endif $(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemm_kernel$(TSUFFIX).s + m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@ + rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s $(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_kernel$(TSUFFIX).s + m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@ + rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s $(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEPEND) $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ $(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ + $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNN $< -o cgemm_kernel_n.s + m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@ + rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s $(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ + $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCN $< -o cgemm_kernel_l.s + m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@ + rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s $(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ + $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s + m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ + rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s $(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ + $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCC $< -o cgemm_kernel_b.s + m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@ + rm cgemm_kernel_b.s cgemm_kernel_b_nomacros.s $(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ + $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNN $< -o zgemm_kernel_n.s + m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@ + rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s $(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ + $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCN $< -o zgemm_kernel_l.s + m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@ + rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s $(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) 
$(ZGEMMDEPEND) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ + $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNC $< -o zgemm_kernel_r.s + m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@ + rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s $(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ + $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCC $< -o zgemm_kernel_b.s + m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@ + rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s $(KDIR)xgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@ @@ -584,28 +635,56 @@ $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD ifdef USE_TRMM $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o strmmkernel_ln.s + m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@ + rm strmmkernel_ln.s strmmkernel_ln_nomacros.s $(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o strmmkernel_lt.s + m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@ + rm strmmkernel_lt.s strmmkernel_lt_nomacros.s $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o strmmkernel_rn.s + m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@ + rm strmmkernel_rn.s strmmkernel_rn_nomacros.s $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s + m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ + rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o dtrmm_kernel_ln.s +# $(CC) $(CFLAGS) -E $< -o dtrmm_kernel_ln.s + m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@ + rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o dtrmm_kernel_lt.s +# $(CC) $(CFLAGS) -E $< -o dtrmm_kernel_lt.s + m4 dtrmm_kernel_lt.s > 
dtrmm_kernel_lt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@ + rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o dtrmm_kernel_rn.s +# $(CC) $(CFLAGS) -E $< -o dtrmm_kernel_rn.s + m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@ + rm dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o dtrmm_kernel_rt.s +# $(CC) $(CFLAGS) -E $< -o dtrmm_kernel_rt.s + m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@ + rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s $(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ @@ -620,52 +699,100 @@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_ln.s + m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@ + rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s $(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_lt.s + m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@ + rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s $(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lr.s + m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ctrmm_kernel_lr_nomacros.s -o $@ + rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s $(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lc.s + m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@ + rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s $(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX 
-ULEFT -UTRANSA -UCONJ -DNN $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rn.s + m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@ + rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s $(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rt.s + m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@ + rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ctrmm_kernel_rr.s + m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@ + rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ctrmm_kernel_RC.s + m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@ + rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_ln.s + m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@ + rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_lt.s + m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@ + rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lr.s + m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@ + rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lc.s + m4 ztrmm_kernel_lc.s 
>ztrmm_kernel_lc_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@ + rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rn.s + m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@ + rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rt.s + m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@ + rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rr.s + m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@ + rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rc.s + m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@ + rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s else $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ @@ -677,7 +804,10 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s + m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ + rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ @@ -801,10 +931,16 @@ $(KDIR)strsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RT) $(ST $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@ $(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DTRSMDEPEND) +# $(CC) $(CFLAGS) -E $< -o dtrsm_kernel_ln.s +# m4 dtrsm_kernel_ln.s > dtrsm_kernel_ln_nomacros.s $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@ +# rm dtrsm_kernel_ln.s dtrsm_kernel_ln_nomacros.s 
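[Editor's note, not part of the patch: every kernel rule touched in this Makefile follows the same three-step shape. First run the preprocessor only (-E) with the rule's own -D/-U switches, then run the result through m4 to expand the define() macros that replace the GAS .macro directives in the Power8 kernels, then assemble the flattened *_nomacros.s file and remove the intermediates. A generic sketch of that shape is below; EXAMPLEKERNEL and $(FLAGS) are placeholders for whichever kernel file and define set a given rule uses, and no such generic rule is added by the patch itself.]

$(KDIR)example_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(EXAMPLEKERNEL)
	$(CC) $(CFLAGS) -E $(FLAGS) $< -o example_kernel.s
	m4 example_kernel.s > example_kernel_nomacros.s
	$(CC) $(CFLAGS) -c $(FLAGS) example_kernel_nomacros.s -o $@
	rm example_kernel.s example_kernel_nomacros.s
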
$(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@ + $(CC) $(CFLAGS) -E -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o dtrsm_kernel_lt.s + m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@ + rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s $(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@ @@ -1940,7 +2076,7 @@ $(SGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMITCOPY) endif -$(DGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMONCOPY) +$(D cgemm_kernel_r_nomacros.s + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ + rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s $(KDIR)cgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ @@ -2083,7 +2222,10 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ $(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s + m4 strmmkernel_rn.s > strmm_kernel_rt_nomacros.s + $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ + rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ diff --git a/kernel/power/casum_microk_power8.c b/kernel/power/casum_microk_power8.c index 7d12c9885..91d53ffc3 100644 --- a/kernel/power/casum_microk_power8.c +++ b/kernel/power/casum_microk_power8.c @@ -68,10 +68,10 @@ static float casum_kernel_16 (long n, float *x) "addi %2, %2, 128 \n\t" "addic. %1, %1, -16 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvabssp 48, 40 \n\t" "xvabssp 49, 41 \n\t" @@ -108,9 +108,9 @@ static float casum_kernel_16 (long n, float *x) "xvaddsp 38, 38, %x5 \n\t" "xvaddsp 39, 39, %x6 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvabssp 48, 40 \n\t" "xvabssp 49, 41 \n\t" diff --git a/kernel/power/ccopy_microk_power8.c b/kernel/power/ccopy_microk_power8.c index 613c4d286..6a7886e6f 100644 --- a/kernel/power/ccopy_microk_power8.c +++ b/kernel/power/ccopy_microk_power8.c @@ -62,10 +62,10 @@ static void ccopy_kernel_32 (long n, float *x, float *y) "addi %2, %2, 128 \n\t" "addic. %1, %1, -32 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "stxvd2x 32, 0, %3 \n\t" "stxvd2x 33, %5, %3 \n\t" @@ -108,9 +108,9 @@ static void ccopy_kernel_32 (long n, float *x, float *y) "addi %2, %2, 128 \n\t" "addic. %1, %1, -32 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "stxvd2x 32, 0, %3 \n\t" "stxvd2x 33, %5, %3 \n\t" diff --git a/kernel/power/cgemm_macros_8x4_power8.S b/kernel/power/cgemm_macros_8x4_power8.S index 9a18cb189..46108bbb4 100644 --- a/kernel/power/cgemm_macros_8x4_power8.S +++ b/kernel/power/cgemm_macros_8x4_power8.S @@ -83,7 +83,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
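[Editor's note, not part of the patch: the inline-asm edits in the casum/ccopy hunks above (".p2align 5" becoming ".align 5", and the numeric local labels "1:"/"2:" with their "1b"/"2f" branches becoming "one%=:"/"two%=") keep the loop structure identical while avoiding GNU-as-only syntax that the AIX assembler does not accept. The %= operand makes the compiler append a number unique to each asm instance, so the named labels stay distinct even if the block is duplicated by inlining. A trivial, self-contained sketch of the same rewrite; the function and register choices are illustrative only.]

/* Count n down to zero using the AIX-friendly label style adopted above. */
static long countdown_demo(long n)
{
    __asm__ __volatile__ (
        ".align 5              \n"    /* was ".p2align 5" */
        "one%=:                \n\t"  /* was "1:"         */
        "addic. %0, %0, -1     \n\t"  /* n--, result sets CR0 */
        "bgt    one%=          \n"    /* was "bgt 1b"     */
        : "+r" (n)
        :
        : "cr0");
    return n;   /* 0 on return for any n >= 1 */
}
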
* Macros for N=4 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x8_1', ` +#else .macro LOAD4x8_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 @@ -107,9 +111,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_I1', ` +#else .macro KERNEL4x8_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -172,9 +184,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_1', ` +#else .macro KERNEL4x8_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -237,9 +257,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_2', ` +#else .macro KERNEL4x8_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -302,9 +330,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs7, vs23 // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_E2', ` +#else .macro KERNEL4x8_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -344,9 +380,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs7, vs23 // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUBI1', ` +#else .macro KERNEL4x8_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -409,9 +453,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUB1', ` +#else .macro KERNEL4x8_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -474,9 +526,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x8', ` +#else .macro SAVE4x8 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -1546,14 +1606,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x4_1', ` +#else .macro LOAD4x4_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 @@ -1575,9 +1643,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_I1', ` +#else .macro KERNEL4x4_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -1622,9 +1698,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_1', ` +#else .macro KERNEL4x4_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -1669,9 +1753,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_2', ` +#else .macro KERNEL4x4_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -1716,9 +1808,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_E2', ` +#else .macro KERNEL4x4_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -1742,9 +1842,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUBI1', ` +#else .macro KERNEL4x4_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -1789,9 +1897,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUB1', ` +#else .macro KERNEL4x4_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -1836,9 +1952,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x4', ` +#else .macro SAVE4x4 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -2388,14 +2512,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x2_1', ` +#else .macro LOAD4x2_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -2416,9 +2548,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_I1', ` +#else .macro KERNEL4x2_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -2454,9 +2594,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_1', ` +#else .macro KERNEL4x2_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -2492,9 +2640,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_2', ` +#else .macro KERNEL4x2_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -2530,9 +2686,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs39, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_E2', ` +#else .macro KERNEL4x2_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -2548,9 +2712,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUBI1', ` +#else .macro KERNEL4x2_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -2586,9 +2758,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUB1', ` +#else .macro KERNEL4x2_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -2624,9 +2804,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x2', ` +#else .macro SAVE4x2 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -2916,14 +3104,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x1_1', ` +#else .macro LOAD4x1_1 +#endif lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i @@ -2945,9 +3141,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_I1', ` +#else .macro KERNEL4x1_I1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -2992,9 +3196,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs47, vs1, vs14 // a0_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_1', ` +#else .macro KERNEL4x1_1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -3039,9 +3251,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs1, vs14 // a0_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_2', ` +#else .macro KERNEL4x1_2 +#endif lxsspx vs0, o0, AO // load a0_r @@ -3086,9 +3306,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs5, vs22 // a4_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_E2', ` +#else .macro KERNEL4x1_E2 +#endif xsmaddadp vs32, vs4, vs16 // a4_r*b0_r @@ -3112,9 +3340,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs5, vs22 // a4_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUBI1', ` +#else .macro KERNEL4x1_SUBI1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -3159,9 +3395,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmuldp vs47, vs1, vs14 // a0_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUB1', ` +#else .macro KERNEL4x1_SUB1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -3206,9 +3450,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs1, vs14 // a0_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x1', ` +#else .macro SAVE4x1 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -3382,14 +3634,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x8_1', ` +#else .macro LOAD2x8_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 @@ -3406,9 +3666,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_I1', ` +#else .macro KERNEL2x8_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -3446,9 +3714,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_1', ` +#else .macro KERNEL2x8_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -3486,9 +3762,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_2', ` +#else .macro KERNEL2x8_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -3526,9 +3810,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_E2', ` +#else .macro KERNEL2x8_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -3550,9 +3842,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUBI1', ` +#else .macro KERNEL2x8_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -3590,9 +3890,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUB1', ` +#else .macro KERNEL2x8_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -3630,9 +3938,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x8', ` +#else .macro SAVE2x8 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -4170,14 +4486,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x4_1', ` +#else .macro LOAD2x4_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 @@ -4192,9 +4516,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_I1', ` +#else .macro KERNEL2x4_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -4222,9 +4554,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_1', ` +#else .macro KERNEL2x4_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -4252,9 +4592,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_2', ` +#else .macro KERNEL2x4_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4282,9 +4630,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_E2', ` +#else .macro KERNEL2x4_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -4298,9 +4654,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUBI1', ` +#else .macro KERNEL2x4_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4328,9 +4692,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUB1', ` +#else .macro KERNEL2x4_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4358,9 +4730,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x4', ` +#else .macro SAVE2x4 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -4638,14 +5018,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x2_1', ` +#else .macro LOAD2x2_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4659,9 +5047,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_I1', ` +#else .macro KERNEL2x2_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -4684,9 +5080,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_1', ` +#else .macro KERNEL2x2_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -4709,9 +5113,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_2', ` +#else .macro KERNEL2x2_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4734,9 +5146,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_E2', ` +#else .macro KERNEL2x2_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -4746,9 +5166,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUBI1', ` +#else .macro KERNEL2x2_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4771,9 +5199,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUB1', ` +#else .macro KERNEL2x2_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4796,9 +5232,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x2', ` +#else .macro SAVE2x2 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -4946,14 +5390,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x1_1', ` +#else .macro LOAD2x1_1 +#endif lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i @@ -4968,9 +5420,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_I1', ` +#else .macro KERNEL2x1_I1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -4998,9 +5458,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs39, vs1, vs10 // a0_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_1', ` +#else .macro KERNEL2x1_1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -5028,9 +5496,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs1, vs10 // a0_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_2', ` +#else .macro KERNEL2x1_2 +#endif lxsspx vs0, o0, AO // load a0_r @@ -5058,9 +5534,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmaddadp vs39, vs5, vs18 // a4_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_E2', ` +#else .macro KERNEL2x1_E2 +#endif xsmaddadp vs32, vs4, vs16 // a4_r*b0_r @@ -5074,9 +5558,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs5, vs18 // a4_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUBI1', ` +#else .macro KERNEL2x1_SUBI1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -5104,9 +5596,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs39, vs1, vs10 // a0_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUB1', ` +#else .macro KERNEL2x1_SUB1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -5134,9 +5634,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs1, vs10 // a0_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x1', ` +#else .macro SAVE2x1 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -5226,14 +5734,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x8_1', ` +#else .macro LOAD1x8_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 @@ -5247,9 +5763,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_I1', ` +#else .macro KERNEL1x8_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -5275,9 +5799,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_1', ` +#else .macro KERNEL1x8_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -5303,9 +5835,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_2', ` +#else .macro KERNEL1x8_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5331,9 +5871,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_E2', ` +#else .macro KERNEL1x8_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -5346,9 +5894,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUBI1', ` +#else .macro KERNEL1x8_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5374,9 +5930,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUB1', ` +#else .macro KERNEL1x8_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5402,9 +5966,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x8', ` +#else .macro SAVE1x8 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -5676,14 +6248,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x4_1', ` +#else .macro LOAD1x4_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 @@ -5695,9 +6275,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_I1', ` +#else .macro KERNEL1x4_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -5717,9 +6305,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_1', ` +#else .macro KERNEL1x4_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -5739,9 +6335,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_2', ` +#else .macro KERNEL1x4_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5761,9 +6365,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_E2', ` +#else .macro KERNEL1x4_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -5772,9 +6384,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUBI1', ` +#else .macro KERNEL1x4_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5794,9 +6414,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUB1', ` +#else .macro KERNEL1x4_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5816,9 +6444,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x4', ` +#else .macro SAVE1x4 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -5960,14 +6596,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x2_1', ` +#else .macro LOAD1x2_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5978,9 +6622,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_I1', ` +#else .macro KERNEL1x2_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -5997,9 +6649,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_1', ` +#else .macro KERNEL1x2_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -6016,9 +6676,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_2', ` +#else .macro KERNEL1x2_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -6035,18 +6703,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_E2', ` +#else .macro KERNEL1x2_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUBI1', ` +#else .macro KERNEL1x2_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -6063,9 +6747,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUB1', ` +#else .macro KERNEL1x2_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -6082,9 +6774,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x2', ` +#else .macro SAVE1x2 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -6161,14 +6861,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x1_1', ` +#else .macro LOAD1x1_1 +#endif lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i @@ -6180,9 +6888,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_I1', ` +#else .macro KERNEL1x1_I1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -6202,9 +6918,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmuldp vs35, vs1, vs8 // a0_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_1', ` +#else .macro KERNEL1x1_1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -6224,9 +6948,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs1, vs8 // a0_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_2', ` +#else .macro KERNEL1x1_2 +#endif lxsspx vs0, o0, AO // load a0_r @@ -6246,9 +6978,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs5, vs16 // a4_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_E2', ` +#else .macro KERNEL1x1_E2 +#endif xsmaddadp vs32, vs4, vs16 // a4_r*b0_r @@ -6257,9 +6997,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs5, vs16 // a4_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUBI1', ` +#else .macro KERNEL1x1_SUBI1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -6279,9 +7027,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs35, vs1, vs8 // a0_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUB1', ` +#else .macro KERNEL1x1_SUB1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -6301,9 +7057,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs1, vs8 // a0_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x1', ` +#else .macro SAVE1x1 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -6351,5 +7115,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/cgemm_tcopy_macros_8_power8.S b/kernel/power/cgemm_tcopy_macros_8_power8.S index 03fda2766..64bf8dd99 100644 --- a/kernel/power/cgemm_tcopy_macros_8_power8.S +++ b/kernel/power/cgemm_tcopy_macros_8_power8.S @@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=4 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x8', ` +#else .macro COPY_4x8 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -93,13 +97,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs46, o32, T1 stxvw4x vs47, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x4', ` +#else .macro COPY_4x4 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -133,13 +145,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs38, o32, T1 stxvw4x vs39, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x2', ` +#else .macro COPY_4x2 +#endif lxvw4x vs32, o0, A0 addi A0, A0, 16 @@ -163,13 +183,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stxvw4x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x1', ` +#else .macro COPY_4x1 +#endif lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 @@ -207,13 +235,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs38, o0, T1 stxsspx vs39, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x8', ` +#else .macro COPY_2x8 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -241,13 +277,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs38, o32, T1 stxvw4x vs39, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x4', ` +#else .macro COPY_2x4 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -265,13 +309,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs34, o32, T1 stxvw4x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x2', ` +#else .macro COPY_2x2 +#endif lxvw4x vs32, o0, A0 addi A0, A0, 16 @@ -285,13 +337,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs33, o16, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x1', ` +#else .macro COPY_2x1 +#endif lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 @@ -311,13 +371,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs34, o0, T1 stxsspx vs35, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x8', ` +#else .macro COPY_1x8 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -332,13 +400,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs34, o32, T1 stxvw4x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x4', ` +#else .macro COPY_1x4 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -349,13 +425,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stxvw4x vs32, o0, T1 stxvw4x vs33, o16, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x2', ` +#else .macro COPY_1x2 +#endif lxvw4x vs32, o0, A0 addi A0, A0, 16 @@ -364,13 +448,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs32, o0, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x1', ` +#else .macro COPY_1x1 +#endif lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 @@ -381,5 +473,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs32, o0, T1 stxsspx vs33, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/crot.c b/kernel/power/crot.c index 959a9eda0..2a5835546 100644 --- a/kernel/power/crot.c +++ b/kernel/power/crot.c @@ -56,9 +56,9 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s) "addi %[x_ptr], %[x_ptr], 64 \n\t" "addi %[y_ptr], %[y_ptr], 64 \n\t" "addic. %[temp_n], %[temp_n], -8 \n\t" - "ble 2f \n\t" - ".p2align 5 \n\t" - "1: \n\t" + "ble two%= \n\t" + ".align 5 \n\t" + "one%=: \n\t" "xvmulsp 40, 32, 36 \n\t" // c * x "xvmulsp 41, 33, 36 \n\t" "xvmulsp 42, 34, 36 \n\t" @@ -104,8 +104,8 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s) "addi %[x_ptr], %[x_ptr], 128 \n\t" "addi %[y_ptr], %[y_ptr], 128 \n\t" "addic. %[temp_n], %[temp_n], -8 \n\t" - "bgt 1b \n\t" - "2: \n\t" + "bgt one%= \n\t" + "two%=: \n\t" "xvmulsp 40, 32, 36 \n\t" // c * x "xvmulsp 41, 33, 36 \n\t" "xvmulsp 42, 34, 36 \n\t" diff --git a/kernel/power/cswap_microk_power8.c b/kernel/power/cswap_microk_power8.c index 8d7d0c0b9..829800230 100644 --- a/kernel/power/cswap_microk_power8.c +++ b/kernel/power/cswap_microk_power8.c @@ -39,8 +39,8 @@ static void cswap_kernel_32 (long n, float *x, float *y) { __asm__ ( - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "lxvd2x 32, 0, %4 \n\t" "lxvd2x 33, %5, %4 \n\t" @@ -131,7 +131,7 @@ static void cswap_kernel_32 (long n, float *x, float *y) "addi %4, %4, 128 \n\t" "addic. %2, %2, -32 \n\t" - "bgt 1b \n" + "bgt one%= \n" "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" : diff --git a/kernel/power/ctrmm_macros_8x4_power8.S b/kernel/power/ctrmm_macros_8x4_power8.S index 48a21252c..922cab57a 100644 --- a/kernel/power/ctrmm_macros_8x4_power8.S +++ b/kernel/power/ctrmm_macros_8x4_power8.S @@ -83,7 +83,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=4 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x8_1', ` +#else .macro LOAD4x8_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -113,9 +117,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_I1', ` +#else .macro KERNEL4x8_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -184,9 +196,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_1', ` +#else .macro KERNEL4x8_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -255,9 +275,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_2', ` +#else .macro KERNEL4x8_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -326,9 +354,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs7, vs23 // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_E2', ` +#else .macro KERNEL4x8_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -368,9 +404,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs7, vs23 // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUBI1', ` +#else .macro KERNEL4x8_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -439,9 +483,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUB1', ` +#else .macro KERNEL4x8_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -510,9 +562,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x8', ` +#else .macro SAVE4x8 +#endif mr T1, CO @@ -1597,14 +1657,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x4_1', ` +#else .macro LOAD4x4_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -1630,9 +1698,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_I1', ` +#else .macro KERNEL4x4_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -1681,9 +1757,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_1', ` +#else .macro KERNEL4x4_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -1732,9 +1816,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_2', ` +#else .macro KERNEL4x4_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -1783,9 +1875,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs47, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_E2', ` +#else .macro KERNEL4x4_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -1809,9 +1909,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUBI1', ` +#else .macro KERNEL4x4_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -1860,9 +1968,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUB1', ` +#else .macro KERNEL4x4_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -1911,9 +2027,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x4', ` +#else .macro SAVE4x4 +#endif mr T1, CO @@ -2470,14 +2594,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x2_1', ` +#else .macro LOAD4x2_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -2501,9 +2633,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_I1', ` +#else .macro KERNEL4x2_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -2542,9 +2682,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_1', ` +#else .macro KERNEL4x2_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -2583,9 +2731,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_2', ` +#else .macro KERNEL4x2_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -2624,9 +2780,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_E2', ` +#else .macro KERNEL4x2_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -2642,9 +2806,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUBI1', ` +#else .macro KERNEL4x2_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -2683,9 +2855,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUB1', ` +#else .macro KERNEL4x2_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -2724,9 +2904,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x2', ` +#else .macro SAVE4x2 +#endif mr T1, CO @@ -3019,14 +3207,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x1_1', ` +#else .macro LOAD4x1_1 +#endif lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i @@ -3055,9 +3251,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_I1', ` +#else .macro KERNEL4x1_I1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -3109,9 +3313,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs47, vs1, vs14 // a0_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_1', ` +#else .macro KERNEL4x1_1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -3163,9 +3375,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs1, vs14 // a0_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_2', ` +#else .macro KERNEL4x1_2 +#endif lxsspx vs0, o0, AO // load a0_r @@ -3217,9 +3437,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs5, vs22 // a4_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_E2', ` +#else .macro KERNEL4x1_E2 +#endif xsmaddadp vs32, vs4, vs16 // a4_r*b0_r @@ -3243,9 +3471,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs5, vs22 // a4_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUBI1', ` +#else .macro KERNEL4x1_SUBI1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -3297,9 +3533,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs47, vs1, vs14 // a0_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUB1', ` +#else .macro KERNEL4x1_SUB1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -3351,9 +3595,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs1, vs14 // a0_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x1', ` +#else .macro SAVE4x1 +#endif mr T1, CO @@ -3526,14 +3778,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x8_1', ` +#else .macro LOAD2x8_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -3556,9 +3816,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_I1', ` +#else .macro KERNEL2x8_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -3602,9 +3870,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_1', ` +#else .macro KERNEL2x8_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -3648,9 +3924,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_2', ` +#else .macro KERNEL2x8_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -3694,9 +3978,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_E2', ` +#else .macro KERNEL2x8_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -3718,9 +4010,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUBI1', ` +#else .macro KERNEL2x8_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -3764,9 +4064,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUB1', ` +#else .macro KERNEL2x8_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -3810,9 +4118,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x8', ` +#else .macro SAVE2x8 +#endif mr T1, CO @@ -4357,14 +4673,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x4_1', ` +#else .macro LOAD2x4_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4383,9 +4707,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_I1', ` +#else .macro KERNEL2x4_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -4417,9 +4749,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_1', ` +#else .macro KERNEL2x4_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -4451,9 +4791,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_2', ` +#else .macro KERNEL2x4_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4485,9 +4833,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_E2', ` +#else .macro KERNEL2x4_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -4501,9 +4857,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUBI1', ` +#else .macro KERNEL2x4_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4535,9 +4899,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUB1', ` +#else .macro KERNEL2x4_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4569,9 +4941,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x4', ` +#else .macro SAVE2x4 +#endif mr T1, CO @@ -4852,14 +5232,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x2_1', ` +#else .macro LOAD2x2_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4876,9 +5264,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_I1', ` +#else .macro KERNEL2x2_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -4904,9 +5300,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_1', ` +#else .macro KERNEL2x2_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -4932,9 +5336,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_2', ` +#else .macro KERNEL2x2_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4960,9 +5372,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_E2', ` +#else .macro KERNEL2x2_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -4972,9 +5392,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUBI1', ` +#else .macro KERNEL2x2_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5000,9 +5428,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUB1', ` +#else .macro KERNEL2x2_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5028,9 +5464,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x2', ` +#else .macro SAVE2x2 +#endif mr T1, CO @@ -5179,14 +5623,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x1_1', ` +#else .macro LOAD2x1_1 +#endif lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i @@ -5205,9 +5657,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_I1', ` +#else .macro KERNEL2x1_I1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -5239,9 +5699,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs39, vs1, vs10 // a0_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_1', ` +#else .macro KERNEL2x1_1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -5273,9 +5741,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs1, vs10 // a0_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_2', ` +#else .macro KERNEL2x1_2 +#endif lxsspx vs0, o0, AO // load a0_r @@ -5307,9 +5783,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs5, vs18 // a4_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_E2', ` +#else .macro KERNEL2x1_E2 +#endif xsmaddadp vs32, vs4, vs16 // a4_r*b0_r @@ -5323,9 +5807,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs5, vs18 // a4_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUBI1', ` +#else .macro KERNEL2x1_SUBI1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -5357,9 +5849,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmuldp vs39, vs1, vs10 // a0_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUB1', ` +#else .macro KERNEL2x1_SUB1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -5391,9 +5891,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs1, vs10 // a0_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x1', ` +#else .macro SAVE2x1 +#endif mr T1, CO @@ -5482,14 +5990,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x8_1', ` +#else .macro LOAD1x8_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5514,9 +6030,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_I1', ` +#else .macro KERNEL1x8_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -5553,9 +6077,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_1', ` +#else .macro KERNEL1x8_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -5592,9 +6124,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_2', ` +#else .macro KERNEL1x8_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5631,9 +6171,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_E2', ` +#else .macro KERNEL1x8_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -5646,9 +6194,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUBI1', ` +#else .macro KERNEL1x8_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5685,9 +6241,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUB1', ` +#else .macro KERNEL1x8_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5724,9 +6288,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x8', ` +#else .macro SAVE1x8 +#endif mr T1, CO @@ -6001,14 +6573,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x4_1', ` +#else .macro LOAD1x4_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -6029,9 +6609,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_I1', ` +#else .macro KERNEL1x4_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -6060,9 +6648,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_1', ` +#else .macro KERNEL1x4_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -6091,9 +6687,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_2', ` +#else .macro KERNEL1x4_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -6122,9 +6726,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_E2', ` +#else .macro KERNEL1x4_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -6133,9 +6745,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUBI1', ` +#else .macro KERNEL1x4_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -6164,9 +6784,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUB1', ` +#else .macro KERNEL1x4_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -6195,9 +6823,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x4', ` +#else .macro SAVE1x4 +#endif mr T1, CO @@ -6340,14 +6976,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x2_1', ` +#else .macro LOAD1x2_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -6366,9 +7010,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_I1', ` +#else .macro KERNEL1x2_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -6393,9 +7045,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_1', ` +#else .macro KERNEL1x2_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -6420,9 +7080,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_2', ` +#else .macro KERNEL1x2_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -6447,18 +7115,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_E2', ` +#else .macro KERNEL1x2_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUBI1', ` +#else .macro KERNEL1x2_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -6483,9 +7167,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUB1', ` +#else .macro KERNEL1x2_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -6510,9 +7202,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x2', ` +#else .macro SAVE1x2 +#endif mr T1, CO @@ -6589,14 +7289,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x1_1', ` +#else .macro LOAD1x1_1 +#endif lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i @@ -6610,9 +7318,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_I1', ` +#else .macro KERNEL1x1_I1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -6634,9 +7350,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs35, vs1, vs8 // a0_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_1', ` +#else .macro KERNEL1x1_1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -6658,9 +7382,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs1, vs8 // a0_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_2', ` +#else .macro KERNEL1x1_2 +#endif lxsspx vs0, o0, AO // load a0_r @@ -6682,9 +7414,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs5, vs16 // a4_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_E2', ` +#else .macro KERNEL1x1_E2 +#endif xsmaddadp vs32, vs4, vs16 // a4_r*b0_r @@ -6693,9 +7433,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmaddadp vs35, vs5, vs16 // a4_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUBI1', ` +#else .macro KERNEL1x1_SUBI1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -6717,9 +7465,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs35, vs1, vs8 // a0_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUB1', ` +#else .macro KERNEL1x1_SUB1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -6741,9 +7497,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs1, vs8 // a0_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x1', ` +#else .macro SAVE1x1 +#endif mr T1, CO @@ -6790,5 +7554,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/dasum_microk_power8.c b/kernel/power/dasum_microk_power8.c index 880d7d271..4652fc57c 100644 --- a/kernel/power/dasum_microk_power8.c +++ b/kernel/power/dasum_microk_power8.c @@ -68,10 +68,10 @@ static double dasum_kernel_16 (long n, double *x) "addi %2, %2, 128 \n\t" "addic. %1, %1, -16 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvabsdp 48, 40 \n\t" "xvabsdp 49, 41 \n\t" @@ -108,9 +108,9 @@ static double dasum_kernel_16 (long n, double *x) "xvadddp 38, 38, %x5 \n\t" "xvadddp 39, 39, %x6 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvabsdp 48, 40 \n\t" "xvabsdp 49, 41 \n\t" @@ -140,7 +140,7 @@ static double dasum_kernel_16 (long n, double *x) "xvadddp 32, 32, 36 \n\t" - "xxswapd 33, 32 \n\t" + XXSWAPD_S(33,32) "xsadddp %x0, 32, 33 \n" "#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n" diff --git a/kernel/power/daxpy_microk_power8.c b/kernel/power/daxpy_microk_power8.c index fb714a3f9..a92026e83 100644 --- a/kernel/power/daxpy_microk_power8.c +++ b/kernel/power/daxpy_microk_power8.c @@ -58,7 +58,7 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha) __asm__ ( - "xxspltd %x4, %x22, 0 \n\t" + XXSPLTD_S(%x4,%x22,0) "dcbt 0, %2 \n\t" "dcbt 0, %3 \n\t" @@ -90,10 +90,10 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha) "addi %3, %3, -64 \n\t" "addic. %1, %1, -16 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" ".align 5 \n" - "1: \n\t" + "one%=: \n\t" "xvmaddadp %x13, %x5, %x4 \n\t" "xvmaddadp %x14, %x6, %x4 \n\t" @@ -152,9 +152,9 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha) "addi %3, %3, -64 \n\t" "addic. %1, %1, -16 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvmaddadp %x13, %x5, %x4 \n\t" "xvmaddadp %x14, %x6, %x4 \n\t" diff --git a/kernel/power/dcopy_microk_power8.c b/kernel/power/dcopy_microk_power8.c index 261dc04de..b51a21d08 100644 --- a/kernel/power/dcopy_microk_power8.c +++ b/kernel/power/dcopy_microk_power8.c @@ -62,10 +62,10 @@ static void dcopy_kernel_32 (long n, double *x, double *y) "addi %2, %2, 128 \n\t" "addic. %1, %1, -32 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "stxvd2x 32, 0, %3 \n\t" "stxvd2x 33, %5, %3 \n\t" @@ -108,9 +108,9 @@ static void dcopy_kernel_32 (long n, double *x, double *y) "addi %2, %2, 128 \n\t" "addic. 
%1, %1, -32 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "stxvd2x 32, 0, %3 \n\t" "stxvd2x 33, %5, %3 \n\t" diff --git a/kernel/power/ddot_microk_power8.c b/kernel/power/ddot_microk_power8.c index 4e6bc29c9..d2518ef7e 100644 --- a/kernel/power/ddot_microk_power8.c +++ b/kernel/power/ddot_microk_power8.c @@ -78,10 +78,10 @@ static double ddot_kernel_8 (long n, double *x, double *y) "addi %3, %3, 128 \n\t" "addic. %1, %1, -16 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvmaddadp 32, 40, 48 \n\t" "lxvd2x 40, 0, %2 \n\t" @@ -112,9 +112,9 @@ static double ddot_kernel_8 (long n, double *x, double *y) "addi %3, %3, 128 \n\t" "addic. %1, %1, -16 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvmaddadp 32, 40, 48 \n\t" "xvmaddadp 33, 41, 49 \n\t" @@ -135,7 +135,7 @@ static double ddot_kernel_8 (long n, double *x, double *y) "xvadddp 32, 32, 36 \n\t" - "xxswapd 33, 32 \n\t" + XXSWAPD_S(33,32) "xsadddp %x0, 32, 33 \n" diff --git a/kernel/power/dgemm_macros_16x4_power8.S b/kernel/power/dgemm_macros_16x4_power8.S index 5be517f7c..782425fbd 100644 --- a/kernel/power/dgemm_macros_16x4_power8.S +++ b/kernel/power/dgemm_macros_16x4_power8.S @@ -37,7 +37,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=4, M=16 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD4x16_1', ` +#else .macro LOAD4x16_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -58,10 +62,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 128 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_I1', ` +#else .macro KERNEL4x16_I1 +#endif xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 @@ -125,11 +137,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 128 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_1', ` +#else .macro KERNEL4x16_1 +#endif xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 @@ -194,9 +214,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 128 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_2', ` +#else .macro KERNEL4x16_2 +#endif xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 @@ -260,9 +288,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 128 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_L1', ` +#else .macro KERNEL4x16_L1 +#endif xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 @@ -326,9 +362,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 128 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_L2', ` +#else .macro KERNEL4x16_L2 +#endif xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 @@ -392,10 +436,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs63, vs15, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_E2', ` +#else .macro KERNEL4x16_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -434,9 +486,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs62, vs14, vs31 xvmaddadp vs63, vs15, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_SUBI1', ` +#else .macro KERNEL4x16_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -495,9 +555,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs62, vs6, vs27 xvmuldp vs63, vs7, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_SUB1', ` +#else .macro KERNEL4x16_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -555,9 +623,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs62, vs6, vs27 xvmaddadp vs63, vs7, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x16', ` +#else .macro SAVE4x16 +#endif add T2, CO, LDC @@ -680,13 +756,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs39, o112, T4 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=8 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD4x8_1', ` +#else .macro LOAD4x8_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -703,9 +787,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_I1', ` +#else .macro KERNEL4x8_I1 +#endif xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 @@ -744,9 +836,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_1', ` +#else .macro KERNEL4x8_1 +#endif xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 @@ -784,9 +884,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_2', ` +#else .macro KERNEL4x8_2 +#endif xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 @@ -824,9 +932,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_E2', ` +#else .macro KERNEL4x8_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -849,9 +965,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs58, vs10, vs31 xvmaddadp vs59, vs11, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUBI1', ` +#else .macro KERNEL4x8_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -887,9 +1011,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs58, vs2, vs27 xvmuldp vs59, vs3, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUB1', ` +#else .macro KERNEL4x8_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -925,9 +1057,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs58, vs2, vs27 xvmaddadp vs59, vs3, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x8', ` +#else .macro SAVE4x8 +#endif mr T1, CO @@ -1035,13 +1175,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=4 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD4x4_1', ` +#else .macro LOAD4x4_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1054,9 +1202,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_I1', ` +#else .macro KERNEL4x4_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -1082,9 +1238,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs56, vs0, vs27 xvmuldp vs57, vs1, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_1', ` +#else .macro KERNEL4x4_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -1110,9 +1274,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs0, vs27 xvmaddadp vs57, vs1, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_2', ` +#else .macro KERNEL4x4_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1138,9 +1310,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs8, vs31 xvmaddadp vs57, vs9, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_E2', ` +#else .macro KERNEL4x4_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -1155,9 +1335,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs8, vs31 xvmaddadp vs57, vs9, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUBI1', ` +#else .macro KERNEL4x4_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1183,9 +1371,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs56, vs0, vs27 xvmuldp vs57, vs1, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUB1', ` +#else .macro KERNEL4x4_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1211,9 +1407,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs0, vs27 xvmaddadp vs57, vs1, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x4', ` +#else .macro SAVE4x4 +#endif mr T1, CO @@ -1289,13 +1493,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=2 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD4x2_1', ` +#else .macro LOAD4x2_1 +#endif lxvd2x vs0, 0, AO @@ -1307,9 +1519,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_I1', ` +#else .macro KERNEL4x2_I1 +#endif lxvd2x vs8, 0, AO @@ -1330,9 +1550,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_1', ` +#else .macro KERNEL4x2_1 +#endif lxvd2x vs8, 0, AO @@ -1353,9 +1581,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_2', ` +#else .macro KERNEL4x2_2 +#endif lxvd2x vs0, 0, AO @@ -1376,9 +1612,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs8, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_E2', ` +#else .macro KERNEL4x2_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -1389,9 +1633,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs8, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUBI1', ` +#else .macro KERNEL4x2_SUBI1 +#endif lxvd2x vs0, 0, AO @@ -1412,9 +1664,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUB1', ` +#else .macro KERNEL4x2_SUB1 +#endif lxvd2x vs0, 0, AO @@ -1435,9 +1695,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x2', ` +#else .macro SAVE4x2 +#endif mr T1, CO @@ -1497,13 +1765,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=1 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD4x1_1', ` +#else .macro LOAD4x1_1 +#endif lxsdx vs0, 0, AO @@ -1515,9 +1791,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_I1', ` +#else .macro KERNEL4x1_I1 +#endif lxsdx vs8, 0, AO @@ -1538,9 +1822,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_1', ` +#else .macro KERNEL4x1_1 +#endif lxsdx vs8, 0, AO @@ -1561,9 +1853,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_2', ` +#else .macro KERNEL4x1_2 +#endif lxsdx vs0, 0, AO @@ -1584,9 +1884,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs56, vs8, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_E2', ` +#else .macro KERNEL4x1_E2 +#endif xsmaddadp vs32, vs8, vs28 @@ -1597,9 +1905,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs56, vs8, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUBI1', ` +#else .macro KERNEL4x1_SUBI1 +#endif lxsdx vs0, 0, AO @@ -1620,9 +1936,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUB1', ` +#else .macro KERNEL4x1_SUB1 +#endif lxsdx vs0, 0, AO @@ -1643,9 +1967,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmaddadp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x1', ` +#else .macro SAVE4x1 +#endif mr T1, CO @@ -1705,13 +2037,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=2, M=16 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD2x16_1', ` +#else .macro LOAD2x16_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1731,9 +2071,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_I1', ` +#else .macro KERNEL2x16_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -1772,9 +2120,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs46, vs6, vs25 xvmuldp vs47, vs7, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_1', ` +#else .macro KERNEL2x16_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -1813,9 +2169,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs46, vs6, vs25 xvmaddadp vs47, vs7, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_2', ` +#else .macro KERNEL2x16_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1854,9 +2218,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs46, vs14, vs29 xvmaddadp vs47, vs15, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_E2', ` +#else .macro KERNEL2x16_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -1877,9 +2249,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs46, vs14, vs29 xvmaddadp vs47, vs15, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_SUBI1', ` +#else .macro KERNEL2x16_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1918,9 +2298,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs46, vs6, vs25 xvmuldp vs47, vs7, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_SUB1', ` +#else .macro KERNEL2x16_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1959,9 +2347,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs46, vs6, vs25 xvmaddadp vs47, vs7, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x16', ` +#else .macro SAVE2x16 +#endif mr T1, CO addi T2, T1, 64 @@ -2055,13 +2451,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=8 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD2x8_1', ` +#else .macro LOAD2x8_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2074,9 +2478,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_I1', ` +#else .macro KERNEL2x8_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2100,9 +2512,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmuldp vs42, vs2, vs25 xvmuldp vs43, vs3, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_1', ` +#else .macro KERNEL2x8_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2126,9 +2546,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_2', ` +#else .macro KERNEL2x8_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2152,9 +2580,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs42, vs10, vs29 xvmaddadp vs43, vs11, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_E2', ` +#else .macro KERNEL2x8_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -2167,9 +2603,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs42, vs10, vs29 xvmaddadp vs43, vs11, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUBI1', ` +#else .macro KERNEL2x8_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2193,9 +2637,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs42, vs2, vs25 xvmuldp vs43, vs3, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUB1', ` +#else .macro KERNEL2x8_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2219,9 +2671,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x8', ` +#else .macro SAVE2x8 +#endif mr T1, CO @@ -2277,13 +2737,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=2, M=4 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD2x4_1', ` +#else .macro LOAD2x4_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2294,9 +2762,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_I1', ` +#else .macro KERNEL2x4_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2314,9 +2790,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_1', ` +#else .macro KERNEL2x4_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2334,9 +2818,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_2', ` +#else .macro KERNEL2x4_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2354,9 +2846,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_E2', ` +#else .macro KERNEL2x4_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -2365,9 +2865,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUBI1', ` +#else .macro KERNEL2x4_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2385,9 +2893,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUB1', ` +#else .macro KERNEL2x4_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2405,9 +2921,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x4', ` +#else .macro SAVE2x4 +#endif mr T1, CO @@ -2447,13 +2971,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=2, M=2 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD2x2_1', ` +#else .macro LOAD2x2_1 +#endif lxvd2x vs0, 0, AO @@ -2463,9 +2995,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_I1', ` +#else .macro KERNEL2x2_I1 +#endif lxvd2x vs8, 0, AO @@ -2480,9 +3020,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_1', ` +#else .macro KERNEL2x2_1 +#endif lxvd2x vs8, 0, AO @@ -2497,9 +3045,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_2', ` +#else .macro KERNEL2x2_2 +#endif lxvd2x vs0, 0, AO @@ -2514,18 +3070,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs8, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_E2', ` +#else .macro KERNEL2x2_E2 +#endif xvmaddadp vs32, vs8, vs28 xvmaddadp vs40, vs8, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUBI1', ` +#else .macro KERNEL2x2_SUBI1 +#endif lxvd2x vs0, 0, AO @@ -2540,9 +3112,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUB1', ` +#else .macro KERNEL2x2_SUB1 +#endif lxvd2x vs0, 0, AO @@ -2557,9 +3137,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x2', ` +#else .macro SAVE2x2 +#endif mr T1, CO @@ -2591,13 +3179,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=2, M=1 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD2x1_1', ` +#else .macro LOAD2x1_1 +#endif lxsdx vs0, 0, AO @@ -2607,9 +3203,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 8 addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_I1', ` +#else .macro KERNEL2x1_I1 +#endif lxsdx vs8, 0, AO @@ -2624,9 +3228,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_1', ` +#else .macro KERNEL2x1_1 +#endif lxsdx vs8, 0, AO @@ -2641,9 +3253,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_2', ` +#else .macro KERNEL2x1_2 +#endif lxsdx vs0, 0, AO @@ -2658,18 +3278,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs40, vs8, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_E2', ` +#else .macro KERNEL2x1_E2 +#endif xsmaddadp vs32, vs8, vs28 xsmaddadp vs40, vs8, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUBI1', ` +#else .macro KERNEL2x1_SUBI1 +#endif lxsdx vs0, 0, AO @@ -2684,9 +3320,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUB1', ` +#else .macro KERNEL2x1_SUB1 +#endif lxsdx vs0, 0, AO @@ -2701,9 +3345,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x1', ` +#else .macro SAVE2x1 +#endif mr T1, CO @@ -2735,13 +3387,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=1, M=16 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD1x16_1', ` +#else .macro LOAD1x16_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2760,9 +3420,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_I1', ` +#else .macro KERNEL1x16_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2791,9 +3459,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs38, vs6, vs24 xvmuldp vs39, vs7, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_1', ` +#else .macro KERNEL1x16_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2822,9 +3498,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs38, vs6, vs24 xvmaddadp vs39, vs7, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_2', ` +#else .macro KERNEL1x16_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2853,9 +3537,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs38, vs14, vs28 xvmaddadp vs39, vs15, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_E2', ` +#else .macro KERNEL1x16_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -2867,9 +3559,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs38, vs14, vs28 xvmaddadp vs39, vs15, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_SUBI1', ` +#else .macro KERNEL1x16_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2898,9 +3598,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs38, vs6, vs24 xvmuldp vs39, vs7, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_SUB1', ` +#else .macro KERNEL1x16_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2929,9 +3637,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs38, vs6, vs24 xvmaddadp vs39, vs7, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x16', ` +#else .macro SAVE1x16 +#endif mr T1, CO addi T2, T1, 64 @@ -2980,13 +3696,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=8 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD1x8_1', ` +#else .macro LOAD1x8_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2998,9 +3722,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_I1', ` +#else .macro KERNEL1x8_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -3018,9 +3750,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_1', ` +#else .macro KERNEL1x8_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -3038,9 +3778,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_2', ` +#else .macro KERNEL1x8_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3058,9 +3806,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_E2', ` +#else .macro KERNEL1x8_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -3068,9 +3824,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUBI1', ` +#else .macro KERNEL1x8_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3088,9 +3852,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUB1', ` +#else .macro KERNEL1x8_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3108,9 +3880,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x8', ` +#else .macro SAVE1x8 +#endif mr T1, CO @@ -3140,13 +3920,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=1, M=4 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD1x4_1', ` +#else .macro LOAD1x4_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3156,9 +3944,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_I1', ` +#else .macro KERNEL1x4_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -3172,9 +3968,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_1', ` +#else .macro KERNEL1x4_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -3188,9 +3992,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_2', ` +#else .macro KERNEL1x4_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3204,17 +4016,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_E2', ` +#else .macro KERNEL1x4_E2 +#endif xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUBI1', ` +#else .macro KERNEL1x4_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3228,9 +4056,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUB1', ` +#else .macro KERNEL1x4_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3244,9 +4080,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x4', ` +#else .macro SAVE1x4 +#endif mr T1, CO @@ -3268,13 +4112,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=1, M=2 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD1x2_1', ` +#else .macro LOAD1x2_1 +#endif lxvd2x vs0, 0, AO @@ -3283,9 +4135,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_I1', ` +#else .macro KERNEL1x2_I1 +#endif lxvd2x vs8, 0, AO @@ -3297,9 +4157,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_1', ` +#else .macro KERNEL1x2_1 +#endif lxvd2x vs8, 0, AO @@ -3311,9 +4179,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_2', ` +#else .macro KERNEL1x2_2 +#endif lxvd2x vs0, 0, AO @@ -3325,16 +4201,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs32, vs8, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_E2', ` +#else .macro KERNEL1x2_E2 +#endif xvmaddadp vs32, vs8, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUBI1', ` +#else .macro KERNEL1x2_SUBI1 +#endif lxvd2x vs0, 0, AO @@ -3346,9 +4238,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUB1', ` +#else .macro KERNEL1x2_SUB1 +#endif lxvd2x vs0, 0, AO @@ -3360,9 +4260,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x2', ` +#else .macro SAVE1x2 +#endif mr T1, CO @@ -3380,13 +4288,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=1, M=1 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD1x1_1', ` +#else .macro LOAD1x1_1 +#endif lxsdx vs0, 0, AO @@ -3395,9 +4311,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_I1', ` +#else .macro KERNEL1x1_I1 +#endif lxsdx vs8, 0, AO @@ -3409,9 +4333,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_1', ` +#else .macro KERNEL1x1_1 +#endif lxsdx vs8, 0, AO @@ -3423,9 +4355,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_2', ` +#else .macro KERNEL1x1_2 +#endif lxsdx vs0, 0, AO @@ -3437,16 +4377,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs32, vs8, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_E2', ` +#else .macro KERNEL1x1_E2 +#endif xsmaddadp vs32, vs8, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUBI1', ` +#else .macro KERNEL1x1_SUBI1 +#endif lxsdx vs0, 0, AO @@ -3458,9 +4414,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUB1', ` +#else .macro KERNEL1x1_SUB1 +#endif lxsdx vs0, 0, AO @@ -3472,9 +4436,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x1', ` +#else .macro SAVE1x1 +#endif mr T1, CO @@ -3492,5 +4464,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
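Every macro in this file (and in the copy and trmm macro files further down) receives the same treatment: the body is kept once, bracketed so that non-AIX builds still see a GNU-as ".macro"/".endm" pair while AIX builds see an m4 "define(`NAME', `...')", presumably because the AIX assembler does not accept the GNU-style .macro directive. A minimal sketch of the resulting shape, using a made-up macro name (FOO_1x1) rather than one of the patch's own:

#if defined(_AIX)
define(`FOO_1x1', `
#else
.macro FOO_1x1
#endif

	lxsdx		vs0, 0, AO
	xsmaddadp	vs32, vs0, vs24

#if defined(_AIX)
')
#else
.endm
#endif

After the C preprocessor has resolved the #if, the non-AIX source assembles as an ordinary GNU-as macro, while the AIX source is expected to be run through m4 so that the define() is expanded in place; either way the existing call sites, a bare FOO_1x1 on a line of its own, stay unchanged.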
addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/dgemm_ncopy_macros_4_power8.S b/kernel/power/dgemm_ncopy_macros_4_power8.S index 8d6744b91..33d02c77d 100644 --- a/kernel/power/dgemm_ncopy_macros_4_power8.S +++ b/kernel/power/dgemm_ncopy_macros_4_power8.S @@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=4 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x16', ` +#else .macro COPY_4x16 +#endif lxvd2x vs0, o0, A0 lxvd2x vs1, o0, A1 @@ -180,14 +184,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x8', ` +#else .macro COPY_4x8 +#endif lxvd2x vs0, o0, A0 lxvd2x vs1, o16, A0 @@ -259,14 +271,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x4', ` +#else .macro COPY_4x4 +#endif lxvd2x vs0, o0, A0 lxvd2x vs1, o16, A0 @@ -310,14 +330,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x2', ` +#else .macro COPY_4x2 +#endif lxvd2x vs0, o0, A0 addi A0, A0, 16 @@ -348,14 +376,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x1', ` +#else .macro COPY_4x1 +#endif lxsdx vs0, o0, A0 addi A0, A0, 8 @@ -382,14 +418,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x16', ` +#else .macro COPY_2x16 +#endif lxvd2x vs0, o0, A0 lxvd2x vs1, o16, A0 @@ -459,14 +503,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi BO, BO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x8', ` +#else .macro COPY_2x8 +#endif lxvd2x vs0, o0, A0 lxvd2x vs1, o16, A0 @@ -506,14 +558,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x4', ` +#else .macro COPY_2x4 +#endif lxvd2x vs0, o0, A0 lxvd2x vs1, o16, A0 @@ -539,14 +599,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x2', ` +#else .macro COPY_2x2 +#endif lxvd2x vs0, o0, A0 addi A0, A0, 16 @@ -565,14 +633,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x1', ` +#else .macro COPY_2x1 +#endif lxsdx vs0, o0, A0 addi A0, A0, 8 @@ -589,14 +665,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x16', ` +#else .macro COPY_1x16 +#endif lxvd2x vs0, o0, A0 lxvd2x vs1, o16, A0 @@ -622,14 +706,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x8', ` +#else .macro COPY_1x8 +#endif lxvd2x vs0, o0, A0 lxvd2x vs1, o16, A0 @@ -645,14 +737,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x4', ` +#else .macro COPY_1x4 +#endif lxvd2x vs0, o0, A0 lxvd2x vs1, o16, A0 @@ -664,14 +764,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x2', ` +#else .macro COPY_1x2 +#endif lxvd2x vs0, o0, A0 addi A0, A0, 16 @@ -681,14 +789,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x1', ` +#else .macro COPY_1x1 +#endif lxsdx vs0, o0, A0 addi A0, A0, 8 @@ -698,5 +814,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/dgemm_tcopy_macros_16_power8.S b/kernel/power/dgemm_tcopy_macros_16_power8.S index 68e53bcf2..6c5b8ed62 100644 --- a/kernel/power/dgemm_tcopy_macros_16_power8.S +++ b/kernel/power/dgemm_tcopy_macros_16_power8.S @@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=4 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x16', ` +#else .macro COPY_4x16 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -140,14 +144,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs10, o32, T1 stxvd2x vs11, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x8', ` +#else .macro COPY_4x8 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -205,14 +217,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs46, o32, T1 stxvd2x vs47, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x4', ` +#else .macro COPY_4x4 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -250,14 +270,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs38, o32, T1 stxvd2x vs39, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x2', ` +#else .macro COPY_4x2 +#endif lxvd2x vs32, o0, A0 addi A0, A0, 16 @@ -285,14 +313,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stxvd2x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x1', ` +#else .macro COPY_4x1 +#endif lxsdx vs32, o0, A0 addi A0, A0, 8 @@ -322,14 +358,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsdx vs35, o8, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x16', ` +#else .macro COPY_2x16 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -383,14 +427,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs46, o32, T1 stxvd2x vs47, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x8', ` +#else .macro COPY_2x8 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -420,14 +472,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs38, o32, T1 stxvd2x vs39, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x4', ` +#else .macro COPY_2x4 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -447,14 +507,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x2', ` +#else .macro COPY_2x2 +#endif lxvd2x vs32, o0, A0 addi A0, A0, 16 @@ -470,14 +538,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs33, o16, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x1', ` +#else .macro COPY_2x1 +#endif lxsdx vs32, o0, A0 addi A0, A0, 8 @@ -493,14 +569,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsdx vs33, o8, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x16', ` +#else .macro COPY_1x16 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -528,14 +612,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stxvd2x vs38, o32, T1 stxvd2x vs39, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x8', ` +#else .macro COPY_1x8 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -551,14 +643,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x4', ` +#else .macro COPY_1x4 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -570,14 +670,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x2', ` +#else .macro COPY_1x2 +#endif lxvd2x vs32, o0, A0 addi A0, A0, 16 @@ -587,14 +695,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs32, o0, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x1', ` +#else .macro COPY_1x1 +#endif lxsdx vs32, o0, A0 addi A0, A0, 8 @@ -604,5 +720,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsdx vs32, o0, T1 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/dgemv_n_microk_power8.c b/kernel/power/dgemv_n_microk_power8.c index ae4fe9009..c2eb3968c 100644 --- a/kernel/power/dgemv_n_microk_power8.c +++ b/kernel/power/dgemv_n_microk_power8.c @@ -46,7 +46,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y ( "lxvd2x 34, 0, %10 \n\t" // x0, x1 "lxvd2x 35, %11, %10 \n\t" // x2, x3 - "xxspltd 32, %x9, 0 \n\t" // alpha, alpha + XXSPLTD_S(32,%x9,0) // alpha, alpha "sldi %6, %13, 3 \n\t" // lda * sizeof (double) @@ -56,10 +56,10 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y "add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda "add %6, %6, %6 \n\t" // 2 * lda - "xxspltd 32, 34, 0 \n\t" // x0 * alpha, x0 * alpha - "xxspltd 33, 34, 1 \n\t" // x1 * alpha, x1 * alpha - "xxspltd 34, 35, 0 \n\t" // x2 * alpha, x2 * alpha - "xxspltd 35, 35, 1 \n\t" // x3 * alpha, x3 * alpha + XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha + XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha + XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha + XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha "add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda "add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda @@ -89,10 +89,10 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y "addi %6, %6, 32 \n\t" "addic. 
%1, %1, -4 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "lxvd2x 36, 0, %2 \n\t" // y0, y1 "lxvd2x 37, %11, %2 \n\t" // y2, y3 @@ -131,7 +131,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y "addi %2, %2, 32 \n\t" "addic. %1, %1, -4 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" "lxvd2x 36, 0, %2 \n\t" // y0, y1 @@ -171,7 +171,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y "addi %2, %2, 32 \n\t" "addic. %1, %1, -4 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" "lxvd2x 36, 0, %2 \n\t" // y0, y1 @@ -211,7 +211,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y "addi %2, %2, 32 \n\t" "addic. %1, %1, -4 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" "lxvd2x 36, 0, %2 \n\t" // y0, y1 @@ -251,9 +251,9 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y "addi %2, %2, 32 \n\t" "addic. %1, %1, -4 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "lxvd2x 36, 0, %2 \n\t" // y0, y1 "lxvd2x 37, %11, %2 \n\t" // y2, y3 diff --git a/kernel/power/dgemv_t.c b/kernel/power/dgemv_t.c index b8589a131..ffe469d4d 100644 --- a/kernel/power/dgemv_t.c +++ b/kernel/power/dgemv_t.c @@ -93,11 +93,11 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do "li %[off],32 \n\t" - "ble- 2f \n\t" + "ble- two%= \n\t" //-------------------------------------------------- - ".p2align 5 \n\t" - "1: \n\t" + ".align 5 \n\t" + "one%=: \n\t" "xvmaddadp 34,36,32 \n\t" "xvmaddadp 35,38,32 \n\t" "addi %[off2], %[off2],32 \n\t" @@ -137,7 +137,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do "lxvd2x 49, %[a6], %[off2] \n\t" "lxvd2x 51, %[a7], %[off2] \n\t" "lxvd2x 33, %[x], %[off2] \n\t" - "ble- 2f \n\t" + "ble- two%= \n\t" "xvmaddadp 34,36,32 \n\t" "xvmaddadp 35,38,32 \n\t" "addi %[off2], %[off2],32 \n\t" @@ -177,7 +177,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do "lxvd2x 49, %[a6], %[off2] \n\t" "lxvd2x 51, %[a7], %[off2] \n\t" "lxvd2x 33, %[x], %[off2] \n\t" - "ble- 2f \n\t" + "ble- two%= \n\t" "xvmaddadp 34,36,32 \n\t" "xvmaddadp 35,38,32 \n\t" #if defined(PREFETCH) @@ -229,7 +229,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do "lxvd2x 33, %[x], %[off2] \n\t" "addic. 
%[n],%[n],-4 \n\t" - "ble- 2f \n\t" + "ble- two%= \n\t" "addi %[off2], %[off2],32 \n\t" #if defined(PREFETCH) @@ -288,9 +288,9 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do #if defined(PREFETCH) "dcbt %[temp],%[x] \n\t" #endif - "bgt+ 1b \n\t" - ".p2align 5 \n\t" - "2: \n\t" + "bgt+ one%= \n\t" + ".align 5 \n\t" + "two%=: \n\t" //-------------------------------------------- "xvmaddadp 34,36,32 \n\t" @@ -301,7 +301,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do "xvmaddadp 7,46,32 \n\t" "xvmaddadp 8,48,32 \n\t" "xvmaddadp 9,50,32 \n\t" - "xxspltd 36, %x[alpha], 0 \n\t" + XXSPLTD_S(36,%x[alpha],0) "xvmaddadp 34,37,33 \n\t" "xvmaddadp 35,39,33 \n\t" "xvmaddadp 4,41,33 \n\t" @@ -322,21 +322,21 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do - "xxmrgld 42,34,35 \n\t" - "xxmrghd 43,34,35 \n\t" + XXMRGLD_S(42,34,35) + XXMRGHD_S(43,34,35) - "xxmrgld 44,4,5 \n\t" - "xxmrghd 45,4,5 \n\t" + XXMRGLD_S(44,4,5) + XXMRGHD_S(45,4,5) "xvadddp 42,42,43 \n\t" - "xxmrgld 46,6,7 \n\t" - "xxmrghd 47,6,7 \n\t" + XXMRGLD_S(46,6,7) + XXMRGHD_S(47,6,7) "xvadddp 44,44,45 \n\t" - "xxmrgld 48,8,9 \n\t" - "xxmrghd 49,8,9 \n\t" + XXMRGLD_S(48,8,9) + XXMRGHD_S(49,8,9) "xvadddp 46,46,47 \n\t" diff --git a/kernel/power/drot_microk_power8.c b/kernel/power/drot_microk_power8.c index 016b7764d..259c08187 100644 --- a/kernel/power/drot_microk_power8.c +++ b/kernel/power/drot_microk_power8.c @@ -51,8 +51,8 @@ static void drot_kernel_16 (long n, double *x, double *y, double c, double s) __asm__ ( - "xxspltd 36, %x13, 0 \n\t" // load c to both dwords - "xxspltd 37, %x14, 0 \n\t" // load s to both dwords + XXSPLTD_S(36,%x13,0) // load c to both dwords + XXSPLTD_S(37,%x14,0) // load s to both dwords "lxvd2x 32, 0, %3 \n\t" // load x "lxvd2x 33, %15, %3 \n\t" @@ -68,10 +68,10 @@ static void drot_kernel_16 (long n, double *x, double *y, double c, double s) "addi %4, %4, 64 \n\t" "addic. %2, %2, -8 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvmuldp 40, 32, 36 \n\t" // c * x "xvmuldp 41, 33, 36 \n\t" @@ -135,9 +135,9 @@ static void drot_kernel_16 (long n, double *x, double *y, double c, double s) "addi %4, %4, 128 \n\t" "addic. %2, %2, -8 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvmuldp 40, 32, 36 \n\t" // c * x "xvmuldp 41, 33, 36 \n\t" diff --git a/kernel/power/dscal_microk_power8.c b/kernel/power/dscal_microk_power8.c index 04898eb3d..e9bacd05a 100644 --- a/kernel/power/dscal_microk_power8.c +++ b/kernel/power/dscal_microk_power8.c @@ -41,7 +41,7 @@ static void dscal_kernel_8 (long n, double *x, double alpha) ( "dcbt 0, %2 \n\t" - "xxspltd %x3, %x3, 0 \n\t" + XXSPLTD_S(%x3,%x3,0) "lxvd2x 32, 0, %2 \n\t" "lxvd2x 33, %4, %2 \n\t" @@ -55,10 +55,10 @@ static void dscal_kernel_8 (long n, double *x, double alpha) "addi %2, %2, 128 \n\t" "addic. %1, %1, -16 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvmuldp 40, 32, %x3 \n\t" "xvmuldp 41, 33, %x3 \n\t" @@ -91,9 +91,9 @@ static void dscal_kernel_8 (long n, double *x, double alpha) "addi %2, %2, 256 \n\t" "addic. 
%1, %1, -16 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvmuldp 40, 32, %x3 \n\t" "xvmuldp 41, 33, %x3 \n\t" @@ -146,8 +146,8 @@ static void dscal_kernel_8_zero (long n, double *x) ( "xxlxor %x3, %x3, %x3 \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "stxvd2x %x3, 0, %2 \n\t" "stxvd2x %x3, %4, %2 \n\t" @@ -161,7 +161,7 @@ static void dscal_kernel_8_zero (long n, double *x) "addi %2, %2, 128 \n\t" "addic. %1, %1, -16 \n\t" - "bgt 1b \n" + "bgt one%= \n" "#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10" : diff --git a/kernel/power/dswap_microk_power8.c b/kernel/power/dswap_microk_power8.c index 31eff3449..ecfd5c9f9 100644 --- a/kernel/power/dswap_microk_power8.c +++ b/kernel/power/dswap_microk_power8.c @@ -39,8 +39,8 @@ static void dswap_kernel_32 (long n, double *x, double *y) { __asm__ ( - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "lxvd2x 32, 0, %4 \n\t" "lxvd2x 33, %5, %4 \n\t" @@ -131,7 +131,7 @@ static void dswap_kernel_32 (long n, double *x, double *y) "addi %4, %4, 128 \n\t" "addic. %2, %2, -32 \n\t" - "bgt 1b \n" + "bgt one%= \n" "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" : diff --git a/kernel/power/dtrmm_macros_16x4_power8.S b/kernel/power/dtrmm_macros_16x4_power8.S index 079144a90..efb034594 100644 --- a/kernel/power/dtrmm_macros_16x4_power8.S +++ b/kernel/power/dtrmm_macros_16x4_power8.S @@ -37,7 +37,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=4, M=16 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD4x16_1', ` +#else .macro LOAD4x16_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -60,9 +64,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_I1', ` +#else .macro KERNEL4x16_I1 +#endif xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 @@ -127,9 +139,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_1', ` +#else .macro KERNEL4x16_1 +#endif xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 @@ -195,9 +215,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_2', ` +#else .macro KERNEL4x16_2 +#endif xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 @@ -262,9 +290,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_E2', ` +#else .macro KERNEL4x16_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -303,9 +339,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs62, vs14, vs31 xvmaddadp vs63, vs15, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_SUBI1', ` +#else .macro KERNEL4x16_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -364,9 +408,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
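The C micro-kernels changed above (dgemv_n, dgemv_t, drot, dscal, dswap) adjust their inline assembly in two ways for AIX: the VSX convenience mnemonics are now emitted through the XXSPLTD_S/XXMRGHD_S/XXMRGLD_S helpers instead of being written out literally, and the numeric local labels and ".p2align" directives ("1:", "ble 2f", ".p2align 5") give way to named labels suffixed with "%=" and a plain ".align", presumably because the AIX assembler is stricter about the GNU-specific forms. GCC expands "%=" to a number that is unique to each asm statement, so the generated label names cannot collide even if a kernel ends up instantiated more than once in a translation unit. A minimal, self-contained sketch of that label style (hypothetical code for illustration, not taken from the patch):

#include <stdio.h>

/* Sum n doubles (n >= 1) using a "%="-suffixed loop label, the style the
 * kernels switch to, instead of the old numeric "1:" / "bgt 1b" labels. */
static double dsum_sketch (long n, const double *x)
{
    double s = 0.0, t;
    __asm__ __volatile__
    (
        ".align  5              \n"     /* the patch likewise uses .align in place of .p2align */
        "one%=:                 \n\t"
        "lfd     %3, 0(%1)      \n\t"   /* t  = *x  */
        "fadd    %0, %0, %3     \n\t"   /* s += t   */
        "addi    %1, %1, 8      \n\t"   /* x++      */
        "addi    %2, %2, -1     \n\t"   /* n--      */
        "cmpdi   %2, 0          \n\t"
        "bgt     one%=          \n\t"
        : "+f" (s), "+b" (x), "+r" (n), "=&f" (t)
        :
        : "cr0", "memory"
    );
    return s;
}

int main (void)
{
    double v[4] = { 1.0, 2.0, 3.0, 4.0 };
    printf ("%f\n", dsum_sketch (4, v));   /* prints 10.000000 */
    return 0;
}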
xvmuldp vs62, vs6, vs27 xvmuldp vs63, vs7, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_SUB1', ` +#else .macro KERNEL4x16_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -425,9 +477,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs62, vs6, vs27 xvmaddadp vs63, vs7, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x16', ` +#else .macro SAVE4x16 +#endif mr T1, CO addi T2, T1, 64 @@ -615,13 +675,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=8 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD4x8_1', ` +#else .macro LOAD4x8_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -638,9 +706,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_I1', ` +#else .macro KERNEL4x8_I1 +#endif xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 @@ -679,9 +755,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_1', ` +#else .macro KERNEL4x8_1 +#endif xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 @@ -719,9 +803,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_2', ` +#else .macro KERNEL4x8_2 +#endif xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 @@ -759,9 +851,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_E2', ` +#else .macro KERNEL4x8_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -784,9 +884,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs58, vs10, vs31 xvmaddadp vs59, vs11, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUBI1', ` +#else .macro KERNEL4x8_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -822,9 +930,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs58, vs2, vs27 xvmuldp vs59, vs3, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUB1', ` +#else .macro KERNEL4x8_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -860,9 +976,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs58, vs2, vs27 xvmaddadp vs59, vs3, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x8', ` +#else .macro SAVE4x8 +#endif mr T1, CO @@ -970,13 +1094,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=4 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD4x4_1', ` +#else .macro LOAD4x4_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -989,9 +1121,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 32 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_I1', ` +#else .macro KERNEL4x4_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -1017,9 +1157,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs56, vs0, vs27 xvmuldp vs57, vs1, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_1', ` +#else .macro KERNEL4x4_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -1045,9 +1193,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs0, vs27 xvmaddadp vs57, vs1, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_2', ` +#else .macro KERNEL4x4_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1073,9 +1229,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs8, vs31 xvmaddadp vs57, vs9, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_E2', ` +#else .macro KERNEL4x4_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -1090,9 +1254,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs8, vs31 xvmaddadp vs57, vs9, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUBI1', ` +#else .macro KERNEL4x4_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1118,9 +1290,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs56, vs0, vs27 xvmuldp vs57, vs1, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUB1', ` +#else .macro KERNEL4x4_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1146,9 +1326,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs0, vs27 xvmaddadp vs57, vs1, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x4', ` +#else .macro SAVE4x4 +#endif mr T1, CO @@ -1224,13 +1412,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=2 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD4x2_1', ` +#else .macro LOAD4x2_1 +#endif lxvd2x vs0, 0, AO @@ -1242,9 +1438,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_I1', ` +#else .macro KERNEL4x2_I1 +#endif lxvd2x vs8, 0, AO @@ -1265,9 +1469,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_1', ` +#else .macro KERNEL4x2_1 +#endif lxvd2x vs8, 0, AO @@ -1288,9 +1500,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_2', ` +#else .macro KERNEL4x2_2 +#endif lxvd2x vs0, 0, AO @@ -1311,9 +1531,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs56, vs8, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_E2', ` +#else .macro KERNEL4x2_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -1324,9 +1552,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs8, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUBI1', ` +#else .macro KERNEL4x2_SUBI1 +#endif lxvd2x vs0, 0, AO @@ -1347,9 +1583,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUB1', ` +#else .macro KERNEL4x2_SUB1 +#endif lxvd2x vs0, 0, AO @@ -1370,9 +1614,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x2', ` +#else .macro SAVE4x2 +#endif mr T1, CO @@ -1432,13 +1684,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=1 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD4x1_1', ` +#else .macro LOAD4x1_1 +#endif lxsdx vs0, 0, AO @@ -1450,9 +1710,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_I1', ` +#else .macro KERNEL4x1_I1 +#endif lxsdx vs8, 0, AO @@ -1473,9 +1741,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_1', ` +#else .macro KERNEL4x1_1 +#endif lxsdx vs8, 0, AO @@ -1496,9 +1772,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_2', ` +#else .macro KERNEL4x1_2 +#endif lxsdx vs0, 0, AO @@ -1519,9 +1803,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs56, vs8, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_E2', ` +#else .macro KERNEL4x1_E2 +#endif xsmaddadp vs32, vs8, vs28 @@ -1532,9 +1824,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs56, vs8, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUBI1', ` +#else .macro KERNEL4x1_SUBI1 +#endif lxsdx vs0, 0, AO @@ -1555,9 +1855,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUB1', ` +#else .macro KERNEL4x1_SUB1 +#endif lxsdx vs0, 0, AO @@ -1578,9 +1886,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x1', ` +#else .macro SAVE4x1 +#endif mr T1, CO @@ -1640,13 +1956,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=2, M=16 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD2x16_1', ` +#else .macro LOAD2x16_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1666,9 +1990,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_I1', ` +#else .macro KERNEL2x16_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -1707,9 +2039,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs46, vs6, vs25 xvmuldp vs47, vs7, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_1', ` +#else .macro KERNEL2x16_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -1748,9 +2088,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs46, vs6, vs25 xvmaddadp vs47, vs7, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_2', ` +#else .macro KERNEL2x16_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1789,9 +2137,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs46, vs14, vs29 xvmaddadp vs47, vs15, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_E2', ` +#else .macro KERNEL2x16_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -1812,9 +2168,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs46, vs14, vs29 xvmaddadp vs47, vs15, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_SUBI1', ` +#else .macro KERNEL2x16_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1853,9 +2217,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs46, vs6, vs25 xvmuldp vs47, vs7, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_SUB1', ` +#else .macro KERNEL2x16_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1894,9 +2266,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs46, vs6, vs25 xvmaddadp vs47, vs7, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x16', ` +#else .macro SAVE2x16 +#endif mr T1, CO addi T2, T1, 64 @@ -1990,13 +2370,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=8 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD2x8_1', ` +#else .macro LOAD2x8_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2009,9 +2397,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_I1', ` +#else .macro KERNEL2x8_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2035,9 +2431,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmuldp vs42, vs2, vs25 xvmuldp vs43, vs3, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_1', ` +#else .macro KERNEL2x8_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2061,9 +2465,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_2', ` +#else .macro KERNEL2x8_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2087,9 +2499,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs42, vs10, vs29 xvmaddadp vs43, vs11, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_E2', ` +#else .macro KERNEL2x8_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -2102,9 +2522,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs42, vs10, vs29 xvmaddadp vs43, vs11, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUBI1', ` +#else .macro KERNEL2x8_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2128,9 +2556,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs42, vs2, vs25 xvmuldp vs43, vs3, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUB1', ` +#else .macro KERNEL2x8_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2154,9 +2590,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x8', ` +#else .macro SAVE2x8 +#endif mr T1, CO @@ -2212,13 +2656,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=2, M=4 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD2x4_1', ` +#else .macro LOAD2x4_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2229,9 +2681,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_I1', ` +#else .macro KERNEL2x4_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2249,9 +2709,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_1', ` +#else .macro KERNEL2x4_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2269,9 +2737,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_2', ` +#else .macro KERNEL2x4_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2289,9 +2765,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_E2', ` +#else .macro KERNEL2x4_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -2300,9 +2784,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUBI1', ` +#else .macro KERNEL2x4_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2320,9 +2812,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUB1', ` +#else .macro KERNEL2x4_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2340,9 +2840,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x4', ` +#else .macro SAVE2x4 +#endif mr T1, CO @@ -2382,13 +2890,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=2, M=2 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD2x2_1', ` +#else .macro LOAD2x2_1 +#endif lxvd2x vs0, 0, AO @@ -2398,9 +2914,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_I1', ` +#else .macro KERNEL2x2_I1 +#endif lxvd2x vs8, 0, AO @@ -2415,9 +2939,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_1', ` +#else .macro KERNEL2x2_1 +#endif lxvd2x vs8, 0, AO @@ -2432,9 +2964,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_2', ` +#else .macro KERNEL2x2_2 +#endif lxvd2x vs0, 0, AO @@ -2449,18 +2989,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs8, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_E2', ` +#else .macro KERNEL2x2_E2 +#endif xvmaddadp vs32, vs8, vs28 xvmaddadp vs40, vs8, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUBI1', ` +#else .macro KERNEL2x2_SUBI1 +#endif lxvd2x vs0, 0, AO @@ -2475,9 +3031,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUB1', ` +#else .macro KERNEL2x2_SUB1 +#endif lxvd2x vs0, 0, AO @@ -2492,9 +3056,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x2', ` +#else .macro SAVE2x2 +#endif mr T1, CO @@ -2526,13 +3098,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=2, M=1 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD2x1_1', ` +#else .macro LOAD2x1_1 +#endif lxsdx vs0, 0, AO @@ -2542,9 +3122,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 8 addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_I1', ` +#else .macro KERNEL2x1_I1 +#endif lxsdx vs8, 0, AO @@ -2559,9 +3147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_1', ` +#else .macro KERNEL2x1_1 +#endif lxsdx vs8, 0, AO @@ -2576,9 +3172,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_2', ` +#else .macro KERNEL2x1_2 +#endif lxsdx vs0, 0, AO @@ -2593,18 +3197,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs40, vs8, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_E2', ` +#else .macro KERNEL2x1_E2 +#endif xsmaddadp vs32, vs8, vs28 xsmaddadp vs40, vs8, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUBI1', ` +#else .macro KERNEL2x1_SUBI1 +#endif lxsdx vs0, 0, AO @@ -2619,9 +3239,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUB1', ` +#else .macro KERNEL2x1_SUB1 +#endif lxsdx vs0, 0, AO @@ -2636,9 +3264,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x1', ` +#else .macro SAVE2x1 +#endif mr T1, CO @@ -2670,13 +3306,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=1, M=16 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD1x16_1', ` +#else .macro LOAD1x16_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2695,9 +3339,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_I1', ` +#else .macro KERNEL1x16_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2726,9 +3378,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs38, vs6, vs24 xvmuldp vs39, vs7, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_1', ` +#else .macro KERNEL1x16_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2757,9 +3417,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs38, vs6, vs24 xvmaddadp vs39, vs7, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_2', ` +#else .macro KERNEL1x16_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2788,9 +3456,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs38, vs14, vs28 xvmaddadp vs39, vs15, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_E2', ` +#else .macro KERNEL1x16_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -2802,9 +3478,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs38, vs14, vs28 xvmaddadp vs39, vs15, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_SUBI1', ` +#else .macro KERNEL1x16_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2833,9 +3517,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs38, vs6, vs24 xvmuldp vs39, vs7, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_SUB1', ` +#else .macro KERNEL1x16_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2864,9 +3556,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs38, vs6, vs24 xvmaddadp vs39, vs7, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x16', ` +#else .macro SAVE1x16 +#endif mr T1, CO addi T2, T1, 64 @@ -2915,13 +3615,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=8 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD1x8_1', ` +#else .macro LOAD1x8_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2933,9 +3641,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_I1', ` +#else .macro KERNEL1x8_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2953,9 +3669,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_1', ` +#else .macro KERNEL1x8_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2973,9 +3697,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_2', ` +#else .macro KERNEL1x8_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2993,9 +3725,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_E2', ` +#else .macro KERNEL1x8_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -3003,9 +3743,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUBI1', ` +#else .macro KERNEL1x8_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3023,9 +3771,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUB1', ` +#else .macro KERNEL1x8_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3043,9 +3799,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x8', ` +#else .macro SAVE1x8 +#endif mr T1, CO @@ -3075,13 +3839,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=1, M=4 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD1x4_1', ` +#else .macro LOAD1x4_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3091,9 +3863,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_I1', ` +#else .macro KERNEL1x4_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -3107,9 +3887,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_1', ` +#else .macro KERNEL1x4_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -3123,9 +3911,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_2', ` +#else .macro KERNEL1x4_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3139,17 +3935,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_E2', ` +#else .macro KERNEL1x4_E2 +#endif xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUBI1', ` +#else .macro KERNEL1x4_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3163,9 +3975,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUB1', ` +#else .macro KERNEL1x4_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3179,9 +3999,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x4', ` +#else .macro SAVE1x4 +#endif mr T1, CO @@ -3203,13 +4031,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=1, M=2 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD1x2_1', ` +#else .macro LOAD1x2_1 +#endif lxvd2x vs0, 0, AO @@ -3218,9 +4054,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_I1', ` +#else .macro KERNEL1x2_I1 +#endif lxvd2x vs8, 0, AO @@ -3232,9 +4076,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_1', ` +#else .macro KERNEL1x2_1 +#endif lxvd2x vs8, 0, AO @@ -3246,9 +4098,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_2', ` +#else .macro KERNEL1x2_2 +#endif lxvd2x vs0, 0, AO @@ -3260,16 +4120,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs32, vs8, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_E2', ` +#else .macro KERNEL1x2_E2 +#endif xvmaddadp vs32, vs8, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUBI1', ` +#else .macro KERNEL1x2_SUBI1 +#endif lxvd2x vs0, 0, AO @@ -3281,9 +4157,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUB1', ` +#else .macro KERNEL1x2_SUB1 +#endif lxvd2x vs0, 0, AO @@ -3295,9 +4179,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x2', ` +#else .macro SAVE1x2 +#endif mr T1, CO @@ -3315,13 +4207,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=1, M=1 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD1x1_1', ` +#else .macro LOAD1x1_1 +#endif lxsdx vs0, 0, AO @@ -3330,9 +4230,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_I1', ` +#else .macro KERNEL1x1_I1 +#endif lxsdx vs8, 0, AO @@ -3344,9 +4252,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_1', ` +#else .macro KERNEL1x1_1 +#endif lxsdx vs8, 0, AO @@ -3358,9 +4274,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_2', ` +#else .macro KERNEL1x1_2 +#endif lxsdx vs0, 0, AO @@ -3372,16 +4296,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs32, vs8, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_E2', ` +#else .macro KERNEL1x1_E2 +#endif xsmaddadp vs32, vs8, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUBI1', ` +#else .macro KERNEL1x1_SUBI1 +#endif lxsdx vs0, 0, AO @@ -3393,9 +4333,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUB1', ` +#else .macro KERNEL1x1_SUB1 +#endif lxsdx vs0, 0, AO @@ -3407,9 +4355,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x1', ` +#else .macro SAVE1x1 +#endif mr T1, CO @@ -3427,5 +4383,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/dtrsm_macros_LT_16x4_power8.S b/kernel/power/dtrsm_macros_LT_16x4_power8.S index dc47daa3a..5a5c4037c 100644 --- a/kernel/power/dtrsm_macros_LT_16x4_power8.S +++ b/kernel/power/dtrsm_macros_LT_16x4_power8.S @@ -1,46 +1,58 @@ +#if defined(_AIX) +define(`INIT_16x4', ` +#else .macro INIT_16x4 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 - xvmovdp vs34, vs0 - xvmovdp vs35, vs0 - xvmovdp vs36, vs0 - xvmovdp vs37, vs0 - xvmovdp vs38, vs0 - xvmovdp vs39, vs0 - xvmovdp vs40, vs0 - xvmovdp vs41, vs0 - xvmovdp vs42, vs0 - xvmovdp vs43, vs0 - xvmovdp vs44, vs0 - xvmovdp vs45, vs0 - xvmovdp vs46, vs0 - xvmovdp vs47, vs0 - xvmovdp vs48, vs0 - xvmovdp vs49, vs0 - xvmovdp vs50, vs0 - xvmovdp vs51, vs0 - xvmovdp vs52, vs0 - xvmovdp vs53, vs0 - xvmovdp vs54, vs0 - xvmovdp vs55, vs0 - xvmovdp vs56, vs0 - xvmovdp vs57, vs0 - xvmovdp vs58, vs0 - xvmovdp vs59, vs0 - xvmovdp vs60, vs0 - xvmovdp vs61, vs0 - xvmovdp vs62, vs0 - xvmovdp vs63, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) + XVMOVDP(vs34,vs0) + XVMOVDP(vs35,vs0) + XVMOVDP(vs36,vs0) + XVMOVDP(vs37,vs0) + XVMOVDP(vs38,vs0) + XVMOVDP(vs39,vs0) + XVMOVDP(vs40,vs0) + XVMOVDP(vs41,vs0) + XVMOVDP(vs42,vs0) + XVMOVDP(vs43,vs0) + XVMOVDP(vs44,vs0) + XVMOVDP(vs45,vs0) + XVMOVDP(vs46,vs0) + XVMOVDP(vs47,vs0) + XVMOVDP(vs48,vs0) + XVMOVDP(vs49,vs0) + XVMOVDP(vs50,vs0) + XVMOVDP(vs51,vs0) + XVMOVDP(vs52,vs0) + XVMOVDP(vs53,vs0) + XVMOVDP(vs54,vs0) + XVMOVDP(vs55,vs0) + XVMOVDP(vs56,vs0) + XVMOVDP(vs57,vs0) + XVMOVDP(vs58,vs0) + XVMOVDP(vs59,vs0) + XVMOVDP(vs60,vs0) + XVMOVDP(vs61,vs0) + XVMOVDP(vs62,vs0) + XVMOVDP(vs63,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_16x4', ` +#else .macro KERNEL_16x4 +#endif lxvd2x vs0, o0, AO @@ -98,35 +110,51 @@ xvmaddadp vs63, vs7, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_8x4', ` +#else .macro INIT_8x4 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 - xvmovdp vs34, vs0 - xvmovdp vs35, vs0 - xvmovdp vs36, vs0 - xvmovdp vs37, vs0 - xvmovdp vs38, vs0 - xvmovdp vs39, vs0 - xvmovdp vs40, vs0 - xvmovdp vs41, vs0 - xvmovdp vs42, vs0 - xvmovdp vs43, vs0 - xvmovdp vs44, vs0 - xvmovdp vs45, vs0 - xvmovdp vs46, vs0 - xvmovdp vs47, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) + XVMOVDP(vs34,vs0) + XVMOVDP(vs35,vs0) + XVMOVDP(vs36,vs0) + XVMOVDP(vs37,vs0) + XVMOVDP(vs38,vs0) + XVMOVDP(vs39,vs0) + XVMOVDP(vs40,vs0) + XVMOVDP(vs41,vs0) + XVMOVDP(vs42,vs0) + XVMOVDP(vs43,vs0) + XVMOVDP(vs44,vs0) + XVMOVDP(vs45,vs0) + XVMOVDP(vs46,vs0) + XVMOVDP(vs47,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_8x4', ` +#else .macro KERNEL_8x4 +#endif lxvd2x vs0, o0, AO @@ -161,27 +189,43 @@ xvmaddadp vs47, vs3, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_4x4', ` +#else .macro INIT_4x4 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 - xvmovdp vs34, vs0 - xvmovdp vs35, vs0 - xvmovdp vs36, vs0 - xvmovdp vs37, vs0 - xvmovdp vs38, vs0 - xvmovdp vs39, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) + XVMOVDP(vs34,vs0) + XVMOVDP(vs35,vs0) + XVMOVDP(vs36,vs0) + XVMOVDP(vs37,vs0) + XVMOVDP(vs38,vs0) + XVMOVDP(vs39,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_4x4', ` +#else .macro KERNEL_4x4 +#endif lxvd2x vs0, o0, AO @@ -206,23 +250,39 @@ xvmaddadp vs39, vs1, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) 
+define(`INIT_2x4', ` +#else .macro INIT_2x4 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 - xvmovdp vs34, vs0 - xvmovdp vs35, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) + XVMOVDP(vs34,vs0) + XVMOVDP(vs35,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_2x4', ` +#else .macro KERNEL_2x4 +#endif lxvd2x vs0, o0, AO @@ -242,23 +302,39 @@ xvmaddadp vs35, vs0, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_1x4', ` +#else .macro INIT_1x4 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 - xvmovdp vs34, vs0 - xvmovdp vs35, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) + XVMOVDP(vs34,vs0) + XVMOVDP(vs35,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_1x4', ` +#else .macro KERNEL_1x4 +#endif lxvdsx vs0, o0, AO @@ -278,14 +354,22 @@ xvmaddadp vs35, vs0, vs19 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 16x4 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_16x4', ` +#else .macro SOLVE_LT_16x4 +#endif //############### LOAD B ####################### @@ -1149,46 +1233,46 @@ stxsdx vs32, o0, T1 - xxswapd vs32, vs32 + XXSWAPD(vs32,vs32) stxsdx vs34, o8, T1 - xxswapd vs34, vs34 + XXSWAPD(vs34,vs34) stxsdx vs36, o16, T1 - xxswapd vs36, vs36 + XXSWAPD(vs36,vs36) stxsdx vs38, o24, T1 - xxswapd vs38, vs38 + XXSWAPD(vs38,vs38) addi T1, T1, 32 stxsdx vs40, o0, T1 - xxswapd vs40, vs40 + XXSWAPD(vs40,vs40) stxsdx vs42, o8, T1 - xxswapd vs42, vs42 + XXSWAPD(vs42,vs42) stxsdx vs44, o16, T1 - xxswapd vs44, vs44 + XXSWAPD(vs44,vs44) stxsdx vs46, o24, T1 - xxswapd vs46, vs46 + XXSWAPD(vs46,vs46) addi T1, T1, 32 stxsdx vs48, o0, T1 - xxswapd vs48, vs48 + XXSWAPD(vs48,vs48) stxsdx vs50, o8, T1 - xxswapd vs50, vs50 + XXSWAPD(vs50,vs50) stxsdx vs52, o16, T1 - xxswapd vs52, vs52 + XXSWAPD(vs52,vs52) stxsdx vs54, o24, T1 - xxswapd vs54, vs54 + XXSWAPD(vs54,vs54) addi T1, T1, 32 stxsdx vs56, o0, T1 - xxswapd vs56, vs56 + XXSWAPD(vs56,vs56) stxsdx vs58, o8, T1 - xxswapd vs58, vs58 + XXSWAPD(vs58,vs58) stxsdx vs60, o16, T1 - xxswapd vs60, vs60 + XXSWAPD(vs60,vs60) stxsdx vs62, o24, T1 - xxswapd vs62, vs62 + XXSWAPD(vs62,vs62) stxsdx vs32, o0, T2 stxsdx vs34, o8, T2 @@ -1225,46 +1309,46 @@ stxsdx vs33, o0, T1 - xxswapd vs33, vs33 + XXSWAPD(vs33,vs33) stxsdx vs35, o8, T1 - xxswapd vs35, vs35 + XXSWAPD(vs35,vs35) stxsdx vs37, o16, T1 - xxswapd vs37, vs37 + XXSWAPD(vs37,vs37) stxsdx vs39, o24, T1 - xxswapd vs39, vs39 + XXSWAPD(vs39,vs39) addi T1, T1, 32 stxsdx vs41, o0, T1 - xxswapd vs41, vs41 + XXSWAPD(vs41,vs41) stxsdx vs43, o8, T1 - xxswapd vs43, vs43 + XXSWAPD(vs43,vs43) stxsdx vs45, o16, T1 - xxswapd vs45, vs45 + XXSWAPD(vs45,vs45) stxsdx vs47, o24, T1 - xxswapd vs47, vs47 + XXSWAPD(vs47,vs47) addi T1, T1, 32 stxsdx vs49, o0, T1 - xxswapd vs49, vs49 + XXSWAPD(vs49,vs49) stxsdx vs51, o8, T1 - xxswapd vs51, vs51 + XXSWAPD(vs51,vs51) stxsdx vs53, o16, T1 - xxswapd vs53, vs53 + XXSWAPD(vs53,vs53) stxsdx vs55, o24, T1 - xxswapd vs55, vs55 + XXSWAPD(vs55,vs55) addi T1, T1, 32 stxsdx vs57, o0, T1 - xxswapd vs57, vs57 + XXSWAPD(vs57,vs57) stxsdx vs59, o8, T1 - xxswapd vs59, vs59 + XXSWAPD(vs59,vs59) stxsdx vs61, o16, T1 - xxswapd vs61, vs61 + XXSWAPD(vs61,vs61) stxsdx vs63, o24, T1 - xxswapd vs63, vs63 + XXSWAPD(vs63,vs63) stxsdx vs33, o0, T2 stxsdx vs35, o8, T2 @@ -1292,14 +1376,22 @@ stxsdx vs61, o16, T2 
stxsdx vs63, o24, T2 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 8x4 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_8x4', ` +#else .macro SOLVE_LT_8x4 +#endif xxpermdi vs0, vs32, vs33, 0 xxpermdi vs1, vs34, vs35, 0 @@ -1603,24 +1695,24 @@ stxsdx vs32, o0, T1 - xxswapd vs32, vs32 + XXSWAPD(vs32,vs32) stxsdx vs34, o8, T1 - xxswapd vs34, vs34 + XXSWAPD(vs34,vs34) stxsdx vs36, o16, T1 - xxswapd vs36, vs36 + XXSWAPD(vs36,vs36) stxsdx vs38, o24, T1 - xxswapd vs38, vs38 + XXSWAPD(vs38,vs38) addi T1, T1, 32 stxsdx vs40, o0, T1 - xxswapd vs40, vs40 + XXSWAPD(vs40,vs40) stxsdx vs42, o8, T1 - xxswapd vs42, vs42 + XXSWAPD(vs42,vs42) stxsdx vs44, o16, T1 - xxswapd vs44, vs44 + XXSWAPD(vs44,vs44) stxsdx vs46, o24, T1 - xxswapd vs46, vs46 + XXSWAPD(vs46,vs46) stxsdx vs32, o0, T2 stxsdx vs34, o8, T2 @@ -1643,24 +1735,24 @@ stxsdx vs33, o0, T1 - xxswapd vs33, vs33 + XXSWAPD(vs33,vs33) stxsdx vs35, o8, T1 - xxswapd vs35, vs35 + XXSWAPD(vs35,vs35) stxsdx vs37, o16, T1 - xxswapd vs37, vs37 + XXSWAPD(vs37,vs37) stxsdx vs39, o24, T1 - xxswapd vs39, vs39 + XXSWAPD(vs39,vs39) addi T1, T1, 32 stxsdx vs41, o0, T1 - xxswapd vs41, vs41 + XXSWAPD(vs41,vs41) stxsdx vs43, o8, T1 - xxswapd vs43, vs43 + XXSWAPD(vs43,vs43) stxsdx vs45, o16, T1 - xxswapd vs45, vs45 + XXSWAPD(vs45,vs45) stxsdx vs47, o24, T1 - xxswapd vs47, vs47 + XXSWAPD(vs47,vs47) stxsdx vs33, o0, T2 stxsdx vs35, o8, T2 @@ -1674,14 +1766,22 @@ stxsdx vs45, o16, T2 stxsdx vs47, o24, T2 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 4x4 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_4x4', ` +#else .macro SOLVE_LT_4x4 +#endif xxpermdi vs0, vs32, vs33, 0 xxpermdi vs1, vs34, vs35, 0 @@ -1813,13 +1913,13 @@ stxsdx vs32, o0, T1 - xxswapd vs32, vs32 + XXSWAPD(vs32,vs32) stxsdx vs34, o8, T1 - xxswapd vs34, vs34 + XXSWAPD(vs34,vs34) stxsdx vs36, o16, T1 - xxswapd vs36, vs36 + XXSWAPD(vs36,vs36) stxsdx vs38, o24, T1 - xxswapd vs38, vs38 + XXSWAPD(vs38,vs38) stxsdx vs32, o0, T2 stxsdx vs34, o8, T2 @@ -1835,27 +1935,35 @@ stxsdx vs33, o0, T1 - xxswapd vs33, vs33 + XXSWAPD(vs33,vs33) stxsdx vs35, o8, T1 - xxswapd vs35, vs35 + XXSWAPD(vs35,vs35) stxsdx vs37, o16, T1 - xxswapd vs37, vs37 + XXSWAPD(vs37,vs37) stxsdx vs39, o24, T1 - xxswapd vs39, vs39 + XXSWAPD(vs39,vs39) stxsdx vs33, o0, T2 stxsdx vs35, o8, T2 stxsdx vs37, o16, T2 stxsdx vs39, o24, T2 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 2x4 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_2x4', ` +#else .macro SOLVE_LT_2x4 +#endif xxpermdi vs0, vs32, vs33, 0 xxpermdi vs1, vs34, vs35, 0 @@ -1925,9 +2033,9 @@ stxsdx vs32, o0, T1 - xxswapd vs32, vs32 + XXSWAPD(vs32,vs32) stxsdx vs34, o8, T1 - xxswapd vs34, vs34 + XXSWAPD(vs34,vs34) stxsdx vs32, o0, T2 stxsdx vs34, o8, T2 @@ -1941,21 +2049,29 @@ stxsdx vs33, o0, T1 - xxswapd vs33, vs33 + XXSWAPD(vs33,vs33) stxsdx vs35, o8, T1 - xxswapd vs35, vs35 + XXSWAPD(vs35,vs35) stxsdx vs33, o0, T2 stxsdx vs35, o8, T2 +#if defined(_AIX) +') +#else .endm +#endif 
/*########################################################################################## SOLVE_LT 1x4 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_1x4', ` +#else .macro SOLVE_LT_1x4 +#endif xxpermdi vs0, vs32, vs33, 0 xxpermdi vs1, vs34, vs35, 0 @@ -2001,7 +2117,7 @@ stxsdx vs32, o0, T1 - xxswapd vs32, vs32 + XXSWAPD(vs32,vs32) stxsdx vs32, o0, T2 @@ -2014,39 +2130,55 @@ stxsdx vs33, o0, T1 - xxswapd vs33, vs33 + XXSWAPD(vs33,vs33) stxsdx vs33, o0, T2 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_16x2', ` +#else .macro INIT_16x2 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 - xvmovdp vs34, vs0 - xvmovdp vs35, vs0 - xvmovdp vs36, vs0 - xvmovdp vs37, vs0 - xvmovdp vs38, vs0 - xvmovdp vs39, vs0 - xvmovdp vs40, vs0 - xvmovdp vs41, vs0 - xvmovdp vs42, vs0 - xvmovdp vs43, vs0 - xvmovdp vs44, vs0 - xvmovdp vs45, vs0 - xvmovdp vs46, vs0 - xvmovdp vs47, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) + XVMOVDP(vs34,vs0) + XVMOVDP(vs35,vs0) + XVMOVDP(vs36,vs0) + XVMOVDP(vs37,vs0) + XVMOVDP(vs38,vs0) + XVMOVDP(vs39,vs0) + XVMOVDP(vs40,vs0) + XVMOVDP(vs41,vs0) + XVMOVDP(vs42,vs0) + XVMOVDP(vs43,vs0) + XVMOVDP(vs44,vs0) + XVMOVDP(vs45,vs0) + XVMOVDP(vs46,vs0) + XVMOVDP(vs47,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_16x2', ` +#else .macro KERNEL_16x2 +#endif lxvd2x vs0, o0, AO @@ -2086,27 +2218,43 @@ xvmaddadp vs47, vs7, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_8x2', ` +#else .macro INIT_8x2 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 - xvmovdp vs34, vs0 - xvmovdp vs35, vs0 - xvmovdp vs36, vs0 - xvmovdp vs37, vs0 - xvmovdp vs38, vs0 - xvmovdp vs39, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) + XVMOVDP(vs34,vs0) + XVMOVDP(vs35,vs0) + XVMOVDP(vs36,vs0) + XVMOVDP(vs37,vs0) + XVMOVDP(vs38,vs0) + XVMOVDP(vs39,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_8x2', ` +#else .macro KERNEL_8x2 +#endif lxvd2x vs0, o0, AO @@ -2131,23 +2279,39 @@ xvmaddadp vs39, vs3, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_4x2', ` +#else .macro INIT_4x2 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 - xvmovdp vs34, vs0 - xvmovdp vs35, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) + XVMOVDP(vs34,vs0) + XVMOVDP(vs35,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_4x2', ` +#else .macro KERNEL_4x2 +#endif lxvd2x vs0, o0, AO @@ -2166,21 +2330,37 @@ xvmaddadp vs35, vs1, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_2x2', ` +#else .macro INIT_2x2 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_2x2', ` +#else .macro KERNEL_2x2 +#endif lxvd2x vs0, o0, AO @@ -2196,21 +2376,37 @@ xvmaddadp vs33, vs0, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_1x2', ` +#else .macro INIT_1x2 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_1x2', ` +#else .macro KERNEL_1x2 +#endif lxvdsx vs0, o0, AO @@ -2226,14 +2422,22 @@ xvmaddadp vs33, vs0, vs17 +#if defined(_AIX) +') +#else .endm +#endif 
/*########################################################################################## SOLVE_LT 16x2 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_16x2', ` +#else .macro SOLVE_LT_16x2 +#endif xxpermdi vs0, vs32, vs33, 0 xxpermdi vs1, vs32, vs33, 3 @@ -2821,46 +3025,46 @@ stxsdx vs32, o0, T1 - xxswapd vs32, vs32 + XXSWAPD(vs32,vs32) stxsdx vs33, o8, T1 - xxswapd vs33, vs33 + XXSWAPD(vs33,vs33) stxsdx vs34, o16, T1 - xxswapd vs34, vs34 + XXSWAPD(vs34,vs34) stxsdx vs35, o24, T1 - xxswapd vs35, vs35 + XXSWAPD(vs35,vs35) addi T1, T1, 32 stxsdx vs36, o0, T1 - xxswapd vs36, vs36 + XXSWAPD(vs36,vs36) stxsdx vs37, o8, T1 - xxswapd vs37, vs37 + XXSWAPD(vs37,vs37) stxsdx vs38, o16, T1 - xxswapd vs38, vs38 + XXSWAPD(vs38,vs38) stxsdx vs39, o24, T1 - xxswapd vs39, vs39 + XXSWAPD(vs39,vs39) addi T1, T1, 32 stxsdx vs40, o0, T1 - xxswapd vs40, vs40 + XXSWAPD(vs40,vs40) stxsdx vs41, o8, T1 - xxswapd vs41, vs41 + XXSWAPD(vs41,vs41) stxsdx vs42, o16, T1 - xxswapd vs42, vs42 + XXSWAPD(vs42,vs42) stxsdx vs43, o24, T1 - xxswapd vs43, vs43 + XXSWAPD(vs43,vs43) addi T1, T1, 32 stxsdx vs44, o0, T1 - xxswapd vs44, vs44 + XXSWAPD(vs44,vs44) stxsdx vs45, o8, T1 - xxswapd vs45, vs45 + XXSWAPD(vs45,vs45) stxsdx vs46, o16, T1 - xxswapd vs46, vs46 + XXSWAPD(vs46,vs46) stxsdx vs47, o24, T1 - xxswapd vs47, vs47 + XXSWAPD(vs47,vs47) stxsdx vs32, o0, T2 stxsdx vs33, o8, T2 @@ -2888,14 +3092,22 @@ stxsdx vs46, o16, T2 stxsdx vs47, o24, T2 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 8x2 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_8x2', ` +#else .macro SOLVE_LT_8x2 +#endif xxpermdi vs0, vs32, vs33, 0 xxpermdi vs1, vs32, vs33, 3 @@ -3111,24 +3323,24 @@ stxsdx vs32, o0, T1 - xxswapd vs32, vs32 + XXSWAPD(vs32,vs32) stxsdx vs33, o8, T1 - xxswapd vs33, vs33 + XXSWAPD(vs33,vs33) stxsdx vs34, o16, T1 - xxswapd vs34, vs34 + XXSWAPD(vs34,vs34) stxsdx vs35, o24, T1 - xxswapd vs35, vs35 + XXSWAPD(vs35,vs35) addi T1, T1, 32 stxsdx vs36, o0, T1 - xxswapd vs36, vs36 + XXSWAPD(vs36,vs36) stxsdx vs37, o8, T1 - xxswapd vs37, vs37 + XXSWAPD(vs37,vs37) stxsdx vs38, o16, T1 - xxswapd vs38, vs38 + XXSWAPD(vs38,vs38) stxsdx vs39, o24, T1 - xxswapd vs39, vs39 + XXSWAPD(vs39,vs39) stxsdx vs32, o0, T2 stxsdx vs33, o8, T2 @@ -3142,14 +3354,22 @@ stxsdx vs38, o16, T2 stxsdx vs39, o24, T2 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 4x2 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_4x2', ` +#else .macro SOLVE_LT_4x2 +#endif xxpermdi vs0, vs32, vs33, 0 xxpermdi vs1, vs32, vs33, 3 @@ -3245,27 +3465,35 @@ stxsdx vs32, o0, T1 - xxswapd vs32, vs32 + XXSWAPD(vs32,vs32) stxsdx vs33, o8, T1 - xxswapd vs33, vs33 + XXSWAPD(vs33,vs33) stxsdx vs34, o16, T1 - xxswapd vs34, vs34 + XXSWAPD(vs34,vs34) stxsdx vs35, o24, T1 - xxswapd vs35, vs35 + XXSWAPD(vs35,vs35) stxsdx vs32, o0, T2 stxsdx vs33, o8, T2 stxsdx vs34, o16, T2 stxsdx vs35, o24, T2 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 2x2 ##########################################################################################*/ +#if defined(_AIX) 
+define(`SOLVE_LT_2x2', ` +#else .macro SOLVE_LT_2x2 +#endif xxpermdi vs0, vs32, vs33, 0 xxpermdi vs1, vs32, vs33, 3 @@ -3322,21 +3550,29 @@ stxsdx vs32, o0, T1 - xxswapd vs32, vs32 + XXSWAPD(vs32,vs32) stxsdx vs33, o8, T1 - xxswapd vs33, vs33 + XXSWAPD(vs33,vs33) stxsdx vs32, o0, T2 stxsdx vs33, o8, T2 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 1x2 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_1x2', ` +#else .macro SOLVE_LT_1x2 +#endif xxpermdi vs0, vs32, vs33, 0 @@ -3376,39 +3612,55 @@ stxsdx vs32, o0, T1 - xxswapd vs32, vs32 + XXSWAPD(vs32,vs32) stxsdx vs32, o0, T2 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_16x1', ` +#else .macro INIT_16x1 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 - xvmovdp vs34, vs0 - xvmovdp vs35, vs0 - xvmovdp vs36, vs0 - xvmovdp vs37, vs0 - xvmovdp vs38, vs0 - xvmovdp vs39, vs0 - xvmovdp vs40, vs0 - xvmovdp vs41, vs0 - xvmovdp vs42, vs0 - xvmovdp vs43, vs0 - xvmovdp vs44, vs0 - xvmovdp vs45, vs0 - xvmovdp vs46, vs0 - xvmovdp vs47, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) + XVMOVDP(vs34,vs0) + XVMOVDP(vs35,vs0) + XVMOVDP(vs36,vs0) + XVMOVDP(vs37,vs0) + XVMOVDP(vs38,vs0) + XVMOVDP(vs39,vs0) + XVMOVDP(vs40,vs0) + XVMOVDP(vs41,vs0) + XVMOVDP(vs42,vs0) + XVMOVDP(vs43,vs0) + XVMOVDP(vs44,vs0) + XVMOVDP(vs45,vs0) + XVMOVDP(vs46,vs0) + XVMOVDP(vs47,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_16x1', ` +#else .macro KERNEL_16x1 +#endif lxvdsx vs0, o0, AO @@ -3461,27 +3713,43 @@ xvmaddadp vs47, vs15, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_8x1', ` +#else .macro INIT_8x1 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 - xvmovdp vs34, vs0 - xvmovdp vs35, vs0 - xvmovdp vs36, vs0 - xvmovdp vs37, vs0 - xvmovdp vs38, vs0 - xvmovdp vs39, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) + XVMOVDP(vs34,vs0) + XVMOVDP(vs35,vs0) + XVMOVDP(vs36,vs0) + XVMOVDP(vs37,vs0) + XVMOVDP(vs38,vs0) + XVMOVDP(vs39,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_8x1', ` +#else .macro KERNEL_8x1 +#endif lxvdsx vs0, o0, AO @@ -3512,23 +3780,39 @@ xvmaddadp vs39, vs7, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_4x1', ` +#else .macro INIT_4x1 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 - xvmovdp vs34, vs0 - xvmovdp vs35, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) + XVMOVDP(vs34,vs0) + XVMOVDP(vs35,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_4x1', ` +#else .macro KERNEL_4x1 +#endif lxvdsx vs0, o0, AO @@ -3548,21 +3832,37 @@ xvmaddadp vs35, vs3, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_2x1', ` +#else .macro INIT_2x1 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_2x1', ` +#else .macro KERNEL_2x1 +#endif lxvdsx vs0, o0, AO @@ -3578,20 +3878,36 @@ xvmaddadp vs33, vs1, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_1x1', ` +#else .macro INIT_1x1 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 + XVMOVDP(vs32,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_1x1', ` +#else .macro 
KERNEL_1x1 +#endif lxvdsx vs0, o0, AO @@ -3605,31 +3921,39 @@ xvmaddadp vs32, vs0, vs16 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 16x1 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_16x1', ` +#else .macro SOLVE_LT_16x1 +#endif - xxswapd vs0, vs32 - xxswapd vs1, vs33 - xxswapd vs2, vs34 - xxswapd vs3, vs35 - xxswapd vs4, vs36 - xxswapd vs5, vs37 - xxswapd vs6, vs38 - xxswapd vs7, vs39 - xxswapd vs8, vs40 - xxswapd vs9, vs41 - xxswapd vs10, vs42 - xxswapd vs11, vs43 - xxswapd vs12, vs44 - xxswapd vs13, vs45 - xxswapd vs14, vs46 - xxswapd vs15, vs47 + XXSWAPD(vs0,vs32) + XXSWAPD(vs1,vs33) + XXSWAPD(vs2,vs34) + XXSWAPD(vs3,vs35) + XXSWAPD(vs4,vs36) + XXSWAPD(vs5,vs37) + XXSWAPD(vs6,vs38) + XXSWAPD(vs7,vs39) + XXSWAPD(vs8,vs40) + XXSWAPD(vs9,vs41) + XXSWAPD(vs10,vs42) + XXSWAPD(vs11,vs43) + XXSWAPD(vs12,vs44) + XXSWAPD(vs13,vs45) + XXSWAPD(vs14,vs46) + XXSWAPD(vs15,vs47) //############### LOAD B ####################### @@ -4215,23 +4539,31 @@ stxsdx vs46, o16, T1 stxsdx vs47, o24, T1 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 8x1 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_8x1', ` +#else .macro SOLVE_LT_8x1 +#endif - xxswapd vs0, vs32 - xxswapd vs1, vs33 - xxswapd vs2, vs34 - xxswapd vs3, vs35 - xxswapd vs4, vs36 - xxswapd vs5, vs37 - xxswapd vs6, vs38 - xxswapd vs7, vs39 + XXSWAPD(vs0,vs32) + XXSWAPD(vs1,vs33) + XXSWAPD(vs2,vs34) + XXSWAPD(vs3,vs35) + XXSWAPD(vs4,vs36) + XXSWAPD(vs5,vs37) + XXSWAPD(vs6,vs38) + XXSWAPD(vs7,vs39) //############### LOAD B ####################### @@ -4443,19 +4775,27 @@ stxsdx vs38, o16, T1 stxsdx vs39, o24, T1 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 4x1 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_4x1', ` +#else .macro SOLVE_LT_4x1 +#endif - xxswapd vs0, vs32 - xxswapd vs1, vs33 - xxswapd vs2, vs34 - xxswapd vs3, vs35 + XXSWAPD(vs0,vs32) + XXSWAPD(vs1,vs33) + XXSWAPD(vs2,vs34) + XXSWAPD(vs3,vs35) //############### LOAD B ####################### @@ -4546,17 +4886,25 @@ stxsdx vs34, o16, T1 stxsdx vs35, o24, T1 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 2x1 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_2x1', ` +#else .macro SOLVE_LT_2x1 +#endif - xxswapd vs0, vs32 - xxswapd vs1, vs33 + XXSWAPD(vs0,vs32) + XXSWAPD(vs1,vs33) //############### LOAD B ####################### @@ -4609,16 +4957,24 @@ stxsdx vs32, o0, T1 stxsdx vs33, o8, T1 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 1x1 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_1x1', ` +#else .macro SOLVE_LT_1x1 +#endif - xxswapd vs0, vs32 + XXSWAPD(vs0,vs32) //############### LOAD B ####################### @@ -4655,5 +5011,9 @@ stxsdx vs32, o0, T1 +#if defined(_AIX) +') +#else .endm +#endif 
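Note on the wrapping pattern that repeats through the hunks above: each GNU-as .macro NAME / .endm pair in these POWER8 kernel sources is bracketed by #if defined(_AIX) guards so that an AIX build sees the body as define(`NAME', ` ... ') instead; that quoting style is m4's, so the apparent intent is that the preprocessed file is expanded by m4 before it reaches the assembler, while non-AIX builds keep the ordinary assembler macro. A minimal sketch of the pattern, using the hypothetical macro name EXAMPLE_MACRO (not one of the kernel macros above):

    #if defined(_AIX)
    define(`EXAMPLE_MACRO', `
    #else
    .macro EXAMPLE_MACRO
    #endif

        /* macro body: ordinary VSX instructions */
        xxlxor   vs0, vs0, vs0          /* vs0 = 0 */
        XVMOVDP(vs32,vs0)               /* clear the vs32 accumulator via the XVMOVDP wrapper */

    #if defined(_AIX)
    ')
    #else
    .endm
    #endif

On non-AIX targets the assembler still sees a plain .macro EXAMPLE_MACRO ... .endm; on AIX only the define(`EXAMPLE_MACRO', ` and ') lines survive preprocessing, so each later use of EXAMPLE_MACRO is expanded textually. The XVMOVDP/XXSWAPD/XXSPLTD_S style wrappers that replace the bare xvmovdp/xxswapd/xxspltd mnemonics in the dtrsm and idamax/idamin/izamax hunks, and the one%=:/two%= local labels that replace the numeric 1:/2f labels in the inline assembly, appear to serve the same goal of keeping one source tree assembling with both toolchains.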
diff --git a/kernel/power/idamax.c b/kernel/power/idamax.c index 5bdc0a13c..623ac9fb0 100644 --- a/kernel/power/idamax.c +++ b/kernel/power/idamax.c @@ -58,8 +58,8 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" "xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8 "vaddudm 9,8,%[adder] \n\t" //{3,2} vs41 @@ -69,7 +69,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { "vaddudm 11,10,%[adder] \n\t" //{7,6} vs43 "xxlxor 39,39,39 \n\t" // vs39 vec_max_value "vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4 - "xxspltd 36,36,0 \n\t" + XXSPLTD_S(36,36,0) "xvabsdp 44, 44 \n\t" "xvabsdp 45, 45 \n\t" @@ -77,21 +77,21 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { "xvabsdp 47, 47 \n\t" "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //jump first half forward - "b 2f \n\t" + "b two%= \n\t" //=================================================================== - ".p2align 5 \n\t" + ".align 5 \n\t" - "1: \n\t" + "one%=: \n\t" "xvcmpgtdp 2,45,44 \n\t " "xvcmpgtdp 3,47,46 \n\t " "xvcmpgtdp 4,49,48 \n\t " - "xvcmpgtdp 5,51,50 \n\t" + "xvcmpgtdp 5,7,6 \n\t" "xxsel 32,40,41,2 \n\t" "xxsel 0,44,45,2 \n\t" @@ -100,7 +100,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { "xxsel 34,40,41,4 \n\t" "xxsel 45,48,49,4 \n\t" "xxsel 35,42,43,5 \n\t" - "xxsel 47,50,51,5 \n\t" + "xxsel 47,6,7,5 \n\t" "xvcmpgtdp 2, 1,0 \n\t" "xvcmpgtdp 3,47, 45 \n\t" @@ -134,8 +134,8 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { "vaddudm 1,1,5 \n\t" // get real index for first bigger - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" //compare with previous to get vec_max_index(v6 | vs38 ) and vec_max_value (vs39) "xvcmpgtdp 2, 3,39 \n\t" @@ -155,16 +155,16 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //<-----------jump here from first load - "2: \n\t" + "two%=: \n\t" "xvcmpgtdp 2,45,44 \n\t " "xvcmpgtdp 3,47,46 \n\t " "xvcmpgtdp 4,49,48 \n\t " - "xvcmpgtdp 5,51,50 \n\t" + "xvcmpgtdp 5,7,6 \n\t" "xxsel 32,40,41,2 \n\t" "xxsel 0,44,45,2 \n\t" @@ -173,7 +173,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { "xxsel 34,40,41,4 \n\t" "xxsel 45,48,49,4 \n\t" "xxsel 35,42,43,5 \n\t" - "xxsel 47,50,51,5 \n\t" + "xxsel 47,6,7,5 \n\t" "xvcmpgtdp 2, 1,0 \n\t" "xvcmpgtdp 3,47, 45 \n\t" @@ -203,8 +203,8 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { "vaddudm 1,1,5 \n\t" // get real index for first bigger - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" @@ -226,21 +226,21 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //decrement n "addic. 
%[n], %[n], -32 \n\t" //Loop back if >0 - "bgt+ 1b \n\t" + "bgt+ one%= \n\t" //============================================================================== "xvcmpgtdp 2,45,44 \n\t " "xvcmpgtdp 3,47,46 \n\t " "xvcmpgtdp 4,49,48 \n\t " - "xvcmpgtdp 5,51,50 \n\t" + "xvcmpgtdp 5,7,6 \n\t" "xxsel 32,40,41,2 \n\t" "xxsel 0,44,45,2 \n\t" @@ -249,7 +249,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { "xxsel 34,40,41,4 \n\t" "xxsel 45,48,49,4 \n\t" "xxsel 35,42,43,5 \n\t" - "xxsel 47,50,51,5 \n\t" + "xxsel 47,6,7,5 \n\t" "xvcmpgtdp 2, 1,0 \n\t" "xvcmpgtdp 3,47, 45 \n\t" @@ -276,28 +276,28 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { ///////extract max value and max index from vector - "xxspltd 32,38,1 \n\t" - "xxspltd 40,39,1 \n\t" + XXSPLTD_S(32,38,1) + XXSPLTD_S(40,39,1) "xvcmpeqdp. 2, 40,39 \n\t" //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely //0b001110=14 - "bc 14,24, 3f \n\t" + "bc 14,24, three%= \n\t" "xvcmpgtdp 4, 40,39 \n\t" "xxsel 0,39,40,4 \n\t" "xxsel 1,38,32,4 \n\t" "stxsdx 0,0,%[ptr_maxf] \n\t" - "b 4f \n\t" + "b four%= \n\t" - "3: \n\t" + "three%=: \n\t" //if elements value are equal then choose minimum index - "xxspltd 0,40,0 \n\t" + XXSPLTD_S(0,40,0) "vminud 0,0,6 \n\t" //vs32 vs38 "xxlor 1,32,32 \n\t" "stxsdx 0,0,%[ptr_maxf] \n\t" - "4: \n\t" + "four%=: \n\t" "mfvsrd %[index],1 \n\t" : [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n) @@ -306,7 +306,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112), [start] "v"(start), [adder] "v"(temp_add_index) : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36", - "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51" + "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7" ); diff --git a/kernel/power/idamin.c b/kernel/power/idamin.c index 7fe0f8a33..b2705f2fa 100644 --- a/kernel/power/idamin.c +++ b/kernel/power/idamin.c @@ -58,8 +58,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" "xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8 "vaddudm 9,8, %[adder] \n\t" //{3,2} vs41 @@ -69,7 +69,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "vaddudm 11,10,%[adder] \n\t" //{7,6} vs43 "lxvdsx 39,0,%[ptr_minf] \n\t" // vs39 vec_min_value "vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4 - "xxspltd 36,36,0 \n\t" + XXSPLTD_S(36,36,0) "xvabsdp 39, 39 \n\t" "xvabsdp 44, 44 \n\t" @@ -78,21 +78,21 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "xvabsdp 47, 47 \n\t" "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //jump first half forward - "b 2f \n\t" + "b two%= \n\t" //=================================================================== - ".p2align 5 \n\t" + ".align 5 \n\t" - "1: \n\t" + "one%=: \n\t" "xvcmpgtdp 2,44,45 \n\t " "xvcmpgtdp 3,46,47 \n\t " "xvcmpgtdp 4,48,49 \n\t " - "xvcmpgtdp 5,50,51 \n\t" + "xvcmpgtdp 5,6,7 \n\t" "xxsel 32,40,41,2 \n\t" "xxsel 
0,44,45,2 \n\t" @@ -101,7 +101,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "xxsel 34,40,41,4 \n\t" "xxsel 45,48,49,4 \n\t" "xxsel 35,42,43,5 \n\t" - "xxsel 47,50,51,5 \n\t" + "xxsel 47,6,7,5 \n\t" "xvcmpgtdp 2,0, 1 \n\t" "xvcmpgtdp 3, 45,47 \n\t" @@ -135,8 +135,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "vaddudm 1,1,5 \n\t" // get real index for first smaller - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" //compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39) "xvcmpgtdp 2,39, 3 \n\t" @@ -156,16 +156,16 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //<-----------jump here from first load - "2: \n\t" + "two%=: \n\t" "xvcmpgtdp 2,44,45 \n\t " "xvcmpgtdp 3,46,47 \n\t " "xvcmpgtdp 4,48,49 \n\t " - "xvcmpgtdp 5,50,51 \n\t" + "xvcmpgtdp 5,6,7 \n\t" "xxsel 32,40,41,2 \n\t" "xxsel 0,44,45,2 \n\t" @@ -174,7 +174,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "xxsel 34,40,41,4 \n\t" "xxsel 45,48,49,4 \n\t" "xxsel 35,42,43,5 \n\t" - "xxsel 47,50,51,5 \n\t" + "xxsel 47,6,7,5 \n\t" "xvcmpgtdp 2,0, 1 \n\t" "xvcmpgtdp 3, 45,47 \n\t" @@ -204,8 +204,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "vaddudm 1,1,5 \n\t" // get real index for first smaller - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" @@ -227,21 +227,21 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //decrement n "addic. %[n], %[n], -32 \n\t" //Loop back if >0 - "bgt+ 1b \n\t" + "bgt+ one%= \n\t" //============================================================================== "xvcmpgtdp 2,44,45 \n\t " "xvcmpgtdp 3,46,47 \n\t " "xvcmpgtdp 4,48,49 \n\t " - "xvcmpgtdp 5,50,51 \n\t" + "xvcmpgtdp 5,6,7 \n\t" "xxsel 32,40,41,2 \n\t" "xxsel 0,44,45,2 \n\t" @@ -250,7 +250,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "xxsel 34,40,41,4 \n\t" "xxsel 45,48,49,4 \n\t" "xxsel 35,42,43,5 \n\t" - "xxsel 47,50,51,5 \n\t" + "xxsel 47,6,7,5 \n\t" "xvcmpgtdp 2,0, 1 \n\t" "xvcmpgtdp 3, 45,47 \n\t" @@ -277,28 +277,28 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { ///////extract min value and min index from vector - "xxspltd 32,38,1 \n\t" - "xxspltd 40,39,1 \n\t" + XXSPLTD_S(32,38,1) + XXSPLTD_S(40,39,1) "xvcmpeqdp. 
2, 40,39 \n\t" //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely //0b001110=14 - "bc 14,24, 3f \n\t" + "bc 14,24, three%= \n\t" "xvcmpgtdp 4,39, 40 \n\t" "xxsel 0,39,40,4 \n\t" "xxsel 1,38,32,4 \n\t" "stxsdx 0,0,%[ptr_minf] \n\t" - "b 4f \n\t" + "b four%= \n\t" - "3: \n\t" + "three%=: \n\t" //if elements value are equal then choose minimum index - "xxspltd 0,40,0 \n\t" + XXSPLTD_S(0,40,0) "vminud 0,0,6 \n\t" //vs32 vs38 "xxlor 1,32,32 \n\t" "stxsdx 0,0,%[ptr_minf] \n\t" - "4: \n\t" + "four%=: \n\t" "mfvsrd %[index],1 \n\t" : [minf] "=m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n) @@ -307,7 +307,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112), [start] "v"(start), [adder] "v"(temp_add_index) : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36", - "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51" + "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7" ); return index; diff --git a/kernel/power/izamax.c b/kernel/power/izamax.c index cfe78c8c0..339c3ccde 100644 --- a/kernel/power/izamax.c +++ b/kernel/power/izamax.c @@ -56,8 +56,8 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" "xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8 "vaddudm 9,8,%[adder] \n\t" //{3,2} vs41 @@ -67,7 +67,7 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { "vaddudm 11,10,%[adder] \n\t" //{7,6} vs43 "xxlxor 39,39,39 \n\t" // vs39 vec_max_value is zero "vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4 - "xxspltd 36,36,0 \n\t" + XXSPLTD_S(36,36,0) @@ -77,24 +77,24 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { "xvabsdp 47, 47 \n\t" "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //jump first half forward - "b 2f \n\t" + "b two%= \n\t" - ".p2align 5 \n\t" - "1: \n\t" + ".align 5 \n\t" + "one%=: \n\t" - "xxmrghd 0,44,45 \n\t" - "xxmrgld 1,44,45 \n\t" - "xxmrghd 2,46,47 \n\t" - "xxmrgld 3,46,47 \n\t" - "xxmrghd 4,48,49 \n\t" - "xxmrgld 5,48,49 \n\t" - "xxmrghd 44,50,51 \n\t" - "xxmrgld 45,50,51 \n\t" + XXMRGHD_S(0,44,45) + XXMRGLD_S(1,44,45) + XXMRGHD_S(2,46,47) + XXMRGLD_S(3,46,47) + XXMRGHD_S(4,48,49) + XXMRGLD_S(5,48,49) + XXMRGHD_S(44,6,7) + XXMRGLD_S(45,6,7) "xvadddp 46, 0,1 \n\t" "xvadddp 47, 2,3 \n\t" @@ -103,15 +103,15 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { - "xvcmpgtdp 50,47,46 \n\t " - "xvcmpgtdp 51,49,48 \n\t " + "xvcmpgtdp 6,47,46 \n\t " + "xvcmpgtdp 7,49,48 \n\t " "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" - "xxsel 32,40,41,50 \n\t" - "xxsel 0,46,47,50 \n\t" - "xxsel 33,42,43,51 \n\t" - "xxsel 1,48,49,51 \n\t" + "xxsel 32,40,41,6 \n\t" + "xxsel 0,46,47,6 \n\t" + "xxsel 33,42,43,7 \n\t" + "xxsel 1,48,49,7 \n\t" "lxvd2x 44, 0,%[ptr_tmp] \n\t" "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" @@ -133,8 +133,8 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" - "lxvd2x 50, 
%[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" //select with previous "xxsel 38,38,32,4 \n\t" "xxsel 39,39,3,4 \n\t" @@ -148,35 +148,35 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { "xvabsdp 47, 47 \n\t" "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //>>/////////////////////////////// half start - "2: \n\t" - "xxmrghd 0,44,45 \n\t" - "xxmrgld 1,44,45 \n\t" - "xxmrghd 2,46,47 \n\t" - "xxmrgld 3,46,47 \n\t" - "xxmrghd 4,48,49 \n\t" - "xxmrgld 5,48,49 \n\t" - "xxmrghd 44,50,51 \n\t" - "xxmrgld 45,50,51 \n\t" + "two%=: \n\t" + XXMRGHD_S(0,44,45) + XXMRGLD_S(1,44,45) + XXMRGHD_S(2,46,47) + XXMRGLD_S(3,46,47) + XXMRGHD_S(4,48,49) + XXMRGLD_S(5,48,49) + XXMRGHD_S(44,6,7) + XXMRGLD_S(45,6,7) "xvadddp 46, 0,1 \n\t" "xvadddp 47, 2,3 \n\t" "xvadddp 48, 4,5 \n\t" "xvadddp 49, 44,45 \n\t" - "xvcmpgtdp 50,47,46 \n\t " - "xvcmpgtdp 51,49,48 \n\t " + "xvcmpgtdp 6,47,46 \n\t " + "xvcmpgtdp 7,49,48 \n\t " "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" - "xxsel 32,40,41,50 \n\t" - "xxsel 0,46,47,50 \n\t" - "xxsel 33,42,43,51 \n\t" - "xxsel 1,48,49,51 \n\t" + "xxsel 32,40,41,6 \n\t" + "xxsel 0,46,47,6 \n\t" + "xxsel 33,42,43,7 \n\t" + "xxsel 1,48,49,7 \n\t" "lxvd2x 44, 0,%[ptr_tmp] \n\t" "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" @@ -198,8 +198,8 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" //select with previous "xxsel 38,38,32,4 \n\t" "xxsel 39,39,3,4 \n\t" @@ -211,24 +211,24 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { "xvabsdp 47, 47 \n\t" "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //decrement n "addic. %[n], %[n], -16 \n\t" //Loop back if >0 - "bgt+ 1b \n\t" + "bgt+ one%= \n\t" - "xxmrghd 0,44,45 \n\t" - "xxmrgld 1,44,45 \n\t" - "xxmrghd 2,46,47 \n\t" - "xxmrgld 3,46,47 \n\t" - "xxmrghd 4,48,49 \n\t" - "xxmrgld 5,48,49 \n\t" - "xxmrghd 44,50,51 \n\t" - "xxmrgld 45,50,51 \n\t" + XXMRGHD_S(0,44,45) + XXMRGLD_S(1,44,45) + XXMRGHD_S(2,46,47) + XXMRGLD_S(3,46,47) + XXMRGHD_S(4,48,49) + XXMRGLD_S(5,48,49) + XXMRGHD_S(44,6,7) + XXMRGLD_S(45,6,7) "xvadddp 46, 0,1 \n\t" "xvadddp 47, 2,3 \n\t" @@ -237,13 +237,13 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { - "xvcmpgtdp 50,47,46 \n\t " - "xvcmpgtdp 51,49,48 \n\t " + "xvcmpgtdp 6,47,46 \n\t " + "xvcmpgtdp 7,49,48 \n\t " - "xxsel 32,40,41,50 \n\t" - "xxsel 0,46,47,50 \n\t" - "xxsel 33,42,43,51 \n\t" - "xxsel 1,48,49,51 \n\t" + "xxsel 32,40,41,6 \n\t" + "xxsel 0,46,47,6 \n\t" + "xxsel 33,42,43,7 \n\t" + "xxsel 1,48,49,7 \n\t" "xvcmpgtdp 2,1,0 \n\t " "xxsel 32,32,33,2 \n\t" @@ -262,28 +262,28 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { ///////extract max value and max index from vector - "xxspltd 32,38,1 \n\t" - "xxspltd 40,39,1 \n\t" + XXSPLTD_S(32,38,1) + XXSPLTD_S(40,39,1) "xvcmpeqdp. 
2, 40,39 \n\t" //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely //0b001110=14 - "bc 14,24, 3f \n\t" + "bc 14,24, three%= \n\t" "xvcmpgtdp 4, 40,39 \n\t" "xxsel 0,39,40,4 \n\t" "xxsel 1,38,32,4 \n\t" "stxsdx 0,0,%[ptr_maxf] \n\t" - "b 4f \n\t" + "b four%= \n\t" - "3: \n\t" + "three%=: \n\t" //if elements value are equal then choose minimum index - "xxspltd 0,40,0 \n\t" + XXSPLTD_S(0,40,0) "vminud 0,0,6 \n\t" //vs32 vs38 "xxlor 1,32,32 \n\t" "stxsdx 0,0,%[ptr_maxf] \n\t" - "4: \n\t" + "four%=: \n\t" "mfvsrd %[index],1 \n\t" : [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n) @@ -292,7 +292,7 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112), [start] "v"(start), [adder] "v"(temp_add_index) : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36", - "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51" + "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7" ); return index; diff --git a/kernel/power/izamin.c b/kernel/power/izamin.c index 1ffa3ba8b..6d0d15547 100644 --- a/kernel/power/izamin.c +++ b/kernel/power/izamin.c @@ -54,8 +54,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" "xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8 "vaddudm 9,8,%[adder] \n\t" //{3,2} vs41 @@ -65,7 +65,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "vaddudm 11,10,%[adder] \n\t" //{7,6} vs43 "lxvdsx 39,0,%[ptr_minf] \n\t" // vs39 vec_min_value "vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4 - "xxspltd 36,36,0 \n\t" + XXSPLTD_S(36,36,0) @@ -75,24 +75,24 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "xvabsdp 47, 47 \n\t" "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //jump first half forward - "b 2f \n\t" + "b two%= \n\t" - ".p2align 5 \n\t" - "1: \n\t" + ".align 5 \n\t" + "one%=: \n\t" - "xxmrghd 0,44,45 \n\t" - "xxmrgld 1,44,45 \n\t" - "xxmrghd 2,46,47 \n\t" - "xxmrgld 3,46,47 \n\t" - "xxmrghd 4,48,49 \n\t" - "xxmrgld 5,48,49 \n\t" - "xxmrghd 44,50,51 \n\t" - "xxmrgld 45,50,51 \n\t" + XXMRGHD_S(0,44,45) + XXMRGLD_S(1,44,45) + XXMRGHD_S(2,46,47) + XXMRGLD_S(3,46,47) + XXMRGHD_S(4,48,49) + XXMRGLD_S(5,48,49) + XXMRGHD_S(44,6,7) + XXMRGLD_S(45,6,7) "xvadddp 46, 0,1 \n\t" "xvadddp 47, 2,3 \n\t" @@ -101,15 +101,15 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { - "xvcmpgtdp 50,46,47 \n\t " - "xvcmpgtdp 51,48,49 \n\t " + "xvcmpgtdp 6,46,47 \n\t " + "xvcmpgtdp 7,48,49 \n\t " "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" - "xxsel 32,40,41,50 \n\t" - "xxsel 0,46,47,50 \n\t" - "xxsel 33,42,43,51 \n\t" - "xxsel 1,48,49,51 \n\t" + "xxsel 32,40,41,6 \n\t" + "xxsel 0,46,47,6 \n\t" + "xxsel 33,42,43,7 \n\t" + "xxsel 1,48,49,7 \n\t" "lxvd2x 44, 0,%[ptr_tmp] \n\t" "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" @@ -131,8 +131,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 49, 
%[i80],%[ptr_tmp] \n\t" - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" //select with previous "xxsel 38,38,32,4 \n\t" "xxsel 39,39,3,4 \n\t" @@ -146,35 +146,35 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "xvabsdp 47, 47 \n\t" "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //>>/////////////////////////////// half start - "2: \n\t" - "xxmrghd 0,44,45 \n\t" - "xxmrgld 1,44,45 \n\t" - "xxmrghd 2,46,47 \n\t" - "xxmrgld 3,46,47 \n\t" - "xxmrghd 4,48,49 \n\t" - "xxmrgld 5,48,49 \n\t" - "xxmrghd 44,50,51 \n\t" - "xxmrgld 45,50,51 \n\t" + "two%=: \n\t" + XXMRGHD_S(0,44,45) + XXMRGLD_S(1,44,45) + XXMRGHD_S(2,46,47) + XXMRGLD_S(3,46,47) + XXMRGHD_S(4,48,49) + XXMRGLD_S(5,48,49) + XXMRGHD_S(44,6,7) + XXMRGLD_S(45,6,7) "xvadddp 46, 0,1 \n\t" "xvadddp 47, 2,3 \n\t" "xvadddp 48, 4,5 \n\t" "xvadddp 49, 44,45 \n\t" - "xvcmpgtdp 50,46,47 \n\t " - "xvcmpgtdp 51,48,49 \n\t " + "xvcmpgtdp 6,46,47 \n\t " + "xvcmpgtdp 7,48,49 \n\t " "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" - "xxsel 32,40,41,50 \n\t" - "xxsel 0,46,47,50 \n\t" - "xxsel 33,42,43,51 \n\t" - "xxsel 1,48,49,51 \n\t" + "xxsel 32,40,41,6 \n\t" + "xxsel 0,46,47,6 \n\t" + "xxsel 33,42,43,7 \n\t" + "xxsel 1,48,49,7 \n\t" "lxvd2x 44, 0,%[ptr_tmp] \n\t" "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" @@ -196,8 +196,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" //select with previous "xxsel 38,38,32,4 \n\t" "xxsel 39,39,3,4 \n\t" @@ -209,24 +209,24 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "xvabsdp 47, 47 \n\t" "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //decrement n "addic. %[n], %[n], -16 \n\t" //Loop back if >0 - "bgt+ 1b \n\t" + "bgt+ one%= \n\t" - "xxmrghd 0,44,45 \n\t" - "xxmrgld 1,44,45 \n\t" - "xxmrghd 2,46,47 \n\t" - "xxmrgld 3,46,47 \n\t" - "xxmrghd 4,48,49 \n\t" - "xxmrgld 5,48,49 \n\t" - "xxmrghd 44,50,51 \n\t" - "xxmrgld 45,50,51 \n\t" + XXMRGHD_S(0,44,45) + XXMRGLD_S(1,44,45) + XXMRGHD_S(2,46,47) + XXMRGLD_S(3,46,47) + XXMRGHD_S(4,48,49) + XXMRGLD_S(5,48,49) + XXMRGHD_S(44,6,7) + XXMRGLD_S(45,6,7) "xvadddp 46, 0,1 \n\t" "xvadddp 47, 2,3 \n\t" @@ -235,13 +235,13 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { - "xvcmpgtdp 50,46,47 \n\t " - "xvcmpgtdp 51,48,49 \n\t " + "xvcmpgtdp 6,46,47 \n\t " + "xvcmpgtdp 7,48,49 \n\t " - "xxsel 32,40,41,50 \n\t" - "xxsel 0,46,47,50 \n\t" - "xxsel 33,42,43,51 \n\t" - "xxsel 1,48,49,51 \n\t" + "xxsel 32,40,41,6 \n\t" + "xxsel 0,46,47,6 \n\t" + "xxsel 33,42,43,7 \n\t" + "xxsel 1,48,49,7 \n\t" "xvcmpgtdp 2,0,1 \n\t " "xxsel 32,32,33,2 \n\t" @@ -260,28 +260,28 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { ///////extract min value and min index from vector - "xxspltd 32,38,1 \n\t" - "xxspltd 40,39,1 \n\t" + XXSPLTD_S(32,38,1) + XXSPLTD_S(40,39,1) "xvcmpeqdp. 
2, 40,39 \n\t" //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely //0b001110=14 - "bc 14,24, 3f \n\t" + "bc 14,24, three%= \n\t" "xvcmpgtdp 4,39, 40 \n\t" "xxsel 0,39,40,4 \n\t" "xxsel 1,38,32,4 \n\t" "stxsdx 0,0,%[ptr_minf] \n\t" - "b 4f \n\t" + "b four%= \n\t" - "3: \n\t" + "three%=: \n\t" //if elements value are equal then choose minimum index - "xxspltd 0,40,0 \n\t" + XXSPLTD_S(0,40,0) "vminud 0,0,6 \n\t" //vs32 vs38 "xxlor 1,32,32 \n\t" "stxsdx 0,0,%[ptr_minf] \n\t" - "4: \n\t" + "four%=: \n\t" "mfvsrd %[index],1 \n\t" : [minf] "=m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n) @@ -290,7 +290,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112), [start] "v"(start), [adder] "v"(temp_add_index) : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36", - "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51" + "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7" ); return index; diff --git a/kernel/power/lock.c b/kernel/power/lock.c index 51348d63c..1c1b006b0 100644 --- a/kernel/power/lock.c +++ b/kernel/power/lock.c @@ -46,10 +46,10 @@ static void __inline blas_lock(volatile BLASULONG *address){ " .machine \"any\" ;" "0: lwarx %0,0, %1 ;" " cmpwi 0,%0,0;" - " bne 1f;" + " bne one%=;" " stwcx. %2,0, %1 ;" " bne- 0b;" - "1: " + "one%=: " : "=&r"(ret) : "r"(address), "r" (val) : "cr0", "memory"); diff --git a/kernel/power/sasum_microk_power8.c b/kernel/power/sasum_microk_power8.c index 4bb515de8..aa465c38e 100644 --- a/kernel/power/sasum_microk_power8.c +++ b/kernel/power/sasum_microk_power8.c @@ -68,10 +68,10 @@ static float sasum_kernel_32 (long n, float *x) "addi %2, %2, 128 \n\t" "addic. %1, %1, -32 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvabssp 48, 40 \n\t" "xvabssp 49, 41 \n\t" @@ -108,9 +108,9 @@ static float sasum_kernel_32 (long n, float *x) "xvaddsp 38, 38, %x5 \n\t" "xvaddsp 39, 39, %x6 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvabssp 48, 40 \n\t" "xvabssp 49, 41 \n\t" diff --git a/kernel/power/scopy_microk_power8.c b/kernel/power/scopy_microk_power8.c index 7a54d5e1e..da39789b1 100644 --- a/kernel/power/scopy_microk_power8.c +++ b/kernel/power/scopy_microk_power8.c @@ -51,10 +51,10 @@ static void scopy_kernel_32 (long n, float *x, float *y) "addi %2, %2, 128 \n\t" "addic. %1, %1, -32 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "stxvd2x 40, 0, %3 \n\t" "stxvd2x 41, %5, %3 \n\t" @@ -77,9 +77,9 @@ static void scopy_kernel_32 (long n, float *x, float *y) "addi %2, %2, 128 \n\t" "addic. %1, %1, -32 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "stxvd2x 40, 0, %3 \n\t" "stxvd2x 41, %5, %3 \n\t" diff --git a/kernel/power/sdot_microk_power8.c b/kernel/power/sdot_microk_power8.c index bfe100c8b..a8db6a8d6 100644 --- a/kernel/power/sdot_microk_power8.c +++ b/kernel/power/sdot_microk_power8.c @@ -78,10 +78,10 @@ static float sdot_kernel_16 (long n, float *x, float *y) "addi %3, %3, 128 \n\t" "addic. 
%1, %1, -32 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvmaddasp 32, 40, 48 \n\t" "lxvd2x 40, 0, %2 \n\t" @@ -112,9 +112,9 @@ static float sdot_kernel_16 (long n, float *x, float *y) "addi %3, %3, 128 \n\t" "addic. %1, %1, -32 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvmaddasp 32, 40, 48 \n\t" "xvmaddasp 33, 41, 49 \n\t" diff --git a/kernel/power/sgemm_macros_16x8_power8.S b/kernel/power/sgemm_macros_16x8_power8.S index 98414857f..9bcfca827 100644 --- a/kernel/power/sgemm_macros_16x8_power8.S +++ b/kernel/power/sgemm_macros_16x8_power8.S @@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=8 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD8x16_1', ` +#else .macro LOAD8x16_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -63,9 +67,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 128 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_I1', ` +#else .macro KERNEL8x16_I1 +#endif lxvw4x vs4, o0, AO @@ -133,9 +145,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs63, vs3, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_1', ` +#else .macro KERNEL8x16_1 +#endif lxvw4x vs4, o0, AO @@ -203,9 +223,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs3, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_2', ` +#else .macro KERNEL8x16_2 +#endif lxvw4x vs0, o0, AO @@ -273,9 +301,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs7, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_E2', ` +#else .macro KERNEL8x16_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -319,9 +355,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs7, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_SUBI1', ` +#else .macro KERNEL8x16_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -389,9 +433,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs63, vs3, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_SUB1', ` +#else .macro KERNEL8x16_SUB1 +#endif lxvw4x vs0, o0, AO @@ -459,9 +511,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs3, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE8x16', ` +#else .macro SAVE8x16 +#endif mr T1, CO @@ -698,14 +758,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=8 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD8x8_1', ` +#else .macro LOAD8x8_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -728,9 +796,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi BO, BO, 128 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_I1', ` +#else .macro KERNEL8x8_I1 +#endif lxvw4x vs4, o0, AO @@ -780,9 +856,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_1', ` +#else .macro KERNEL8x8_1 +#endif lxvw4x vs4, o0, AO @@ -832,9 +916,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_2', ` +#else .macro KERNEL8x8_2 +#endif lxvw4x vs0, o0, AO @@ -884,9 +976,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs5, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_E2', ` +#else .macro KERNEL8x8_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -914,9 +1014,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs5, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_SUBI1', ` +#else .macro KERNEL8x8_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -966,9 +1074,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_SUB1', ` +#else .macro KERNEL8x8_SUB1 +#endif lxvw4x vs0, o0, AO @@ -1018,9 +1134,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE8x8', ` +#else .macro SAVE8x8 +#endif mr T1, CO @@ -1193,14 +1317,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=8 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD8x4_1', ` +#else .macro LOAD8x4_1 +#endif lxvw4x vs0, o0, AO @@ -1222,9 +1354,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 128 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_I1', ` +#else .macro KERNEL8x4_I1 +#endif lxvw4x vs4, o0, AO @@ -1265,9 +1405,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_1', ` +#else .macro KERNEL8x4_1 +#endif lxvw4x vs4, o0, AO @@ -1308,9 +1456,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_2', ` +#else .macro KERNEL8x4_2 +#endif lxvw4x vs0, o0, AO @@ -1351,9 +1507,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs4, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_E2', ` +#else .macro KERNEL8x4_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -1373,9 +1537,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs39, vs4, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_SUBI1', ` +#else .macro KERNEL8x4_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -1416,9 +1588,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_SUB1', ` +#else .macro KERNEL8x4_SUB1 +#endif lxvw4x vs0, o0, AO @@ -1459,9 +1639,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE8x4', ` +#else .macro SAVE8x4 +#endif mr T1, CO @@ -1602,14 +1790,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=8 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD8x2_1', ` +#else .macro LOAD8x2_1 +#endif lxsspx vs0, o0, AO lxsspx vs1, o4, AO @@ -1633,9 +1829,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 128 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_I1', ` +#else .macro KERNEL8x2_I1 +#endif lxsspx vs4, o0, AO @@ -1686,9 +1890,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_1', ` +#else .macro KERNEL8x2_1 +#endif lxsspx vs4, o0, AO @@ -1739,9 +1951,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_2', ` +#else .macro KERNEL8x2_2 +#endif lxsspx vs0, o0, AO @@ -1792,9 +2012,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs5, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_E2', ` +#else .macro KERNEL8x2_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -1822,9 +2050,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs5, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_SUBI1', ` +#else .macro KERNEL8x2_SUBI1 +#endif lxsspx vs0, o0, AO @@ -1875,9 +2111,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_SUB1', ` +#else .macro KERNEL8x2_SUB1 +#endif lxsspx vs0, o0, AO @@ -1928,9 +2172,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE8x2', ` +#else .macro SAVE8x2 +#endif mr T1, CO @@ -2103,14 +2355,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=8 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD8x1_1', ` +#else .macro LOAD8x1_1 +#endif lxsspx vs0, o0, AO @@ -2133,9 +2393,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 128 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_I1', ` +#else .macro KERNEL8x1_I1 +#endif lxsspx vs4, o0, AO @@ -2177,9 +2445,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_1', ` +#else .macro KERNEL8x1_1 +#endif lxsspx vs4, o0, AO @@ -2221,9 +2497,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_2', ` +#else .macro KERNEL8x1_2 +#endif lxsspx vs0, o0, AO @@ -2265,9 +2549,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs4, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_E2', ` +#else .macro KERNEL8x1_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -2287,9 +2579,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs4, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_SUBI1', ` +#else .macro KERNEL8x1_SUBI1 +#endif lxsspx vs0, o0, AO @@ -2331,9 +2631,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_SUB1', ` +#else .macro KERNEL8x1_SUB1 +#endif lxsspx vs0, o0, AO @@ -2375,9 +2683,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE8x1', ` +#else .macro SAVE8x1 +#endif mr T1, CO @@ -2518,14 +2834,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 4 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x16_1', ` +#else .macro LOAD4x16_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -2543,9 +2867,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_I1', ` +#else .macro KERNEL4x16_I1 +#endif lxvw4x vs4, o0, AO @@ -2586,9 +2918,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs3, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_1', ` +#else .macro KERNEL4x16_1 +#endif lxvw4x vs4, o0, AO @@ -2629,9 +2969,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs47, vs3, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_2', ` +#else .macro KERNEL4x16_2 +#endif lxvw4x vs0, o0, AO @@ -2672,9 +3020,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs7, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_E2', ` +#else .macro KERNEL4x16_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -2698,9 +3054,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs7, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_SUBI1', ` +#else .macro KERNEL4x16_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -2741,9 +3105,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs3, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_SUB1', ` +#else .macro KERNEL4x16_SUB1 +#endif lxvw4x vs0, o0, AO @@ -2784,9 +3156,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs3, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x16', ` +#else .macro SAVE4x16 +#endif mr T1, CO @@ -2907,14 +3287,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x8_1', ` +#else .macro LOAD4x8_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -2930,9 +3318,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_I1', ` +#else .macro KERNEL4x8_I1 +#endif lxvw4x vs4, o0, AO @@ -2963,9 +3359,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_1', ` +#else .macro KERNEL4x8_1 +#endif lxvw4x vs4, o0, AO @@ -2996,9 +3400,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_2', ` +#else .macro KERNEL4x8_2 +#endif lxvw4x vs0, o0, AO @@ -3029,9 +3441,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs5, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_E2', ` +#else .macro KERNEL4x8_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -3047,9 +3467,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs5, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUBI1', ` +#else .macro KERNEL4x8_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -3080,9 +3508,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUB1', ` +#else .macro KERNEL4x8_SUB1 +#endif lxvw4x vs0, o0, AO @@ -3113,9 +3549,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x8', ` +#else .macro SAVE4x8 +#endif mr T1, CO @@ -3204,14 +3648,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x4_1', ` +#else .macro LOAD4x4_1 +#endif lxvw4x vs0, o0, AO @@ -3226,9 +3678,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_I1', ` +#else .macro KERNEL4x4_I1 +#endif lxvw4x vs4, o0, AO @@ -3254,9 +3714,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_1', ` +#else .macro KERNEL4x4_1 +#endif lxvw4x vs4, o0, AO @@ -3282,9 +3750,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_2', ` +#else .macro KERNEL4x4_2 +#endif lxvw4x vs0, o0, AO @@ -3310,9 +3786,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs4, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_E2', ` +#else .macro KERNEL4x4_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -3324,9 +3808,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs4, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUBI1', ` +#else .macro KERNEL4x4_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -3352,9 +3844,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUB1', ` +#else .macro KERNEL4x4_SUB1 +#endif lxvw4x vs0, o0, AO @@ -3380,9 +3880,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x4', ` +#else .macro SAVE4x4 +#endif mr T1, CO @@ -3455,14 +3963,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x2_1', ` +#else .macro LOAD4x2_1 +#endif lxsspx vs0, o0, AO lxsspx vs1, o4, AO @@ -3479,9 +3995,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_I1', ` +#else .macro KERNEL4x2_I1 +#endif lxsspx vs4, o0, AO @@ -3513,9 +4037,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_1', ` +#else .macro KERNEL4x2_1 +#endif lxsspx vs4, o0, AO @@ -3547,9 +4079,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmaddadp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_2', ` +#else .macro KERNEL4x2_2 +#endif lxsspx vs0, o0, AO @@ -3581,9 +4121,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs5, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_E2', ` +#else .macro KERNEL4x2_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -3599,9 +4147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs5, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUBI1', ` +#else .macro KERNEL4x2_SUBI1 +#endif lxsspx vs0, o0, AO @@ -3633,9 +4189,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUB1', ` +#else .macro KERNEL4x2_SUB1 +#endif lxsspx vs0, o0, AO @@ -3667,9 +4231,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x2', ` +#else .macro SAVE4x2 +#endif mr T1, CO @@ -3758,14 +4330,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x1_1', ` +#else .macro LOAD4x1_1 +#endif lxsspx vs0, o0, AO @@ -3781,9 +4361,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_I1', ` +#else .macro KERNEL4x1_I1 +#endif lxsspx vs4, o0, AO @@ -3810,9 +4398,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_1', ` +#else .macro KERNEL4x1_1 +#endif lxsspx vs4, o0, AO @@ -3839,9 +4435,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_2', ` +#else .macro KERNEL4x1_2 +#endif lxsspx vs0, o0, AO @@ -3868,9 +4472,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs4, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_E2', ` +#else .macro KERNEL4x1_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -3882,9 +4494,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs4, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUBI1', ` +#else .macro KERNEL4x1_SUBI1 +#endif lxsspx vs0, o0, AO @@ -3911,9 +4531,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUB1', ` +#else .macro KERNEL4x1_SUB1 +#endif lxsspx vs0, o0, AO @@ -3940,9 +4568,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmaddadp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x1', ` +#else .macro SAVE4x1 +#endif mr T1, CO @@ -4015,14 +4651,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 4 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x16_1', ` +#else .macro LOAD2x16_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -4038,9 +4682,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_I1', ` +#else .macro KERNEL2x16_I1 +#endif lxvw4x vs4, o0, AO @@ -4069,9 +4721,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs3, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_1', ` +#else .macro KERNEL2x16_1 +#endif lxvw4x vs4, o0, AO @@ -4100,9 +4760,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs3, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_2', ` +#else .macro KERNEL2x16_2 +#endif lxvw4x vs0, o0, AO @@ -4131,9 +4799,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs7, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_E2', ` +#else .macro KERNEL2x16_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -4147,9 +4823,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs7, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_SUBI1', ` +#else .macro KERNEL2x16_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -4178,9 +4862,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs3, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_SUB1', ` +#else .macro KERNEL2x16_SUB1 +#endif lxvw4x vs0, o0, AO @@ -4209,9 +4901,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs3, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x16', ` +#else .macro SAVE2x16 +#endif mr T1, CO @@ -4274,14 +4974,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x8_1', ` +#else .macro LOAD2x8_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -4295,9 +5003,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_I1', ` +#else .macro KERNEL2x8_I1 +#endif lxvw4x vs4, o0, AO @@ -4320,9 +5036,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_1', ` +#else .macro KERNEL2x8_1 +#endif lxvw4x vs4, o0, AO @@ -4345,9 +5069,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_2', ` +#else .macro KERNEL2x8_2 +#endif lxvw4x vs0, o0, AO @@ -4370,9 +5102,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs5, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_E2', ` +#else .macro KERNEL2x8_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -4382,9 +5122,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs5, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUBI1', ` +#else .macro KERNEL2x8_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -4407,9 +5155,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUB1', ` +#else .macro KERNEL2x8_SUB1 +#endif lxvw4x vs0, o0, AO @@ -4432,9 +5188,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x8', ` +#else .macro SAVE2x8 +#endif mr T1, CO @@ -4481,14 +5245,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x4_1', ` +#else .macro LOAD2x4_1 +#endif lxvw4x vs0, o0, AO @@ -4501,9 +5273,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_I1', ` +#else .macro KERNEL2x4_I1 +#endif lxvw4x vs4, o0, AO @@ -4523,9 +5303,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_1', ` +#else .macro KERNEL2x4_1 +#endif lxvw4x vs4, o0, AO @@ -4545,9 +5333,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_2', ` +#else .macro KERNEL2x4_2 +#endif lxvw4x vs0, o0, AO @@ -4567,9 +5363,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs4, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_E2', ` +#else .macro KERNEL2x4_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -4577,9 +5381,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs4, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUBI1', ` +#else .macro KERNEL2x4_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -4599,9 +5411,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUB1', ` +#else .macro KERNEL2x4_SUB1 +#endif lxvw4x vs0, o0, AO @@ -4621,9 +5441,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x4', ` +#else .macro SAVE2x4 +#endif mr T1, CO @@ -4662,14 +5490,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x2_1', ` +#else .macro LOAD2x2_1 +#endif lxsspx vs0, o0, AO lxsspx vs1, o4, AO @@ -4684,9 +5520,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_I1', ` +#else .macro KERNEL2x2_I1 +#endif lxsspx vs4, o0, AO @@ -4710,9 +5554,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_1', ` +#else .macro KERNEL2x2_1 +#endif lxsspx vs4, o0, AO @@ -4736,9 +5588,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_2', ` +#else .macro KERNEL2x2_2 +#endif lxsspx vs0, o0, AO @@ -4762,9 +5622,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs5, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_E2', ` +#else .macro KERNEL2x2_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -4774,9 +5642,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs5, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUBI1', ` +#else .macro KERNEL2x2_SUBI1 +#endif lxsspx vs0, o0, AO @@ -4800,9 +5676,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUB1', ` +#else .macro KERNEL2x2_SUB1 +#endif lxsspx vs0, o0, AO @@ -4826,9 +5710,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x2', ` +#else .macro SAVE2x2 +#endif mr T1, CO @@ -4875,14 +5767,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x1_1', ` +#else .macro LOAD2x1_1 +#endif lxsspx vs0, o0, AO @@ -4896,9 +5796,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_I1', ` +#else .macro KERNEL2x1_I1 +#endif lxsspx vs4, o0, AO @@ -4919,9 +5827,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmuldp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_1', ` +#else .macro KERNEL2x1_1 +#endif lxsspx vs4, o0, AO @@ -4942,9 +5858,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_2', ` +#else .macro KERNEL2x1_2 +#endif lxsspx vs0, o0, AO @@ -4965,9 +5889,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs4, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_E2', ` +#else .macro KERNEL2x1_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -4975,9 +5907,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs4, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUBI1', ` +#else .macro KERNEL2x1_SUBI1 +#endif lxsspx vs0, o0, AO @@ -4998,9 +5938,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUB1', ` +#else .macro KERNEL2x1_SUB1 +#endif lxsspx vs0, o0, AO @@ -5021,9 +5969,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x1', ` +#else .macro SAVE2x1 +#endif mr T1, CO @@ -5062,14 +6018,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 4 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x16_1', ` +#else .macro LOAD1x16_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -5084,9 +6048,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_I1', ` +#else .macro KERNEL1x16_I1 +#endif lxvw4x vs4, o0, AO @@ -5109,9 +6081,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs3, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_1', ` +#else .macro KERNEL1x16_1 +#endif lxvw4x vs4, o0, AO @@ -5134,9 +6114,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs3, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_2', ` +#else .macro KERNEL1x16_2 +#endif lxvw4x vs0, o0, AO @@ -5159,9 +6147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs7, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_E2', ` +#else .macro KERNEL1x16_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -5170,9 +6166,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs7, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_SUBI1', ` +#else .macro KERNEL1x16_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -5195,9 +6199,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs35, vs3, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_SUB1', ` +#else .macro KERNEL1x16_SUB1 +#endif lxvw4x vs0, o0, AO @@ -5220,9 +6232,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs3, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x16', ` +#else .macro SAVE1x16 +#endif mr T1, CO @@ -5256,14 +6276,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x8_1', ` +#else .macro LOAD1x8_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -5276,9 +6304,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_I1', ` +#else .macro KERNEL1x8_I1 +#endif lxvw4x vs4, o0, AO @@ -5297,9 +6333,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_1', ` +#else .macro KERNEL1x8_1 +#endif lxvw4x vs4, o0, AO @@ -5318,9 +6362,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_2', ` +#else .macro KERNEL1x8_2 +#endif lxvw4x vs0, o0, AO @@ -5339,18 +6391,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs5, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_E2', ` +#else .macro KERNEL1x8_E2 +#endif xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUBI1', ` +#else .macro KERNEL1x8_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -5369,9 +6437,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUB1', ` +#else .macro KERNEL1x8_SUB1 +#endif lxvw4x vs0, o0, AO @@ -5390,9 +6466,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x8', ` +#else .macro SAVE1x8 +#endif mr T1, CO @@ -5418,14 +6502,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x4_1', ` +#else .macro LOAD1x4_1 +#endif lxvw4x vs0, o0, AO @@ -5437,9 +6529,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_I1', ` +#else .macro KERNEL1x4_I1 +#endif lxvw4x vs4, o0, AO @@ -5456,9 +6556,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_1', ` +#else .macro KERNEL1x4_1 +#endif lxvw4x vs4, o0, AO @@ -5475,9 +6583,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_2', ` +#else .macro KERNEL1x4_2 +#endif lxvw4x vs0, o0, AO @@ -5494,17 +6610,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs32, vs4, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_E2', ` +#else .macro KERNEL1x4_E2 +#endif xvmaddasp vs32, vs4, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUBI1', ` +#else .macro KERNEL1x4_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -5521,9 +6653,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUB1', ` +#else .macro KERNEL1x4_SUB1 +#endif lxvw4x vs0, o0, AO @@ -5540,9 +6680,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x4', ` +#else .macro SAVE1x4 +#endif mr T1, CO @@ -5564,14 +6712,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x2_1', ` +#else .macro LOAD1x2_1 +#endif lxsspx vs0, o0, AO lxsspx vs1, o4, AO @@ -5585,9 +6741,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_I1', ` +#else .macro KERNEL1x2_I1 +#endif lxsspx vs4, o0, AO @@ -5607,9 +6771,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_1', ` +#else .macro KERNEL1x2_1 +#endif lxsspx vs4, o0, AO @@ -5629,9 +6801,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_2', ` +#else .macro KERNEL1x2_2 +#endif lxsspx vs0, o0, AO @@ -5651,18 +6831,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs5, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_E2', ` +#else .macro KERNEL1x2_E2 +#endif xsmaddadp vs32, vs4, vs16 xsmaddadp vs33, vs5, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUBI1', ` +#else .macro KERNEL1x2_SUBI1 +#endif lxsspx vs0, o0, AO @@ -5682,9 +6878,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUB1', ` +#else .macro KERNEL1x2_SUB1 +#endif lxsspx vs0, o0, AO @@ -5704,9 +6908,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmaddadp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x2', ` +#else .macro SAVE1x2 +#endif mr T1, CO @@ -5732,14 +6944,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x1_1', ` +#else .macro LOAD1x1_1 +#endif lxsspx vs0, o0, AO @@ -5752,9 +6972,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_I1', ` +#else .macro KERNEL1x1_I1 +#endif lxsspx vs4, o0, AO @@ -5772,9 +7000,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_1', ` +#else .macro KERNEL1x1_1 +#endif lxsspx vs4, o0, AO @@ -5792,9 +7028,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_2', ` +#else .macro KERNEL1x1_2 +#endif lxsspx vs0, o0, AO @@ -5812,17 +7056,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs32, vs4, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_E2', ` +#else .macro KERNEL1x1_E2 +#endif xsmaddadp vs32, vs4, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUBI1', ` +#else .macro KERNEL1x1_SUBI1 +#endif lxsspx vs0, o0, AO @@ -5840,9 +7100,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUB1', ` +#else .macro KERNEL1x1_SUB1 +#endif lxsspx vs0, o0, AO @@ -5860,9 +7128,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x1', ` +#else .macro SAVE1x1 +#endif mr T1, CO @@ -5884,13 +7160,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 4 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`COPYB_4x8', ` +#else .macro COPYB_4x8 +#endif lxvw4x vs5, o0, BO @@ -5993,10 +7277,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs54, o48, BBO addi BBO, BBO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`COPYB_1x8', ` +#else .macro COPYB_1x8 +#endif lxvw4x vs5, o0, BO @@ -6026,5 +7318,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs14, o48, BBO addi BBO, BBO, 64 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/sgemm_tcopy_macros_16_power8.S b/kernel/power/sgemm_tcopy_macros_16_power8.S index 53f9c8b82..ed592a604 100644 --- a/kernel/power/sgemm_tcopy_macros_16_power8.S +++ b/kernel/power/sgemm_tcopy_macros_16_power8.S @@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* Macros for N=4 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x16', ` +#else .macro COPY_4x16 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -88,13 +92,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs46, o32, T1 stxvw4x vs47, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x8', ` +#else .macro COPY_4x8 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -124,13 +136,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs38, o32, T1 stxvw4x vs39, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x4', ` +#else .macro COPY_4x4 +#endif lxvw4x vs32, o0, A0 @@ -150,13 +170,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x2', ` +#else .macro COPY_4x2 +#endif lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 @@ -190,13 +218,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs38, o0, T1 stxsspx vs39, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x1', ` +#else .macro COPY_4x1 +#endif lxsspx vs32, o0, A0 @@ -218,13 +254,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs35, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x16', ` +#else .macro COPY_2x16 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -250,13 +294,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs38, o32, T1 stxvw4x vs39, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x8', ` +#else .macro COPY_2x8 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -272,13 +324,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stxvw4x vs34, o32, T1 stxvw4x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x4', ` +#else .macro COPY_2x4 +#endif lxvw4x vs32, o0, A0 @@ -290,13 +350,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs33, o16, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x2', ` +#else .macro COPY_2x2 +#endif lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 @@ -314,13 +382,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs34, o0, T1 stxsspx vs35, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x1', ` +#else .macro COPY_2x1 +#endif lxsspx vs32, o0, A0 @@ -332,13 +408,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs33, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x16', ` +#else .macro COPY_1x16 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -352,13 +436,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs34, o32, T1 stxvw4x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x8', ` +#else .macro COPY_1x8 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -368,13 +460,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs32, o0, T1 stxvw4x vs33, o16, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x4', ` +#else .macro COPY_1x4 +#endif lxvw4x vs32, o0, A0 @@ -382,13 +482,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs32, o0, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x2', ` +#else .macro COPY_1x2 +#endif lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 @@ -398,13 +506,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stxsspx vs32, o0, T1 stxsspx vs33, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x1', ` +#else .macro COPY_1x1 +#endif lxsspx vs32, o0, A0 @@ -412,5 +528,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs32, o0, T1 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/sgemm_tcopy_macros_8_power8.S b/kernel/power/sgemm_tcopy_macros_8_power8.S index 1b71d5bb3..f80f095dc 100644 --- a/kernel/power/sgemm_tcopy_macros_8_power8.S +++ b/kernel/power/sgemm_tcopy_macros_8_power8.S @@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=4 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x8', ` +#else .macro COPY_4x8 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -68,13 +72,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs38, o32, T1 stxvw4x vs39, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x4', ` +#else .macro COPY_4x4 +#endif lxvw4x vs32, o0, A0 @@ -94,13 +106,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x2', ` +#else .macro COPY_4x2 +#endif lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 @@ -134,13 +154,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs38, o0, T1 stxsspx vs39, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x1', ` +#else .macro COPY_4x1 +#endif lxsspx vs32, o0, A0 @@ -162,13 +190,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs35, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x8', ` +#else .macro COPY_2x8 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -184,13 +220,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stxvw4x vs34, o32, T1 stxvw4x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x4', ` +#else .macro COPY_2x4 +#endif lxvw4x vs32, o0, A0 @@ -202,13 +246,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs33, o16, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x2', ` +#else .macro COPY_2x2 +#endif lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 @@ -226,13 +278,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs34, o0, T1 stxsspx vs35, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x1', ` +#else .macro COPY_2x1 +#endif lxsspx vs32, o0, A0 @@ -244,13 +304,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs33, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x8', ` +#else .macro COPY_1x8 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -260,13 +328,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs32, o0, T1 stxvw4x vs33, o16, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x4', ` +#else .macro COPY_1x4 +#endif lxvw4x vs32, o0, A0 @@ -274,13 +350,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs32, o0, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x2', ` +#else .macro COPY_1x2 +#endif lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 @@ -290,13 +374,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs32, o0, T1 stxsspx vs33, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x1', ` +#else .macro COPY_1x1 +#endif lxsspx vs32, o0, A0 @@ -304,5 +396,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stxsspx vs32, o0, T1 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/srot_microk_power8.c b/kernel/power/srot_microk_power8.c index 6eecb60a1..329a8cd06 100644 --- a/kernel/power/srot_microk_power8.c +++ b/kernel/power/srot_microk_power8.c @@ -71,10 +71,10 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) "addi %4, %4, 64 \n\t" "addic. %2, %2, -16 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvmulsp 40, 32, 36 \n\t" // c * x "xvmulsp 41, 33, 36 \n\t" @@ -138,9 +138,9 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) "addi %4, %4, 128 \n\t" "addic. %2, %2, -16 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvmulsp 40, 32, 36 \n\t" // c * x "xvmulsp 41, 33, 36 \n\t" diff --git a/kernel/power/sscal_microk_power8.c b/kernel/power/sscal_microk_power8.c index 058ff3399..88fba3166 100644 --- a/kernel/power/sscal_microk_power8.c +++ b/kernel/power/sscal_microk_power8.c @@ -56,10 +56,10 @@ static void sscal_kernel_16 (long n, float *x, float alpha) "addi %2, %2, 128 \n\t" "addic. %1, %1, -32 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvmulsp 40, 32, %x3 \n\t" "xvmulsp 41, 33, %x3 \n\t" @@ -92,9 +92,9 @@ static void sscal_kernel_16 (long n, float *x, float alpha) "addi %2, %2, 256 \n\t" "addic. %1, %1, -32 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvmulsp 40, 32, %x3 \n\t" "xvmulsp 41, 33, %x3 \n\t" @@ -147,8 +147,8 @@ static void sscal_kernel_16_zero (long n, float *x) ( "xxlxor %x3, %x3, %x3 \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "stxvd2x %x3, 0, %2 \n\t" "stxvd2x %x3, %4, %2 \n\t" @@ -162,7 +162,7 @@ static void sscal_kernel_16_zero (long n, float *x) "addi %2, %2, 128 \n\t" "addic. %1, %1, -32 \n\t" - "bgt 1b \n" + "bgt one%= \n" "#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10" : diff --git a/kernel/power/sswap_microk_power8.c b/kernel/power/sswap_microk_power8.c index cfefdd6ef..a407018a8 100644 --- a/kernel/power/sswap_microk_power8.c +++ b/kernel/power/sswap_microk_power8.c @@ -39,8 +39,8 @@ static void sswap_kernel_32 (long n, float *x, float *y) { __asm__ ( - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "lxvd2x 32, 0, %4 \n\t" "lxvd2x 33, %5, %4 \n\t" @@ -83,7 +83,7 @@ static void sswap_kernel_32 (long n, float *x, float *y) "addi %4, %4, 128 \n\t" "addic. %2, %2, -32 \n\t" - "bgt 1b \n" + "bgt one%= \n" "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" : diff --git a/kernel/power/strmm_macros_16x8_power8.S b/kernel/power/strmm_macros_16x8_power8.S index 27bc1e89c..6c016d6fa 100644 --- a/kernel/power/strmm_macros_16x8_power8.S +++ b/kernel/power/strmm_macros_16x8_power8.S @@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=8 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD8x16_1', ` +#else .macro LOAD8x16_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -63,9 +67,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_I1', ` +#else .macro KERNEL8x16_I1 +#endif lxvw4x vs4, o0, AO @@ -133,9 +145,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs63, vs3, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_1', ` +#else .macro KERNEL8x16_1 +#endif lxvw4x vs4, o0, AO @@ -203,9 +223,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs3, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_2', ` +#else .macro KERNEL8x16_2 +#endif lxvw4x vs0, o0, AO @@ -273,9 +301,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs7, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_E2', ` +#else .macro KERNEL8x16_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -319,9 +355,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs7, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_SUBI1', ` +#else .macro KERNEL8x16_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -389,9 +433,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs63, vs3, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_SUB1', ` +#else .macro KERNEL8x16_SUB1 +#endif lxvw4x vs0, o0, AO @@ -459,9 +511,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs3, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE8x16', ` +#else .macro SAVE8x16 +#endif mr T1, CO @@ -698,14 +758,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=8 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD8x8_1', ` +#else .macro LOAD8x8_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -728,9 +796,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_I1', ` +#else .macro KERNEL8x8_I1 +#endif lxvw4x vs4, o0, AO @@ -780,9 +856,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_1', ` +#else .macro KERNEL8x8_1 +#endif lxvw4x vs4, o0, AO @@ -832,9 +916,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_2', ` +#else .macro KERNEL8x8_2 +#endif lxvw4x vs0, o0, AO @@ -884,9 +976,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs5, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_E2', ` +#else .macro KERNEL8x8_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -914,9 +1014,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs5, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_SUBI1', ` +#else .macro KERNEL8x8_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -966,9 +1074,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_SUB1', ` +#else .macro KERNEL8x8_SUB1 +#endif lxvw4x vs0, o0, AO @@ -1018,9 +1134,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE8x8', ` +#else .macro SAVE8x8 +#endif mr T1, CO @@ -1193,14 +1317,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=8 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD8x4_1', ` +#else .macro LOAD8x4_1 +#endif lxvw4x vs0, o0, AO @@ -1222,9 +1354,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_I1', ` +#else .macro KERNEL8x4_I1 +#endif lxvw4x vs4, o0, AO @@ -1265,9 +1405,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_1', ` +#else .macro KERNEL8x4_1 +#endif lxvw4x vs4, o0, AO @@ -1308,9 +1456,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_2', ` +#else .macro KERNEL8x4_2 +#endif lxvw4x vs0, o0, AO @@ -1351,9 +1507,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs4, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_E2', ` +#else .macro KERNEL8x4_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -1373,9 +1537,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs4, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_SUBI1', ` +#else .macro KERNEL8x4_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -1416,9 +1588,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_SUB1', ` +#else .macro KERNEL8x4_SUB1 +#endif lxvw4x vs0, o0, AO @@ -1459,9 +1639,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE8x4', ` +#else .macro SAVE8x4 +#endif mr T1, CO @@ -1602,14 +1790,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=8 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD8x2_1', ` +#else .macro LOAD8x2_1 +#endif lxsspx vs0, o0, AO lxsspx vs1, o4, AO @@ -1632,9 +1828,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_I1', ` +#else .macro KERNEL8x2_I1 +#endif lxsspx vs4, o0, AO @@ -1684,9 +1888,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_1', ` +#else .macro KERNEL8x2_1 +#endif lxsspx vs4, o0, AO @@ -1736,9 +1948,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_2', ` +#else .macro KERNEL8x2_2 +#endif lxsspx vs0, o0, AO @@ -1788,9 +2008,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs5, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_E2', ` +#else .macro KERNEL8x2_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -1818,9 +2046,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs5, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_SUBI1', ` +#else .macro KERNEL8x2_SUBI1 +#endif lxsspx vs0, o0, AO @@ -1870,9 +2106,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_SUB1', ` +#else .macro KERNEL8x2_SUB1 +#endif lxsspx vs0, o0, AO @@ -1922,9 +2166,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE8x2', ` +#else .macro SAVE8x2 +#endif mr T1, CO @@ -2097,14 +2349,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=8 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD8x1_1', ` +#else .macro LOAD8x1_1 +#endif lxsspx vs0, o0, AO @@ -2126,9 +2386,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_I1', ` +#else .macro KERNEL8x1_I1 +#endif lxsspx vs4, o0, AO @@ -2169,9 +2437,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_1', ` +#else .macro KERNEL8x1_1 +#endif lxsspx vs4, o0, AO @@ -2212,9 +2488,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_2', ` +#else .macro KERNEL8x1_2 +#endif lxsspx vs0, o0, AO @@ -2255,9 +2539,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs4, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_E2', ` +#else .macro KERNEL8x1_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -2277,9 +2569,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmaddadp vs39, vs4, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_SUBI1', ` +#else .macro KERNEL8x1_SUBI1 +#endif lxsspx vs0, o0, AO @@ -2320,9 +2620,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_SUB1', ` +#else .macro KERNEL8x1_SUB1 +#endif lxsspx vs0, o0, AO @@ -2363,9 +2671,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE8x1', ` +#else .macro SAVE8x1 +#endif mr T1, CO @@ -2506,14 +2822,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 4 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x16_1', ` +#else .macro LOAD4x16_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -2531,9 +2855,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_I1', ` +#else .macro KERNEL4x16_I1 +#endif lxvw4x vs4, o0, AO @@ -2574,9 +2906,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs3, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_1', ` +#else .macro KERNEL4x16_1 +#endif lxvw4x vs4, o0, AO @@ -2617,9 +2957,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs3, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_2', ` +#else .macro KERNEL4x16_2 +#endif lxvw4x vs0, o0, AO @@ -2660,9 +3008,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs7, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_E2', ` +#else .macro KERNEL4x16_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -2686,9 +3042,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs7, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_SUBI1', ` +#else .macro KERNEL4x16_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -2729,9 +3093,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs3, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_SUB1', ` +#else .macro KERNEL4x16_SUB1 +#endif lxvw4x vs0, o0, AO @@ -2772,9 +3144,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs3, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x16', ` +#else .macro SAVE4x16 +#endif mr T1, CO @@ -2895,14 +3275,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x8_1', ` +#else .macro LOAD4x8_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -2918,9 +3306,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_I1', ` +#else .macro KERNEL4x8_I1 +#endif lxvw4x vs4, o0, AO @@ -2951,9 +3347,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_1', ` +#else .macro KERNEL4x8_1 +#endif lxvw4x vs4, o0, AO @@ -2984,9 +3388,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_2', ` +#else .macro KERNEL4x8_2 +#endif lxvw4x vs0, o0, AO @@ -3017,9 +3429,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs5, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_E2', ` +#else .macro KERNEL4x8_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -3035,9 +3455,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs5, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUBI1', ` +#else .macro KERNEL4x8_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -3068,9 +3496,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUB1', ` +#else .macro KERNEL4x8_SUB1 +#endif lxvw4x vs0, o0, AO @@ -3101,9 +3537,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x8', ` +#else .macro SAVE4x8 +#endif mr T1, CO @@ -3192,14 +3636,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x4_1', ` +#else .macro LOAD4x4_1 +#endif lxvw4x vs0, o0, AO @@ -3214,9 +3666,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_I1', ` +#else .macro KERNEL4x4_I1 +#endif lxvw4x vs4, o0, AO @@ -3242,9 +3702,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_1', ` +#else .macro KERNEL4x4_1 +#endif lxvw4x vs4, o0, AO @@ -3270,9 +3738,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_2', ` +#else .macro KERNEL4x4_2 +#endif lxvw4x vs0, o0, AO @@ -3298,9 +3774,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs4, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_E2', ` +#else .macro KERNEL4x4_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -3312,9 +3796,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs4, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUBI1', ` +#else .macro KERNEL4x4_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -3340,9 +3832,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUB1', ` +#else .macro KERNEL4x4_SUB1 +#endif lxvw4x vs0, o0, AO @@ -3368,9 +3868,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x4', ` +#else .macro SAVE4x4 +#endif mr T1, CO @@ -3443,14 +3951,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x2_1', ` +#else .macro LOAD4x2_1 +#endif lxsspx vs0, o0, AO lxsspx vs1, o4, AO @@ -3466,9 +3982,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_I1', ` +#else .macro KERNEL4x2_I1 +#endif lxsspx vs4, o0, AO @@ -3499,9 +4023,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_1', ` +#else .macro KERNEL4x2_1 +#endif lxsspx vs4, o0, AO @@ -3532,9 +4064,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_2', ` +#else .macro KERNEL4x2_2 +#endif lxsspx vs0, o0, AO @@ -3565,9 +4105,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs5, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_E2', ` +#else .macro KERNEL4x2_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -3583,9 +4131,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs5, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUBI1', ` +#else .macro KERNEL4x2_SUBI1 +#endif lxsspx vs0, o0, AO @@ -3616,9 +4172,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUB1', ` +#else .macro KERNEL4x2_SUB1 +#endif lxsspx vs0, o0, AO @@ -3649,9 +4213,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmaddadp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x2', ` +#else .macro SAVE4x2 +#endif mr T1, CO @@ -3740,14 +4312,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x1_1', ` +#else .macro LOAD4x1_1 +#endif lxsspx vs0, o0, AO @@ -3762,9 +4342,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_I1', ` +#else .macro KERNEL4x1_I1 +#endif lxsspx vs4, o0, AO @@ -3790,9 +4378,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_1', ` +#else .macro KERNEL4x1_1 +#endif lxsspx vs4, o0, AO @@ -3818,9 +4414,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_2', ` +#else .macro KERNEL4x1_2 +#endif lxsspx vs0, o0, AO @@ -3846,9 +4450,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs4, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_E2', ` +#else .macro KERNEL4x1_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -3860,9 +4472,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs4, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUBI1', ` +#else .macro KERNEL4x1_SUBI1 +#endif lxsspx vs0, o0, AO @@ -3888,9 +4508,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUB1', ` +#else .macro KERNEL4x1_SUB1 +#endif lxsspx vs0, o0, AO @@ -3916,9 +4544,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x1', ` +#else .macro SAVE4x1 +#endif mr T1, CO @@ -3991,14 +4627,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 4 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x16_1', ` +#else .macro LOAD2x16_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -4014,9 +4658,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_I1', ` +#else .macro KERNEL2x16_I1 +#endif lxvw4x vs4, o0, AO @@ -4045,9 +4697,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs3, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_1', ` +#else .macro KERNEL2x16_1 +#endif lxvw4x vs4, o0, AO @@ -4076,9 +4736,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs39, vs3, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_2', ` +#else .macro KERNEL2x16_2 +#endif lxvw4x vs0, o0, AO @@ -4107,9 +4775,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs7, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_E2', ` +#else .macro KERNEL2x16_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -4123,9 +4799,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs7, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_SUBI1', ` +#else .macro KERNEL2x16_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -4154,9 +4838,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs3, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_SUB1', ` +#else .macro KERNEL2x16_SUB1 +#endif lxvw4x vs0, o0, AO @@ -4185,9 +4877,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs3, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x16', ` +#else .macro SAVE2x16 +#endif mr T1, CO @@ -4250,14 +4950,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x8_1', ` +#else .macro LOAD2x8_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -4271,9 +4979,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_I1', ` +#else .macro KERNEL2x8_I1 +#endif lxvw4x vs4, o0, AO @@ -4296,9 +5012,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_1', ` +#else .macro KERNEL2x8_1 +#endif lxvw4x vs4, o0, AO @@ -4321,9 +5045,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_2', ` +#else .macro KERNEL2x8_2 +#endif lxvw4x vs0, o0, AO @@ -4346,9 +5078,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs5, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_E2', ` +#else .macro KERNEL2x8_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -4358,9 +5098,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs5, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUBI1', ` +#else .macro KERNEL2x8_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -4383,9 +5131,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUB1', ` +#else .macro KERNEL2x8_SUB1 +#endif lxvw4x vs0, o0, AO @@ -4408,9 +5164,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x8', ` +#else .macro SAVE2x8 +#endif mr T1, CO @@ -4457,14 +5221,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x4_1', ` +#else .macro LOAD2x4_1 +#endif lxvw4x vs0, o0, AO @@ -4477,9 +5249,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_I1', ` +#else .macro KERNEL2x4_I1 +#endif lxvw4x vs4, o0, AO @@ -4499,9 +5279,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_1', ` +#else .macro KERNEL2x4_1 +#endif lxvw4x vs4, o0, AO @@ -4521,9 +5309,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_2', ` +#else .macro KERNEL2x4_2 +#endif lxvw4x vs0, o0, AO @@ -4543,9 +5339,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs4, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_E2', ` +#else .macro KERNEL2x4_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -4553,9 +5357,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs4, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUBI1', ` +#else .macro KERNEL2x4_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -4575,9 +5387,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUB1', ` +#else .macro KERNEL2x4_SUB1 +#endif lxvw4x vs0, o0, AO @@ -4597,9 +5417,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x4', ` +#else .macro SAVE2x4 +#endif mr T1, CO @@ -4638,14 +5466,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x2_1', ` +#else .macro LOAD2x2_1 +#endif lxsspx vs0, o0, AO lxsspx vs1, o4, AO @@ -4659,9 +5495,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_I1', ` +#else .macro KERNEL2x2_I1 +#endif lxsspx vs4, o0, AO @@ -4684,9 +5528,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_1', ` +#else .macro KERNEL2x2_1 +#endif lxsspx vs4, o0, AO @@ -4709,9 +5561,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmaddadp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_2', ` +#else .macro KERNEL2x2_2 +#endif lxsspx vs0, o0, AO @@ -4734,9 +5594,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs5, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_E2', ` +#else .macro KERNEL2x2_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -4746,9 +5614,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs5, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUBI1', ` +#else .macro KERNEL2x2_SUBI1 +#endif lxsspx vs0, o0, AO @@ -4771,9 +5647,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUB1', ` +#else .macro KERNEL2x2_SUB1 +#endif lxsspx vs0, o0, AO @@ -4796,9 +5680,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x2', ` +#else .macro SAVE2x2 +#endif mr T1, CO @@ -4845,14 +5737,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x1_1', ` +#else .macro LOAD2x1_1 +#endif lxsspx vs0, o0, AO @@ -4865,9 +5765,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_I1', ` +#else .macro KERNEL2x1_I1 +#endif lxsspx vs4, o0, AO @@ -4887,9 +5795,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_1', ` +#else .macro KERNEL2x1_1 +#endif lxsspx vs4, o0, AO @@ -4909,9 +5825,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_2', ` +#else .macro KERNEL2x1_2 +#endif lxsspx vs0, o0, AO @@ -4931,9 +5855,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs4, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_E2', ` +#else .macro KERNEL2x1_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -4941,9 +5873,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs4, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUBI1', ` +#else .macro KERNEL2x1_SUBI1 +#endif lxsspx vs0, o0, AO @@ -4963,9 +5903,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUB1', ` +#else .macro KERNEL2x1_SUB1 +#endif lxsspx vs0, o0, AO @@ -4985,9 +5933,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmaddadp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x1', ` +#else .macro SAVE2x1 +#endif mr T1, CO @@ -5026,14 +5982,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 4 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x16_1', ` +#else .macro LOAD1x16_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -5048,9 +6012,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 4 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_I1', ` +#else .macro KERNEL1x16_I1 +#endif lxvw4x vs4, o0, AO @@ -5073,9 +6045,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs3, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_1', ` +#else .macro KERNEL1x16_1 +#endif lxvw4x vs4, o0, AO @@ -5098,9 +6078,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs3, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_2', ` +#else .macro KERNEL1x16_2 +#endif lxvw4x vs0, o0, AO @@ -5123,9 +6111,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs7, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_E2', ` +#else .macro KERNEL1x16_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -5134,9 +6130,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs7, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_SUBI1', ` +#else .macro KERNEL1x16_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -5159,9 +6163,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs3, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_SUB1', ` +#else .macro KERNEL1x16_SUB1 +#endif lxvw4x vs0, o0, AO @@ -5184,9 +6196,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs3, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x16', ` +#else .macro SAVE1x16 +#endif mr T1, CO @@ -5220,14 +6240,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x8_1', ` +#else .macro LOAD1x8_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -5240,9 +6268,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 4 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_I1', ` +#else .macro KERNEL1x8_I1 +#endif lxvw4x vs4, o0, AO @@ -5261,9 +6297,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_1', ` +#else .macro KERNEL1x8_1 +#endif lxvw4x vs4, o0, AO @@ -5282,9 +6326,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_2', ` +#else .macro KERNEL1x8_2 +#endif lxvw4x vs0, o0, AO @@ -5303,18 +6355,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs5, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_E2', ` +#else .macro KERNEL1x8_E2 +#endif xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUBI1', ` +#else .macro KERNEL1x8_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -5333,9 +6401,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUB1', ` +#else .macro KERNEL1x8_SUB1 +#endif lxvw4x vs0, o0, AO @@ -5354,9 +6430,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x8', ` +#else .macro SAVE1x8 +#endif mr T1, CO @@ -5382,14 +6466,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x4_1', ` +#else .macro LOAD1x4_1 +#endif lxvw4x vs0, o0, AO @@ -5401,9 +6493,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 4 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_I1', ` +#else .macro KERNEL1x4_I1 +#endif lxvw4x vs4, o0, AO @@ -5420,9 +6520,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_1', ` +#else .macro KERNEL1x4_1 +#endif lxvw4x vs4, o0, AO @@ -5439,9 +6547,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_2', ` +#else .macro KERNEL1x4_2 +#endif lxvw4x vs0, o0, AO @@ -5458,17 +6574,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs32, vs4, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_E2', ` +#else .macro KERNEL1x4_E2 +#endif xvmaddasp vs32, vs4, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUBI1', ` +#else .macro KERNEL1x4_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -5485,9 +6617,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUB1', ` +#else .macro KERNEL1x4_SUB1 +#endif lxvw4x vs0, o0, AO @@ -5504,9 +6644,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x4', ` +#else .macro SAVE1x4 +#endif mr T1, CO @@ -5528,14 +6676,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x2_1', ` +#else .macro LOAD1x2_1 +#endif lxsspx vs0, o0, AO lxsspx vs1, o4, AO @@ -5548,9 +6704,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 4 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_I1', ` +#else .macro KERNEL1x2_I1 +#endif lxsspx vs4, o0, AO @@ -5569,9 +6733,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_1', ` +#else .macro KERNEL1x2_1 +#endif lxsspx vs4, o0, AO @@ -5590,9 +6762,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_2', ` +#else .macro KERNEL1x2_2 +#endif lxsspx vs0, o0, AO @@ -5611,18 +6791,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs5, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_E2', ` +#else .macro KERNEL1x2_E2 +#endif xsmaddadp vs32, vs4, vs16 xsmaddadp vs33, vs5, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUBI1', ` +#else .macro KERNEL1x2_SUBI1 +#endif lxsspx vs0, o0, AO @@ -5641,9 +6837,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUB1', ` +#else .macro KERNEL1x2_SUB1 +#endif lxsspx vs0, o0, AO @@ -5662,9 +6866,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x2', ` +#else .macro SAVE1x2 +#endif mr T1, CO @@ -5690,14 +6902,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x1_1', ` +#else .macro LOAD1x1_1 +#endif lxsspx vs0, o0, AO @@ -5709,9 +6929,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 4 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_I1', ` +#else .macro KERNEL1x1_I1 +#endif lxsspx vs4, o0, AO @@ -5728,9 +6956,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_1', ` +#else .macro KERNEL1x1_1 +#endif lxsspx vs4, o0, AO @@ -5747,9 +6983,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmaddadp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_2', ` +#else .macro KERNEL1x1_2 +#endif lxsspx vs0, o0, AO @@ -5766,17 +7010,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs32, vs4, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_E2', ` +#else .macro KERNEL1x1_E2 +#endif xsmaddadp vs32, vs4, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUBI1', ` +#else .macro KERNEL1x1_SUBI1 +#endif lxsspx vs0, o0, AO @@ -5793,9 +7053,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUB1', ` +#else .macro KERNEL1x1_SUB1 +#endif lxsspx vs0, o0, AO @@ -5812,9 +7080,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x1', ` +#else .macro SAVE1x1 +#endif mr T1, CO @@ -5836,5 +7112,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 4 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/zasum_microk_power8.c b/kernel/power/zasum_microk_power8.c index 82366902d..3f0af4232 100644 --- a/kernel/power/zasum_microk_power8.c +++ b/kernel/power/zasum_microk_power8.c @@ -68,10 +68,10 @@ static double zasum_kernel_8 (long n, double *x) "addi %2, %2, 128 \n\t" "addic. %1, %1, -8 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvabsdp 48, 40 \n\t" "xvabsdp 49, 41 \n\t" @@ -108,9 +108,9 @@ static double zasum_kernel_8 (long n, double *x) "xvadddp 38, 38, %x5 \n\t" "xvadddp 39, 39, %x6 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvabsdp 48, 40 \n\t" "xvabsdp 49, 41 \n\t" @@ -140,7 +140,7 @@ static double zasum_kernel_8 (long n, double *x) "xvadddp 32, 32, 36 \n\t" - "xxswapd 33, 32 \n\t" + XXSWAPD_S(33,32) "xsadddp %x0, 32, 33 \n" "#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n" diff --git a/kernel/power/zaxpy_microk_power8.c b/kernel/power/zaxpy_microk_power8.c index 124614f62..959050e5f 100644 --- a/kernel/power/zaxpy_microk_power8.c +++ b/kernel/power/zaxpy_microk_power8.c @@ -61,8 +61,8 @@ static void zaxpy_kernel_4 (long n, double *x, double *y, __asm__ ( - "xxspltd 32, %x19, 0 \n\t" // alpha_r - "xxspltd 33, %x20, 0 \n\t" // alpha_i + XXSPLTD_S(32,%x19,0) // alpha_r + XXSPLTD_S(33,%x20,0) // alpha_i "lxvd2x 36, 0, %21 \n\t" // mvec @@ -87,10 +87,10 @@ static void zaxpy_kernel_4 (long n, double *x, double *y, "lxvd2x 50, %23, %3 \n\t" // y2 "lxvd2x 51, %24, %3 \n\t" // y3 - "xxswapd %x8, 40 \n\t" // exchange real and imag part - "xxswapd %x9, 41 \n\t" // exchange real and imag part - "xxswapd %x10, 42 \n\t" // exchange real and imag part - "xxswapd %x11, 43 \n\t" // exchange real and imag part + XXSWAPD_S(%x8,40) // exchange real and imag part + XXSWAPD_S(%x9,41) // exchange real and imag part + XXSWAPD_S(%x10,42) // exchange real and imag part + XXSWAPD_S(%x11,43) // exchange real and imag part "addi %2, %2, 64 \n\t" "addi %3, %3, 64 \n\t" @@ -105,19 +105,19 @@ static void zaxpy_kernel_4 (long n, double *x, double *y, "lxvd2x %x6, %23, %3 \n\t" // y6 "lxvd2x %x7, %24, %3 \n\t" // y7 - "xxswapd %x12, 44 \n\t" // exchange real and imag part - "xxswapd %x13, 45 \n\t" // exchange real and imag part - "xxswapd %x14, 46 
\n\t" // exchange real and imag part - "xxswapd %x15, 47 \n\t" // exchange real and imag part + XXSWAPD_S(%x12,44) // exchange real and imag part + XXSWAPD_S(%x13,45) // exchange real and imag part + XXSWAPD_S(%x14,46) // exchange real and imag part + XXSWAPD_S(%x15,47) // exchange real and imag part "addi %2, %2, 64 \n\t" "addi %3, %3, 64 \n\t" "addic. %1, %1, -8 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i "xvmaddadp 49, 41, 32 \n\t" @@ -163,31 +163,31 @@ static void zaxpy_kernel_4 (long n, double *x, double *y, "addi %16, %16, 64 \n\t" - "xxswapd %x8, 40 \n\t" // exchange real and imag part - "xxswapd %x9, 41 \n\t" // exchange real and imag part + XXSWAPD_S(%x8,40) // exchange real and imag part + XXSWAPD_S(%x9,41) // exchange real and imag part "lxvd2x 48, 0, %3 \n\t" // y0 "lxvd2x 49, %22, %3 \n\t" // y1 - "xxswapd %x10, 42 \n\t" // exchange real and imag part - "xxswapd %x11, 43 \n\t" // exchange real and imag part + XXSWAPD_S(%x10,42) // exchange real and imag part + XXSWAPD_S(%x11,43) // exchange real and imag part "lxvd2x 50, %23, %3 \n\t" // y2 "lxvd2x 51, %24, %3 \n\t" // y3 - "xxswapd %x12, 44 \n\t" // exchange real and imag part + XXSWAPD_S(%x12,44) // exchange real and imag part "addi %3, %3, 64 \n\t" - "xxswapd %x13, 45 \n\t" // exchange real and imag part + XXSWAPD_S(%x13,45) // exchange real and imag part "lxvd2x %x4, 0, %3 \n\t" // y4 "lxvd2x %x5, %22, %3 \n\t" // y5 - "xxswapd %x14, 46 \n\t" // exchange real and imag part - "xxswapd %x15, 47 \n\t" // exchange real and imag part + XXSWAPD_S(%x14,46) // exchange real and imag part + XXSWAPD_S(%x15,47) // exchange real and imag part "lxvd2x %x6, %23, %3 \n\t" // y6 "lxvd2x %x7, %24, %3 \n\t" // y7 "addi %3, %3, 64 \n\t" "addic. %1, %1, -8 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i "xvmaddadp 49, 41, 32 \n\t" diff --git a/kernel/power/zcopy_microk_power8.c b/kernel/power/zcopy_microk_power8.c index 5ca34b633..e29547047 100644 --- a/kernel/power/zcopy_microk_power8.c +++ b/kernel/power/zcopy_microk_power8.c @@ -62,10 +62,10 @@ static void zcopy_kernel_16 (long n, FLOAT *x, FLOAT *y) "addi %2, %2, 128 \n\t" "addic. %1, %1, -16 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "stxvd2x 32, 0, %3 \n\t" "stxvd2x 33, %5, %3 \n\t" @@ -108,9 +108,9 @@ static void zcopy_kernel_16 (long n, FLOAT *x, FLOAT *y) "addi %2, %2, 128 \n\t" "addic. 
%1, %1, -16 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "stxvd2x 32, 0, %3 \n\t" "stxvd2x 33, %5, %3 \n\t" diff --git a/kernel/power/zdot_microk_power8.c b/kernel/power/zdot_microk_power8.c index 71078b66c..dcde82433 100644 --- a/kernel/power/zdot_microk_power8.c +++ b/kernel/power/zdot_microk_power8.c @@ -60,10 +60,10 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot) "lxvd2x 43, %9, %2 \n\t" // x3_r, x3_i "lxvd2x 51, %9, %3 \n\t" // y3_r, y3_i - "xxswapd 0, 48 \n\t" // y0_i, y0_r - "xxswapd 1, 49 \n\t" // y1_i, y1_r - "xxswapd 2, 50 \n\t" // y2_i, y2_r - "xxswapd 3, 51 \n\t" // y3_i, y3_r + XXSWAPD_S(0,48) // y0_i, y0_r + XXSWAPD_S(1,49) // y1_i, y1_r + XXSWAPD_S(2,50) // y2_i, y2_r + XXSWAPD_S(3,51) // y3_i, y3_r "addi %2, %2, 64 \n\t" "addi %3, %3, 64 \n\t" @@ -77,19 +77,19 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot) "lxvd2x 47, %9, %2 \n\t" // x3_r, x3_i "lxvd2x 7, %9, %3 \n\t" // y3_r, y3_i - "xxswapd 8, 4 \n\t" // y0_i, y0_r - "xxswapd 9, 5 \n\t" // y1_i, y1_r - "xxswapd 10, 6 \n\t" // y2_i, y2_r - "xxswapd 11, 7 \n\t" // y3_i, y3_r + XXSWAPD_S(8,4) // y0_i, y0_r + XXSWAPD_S(9,5) // y1_i, y1_r + XXSWAPD_S(10,6) // y2_i, y2_r + XXSWAPD_S(11,7) // y3_i, y3_r "addi %2, %2, 64 \n\t" "addi %3, %3, 64 \n\t" "addic. %1, %1, -8 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i "lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i @@ -111,14 +111,14 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot) "xvmaddadp 39, 43, 3 \n\t" // x3_r * y3_i , x3_i * y3_r "lxvd2x 43, %9, %2 \n\t" // x3_r, x3_i - "xxswapd 0,48 \n\t" // y0_i, y0_r - "xxswapd 1,49 \n\t" // y1_i, y1_r + XXSWAPD_S(0,48) // y0_i, y0_r + XXSWAPD_S(1,49) // y1_i, y1_r "addi %2, %2, 64 \n\t" "addi %3, %3, 64 \n\t" - "xxswapd 2,50 \n\t" // y2_i, y2_r - "xxswapd 3,51 \n\t" // y3_i, y3_r + XXSWAPD_S(2,50) // y2_i, y2_r + XXSWAPD_S(3,51) // y3_i, y3_r "xvmaddadp 32, 44, 4 \n\t" // x0_r * y0_r , x0_i * y0_i "lxvd2x 4, 0, %3 \n\t" // y0_r, y0_i @@ -138,19 +138,19 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot) "xvmaddadp 39, 47, 11 \n\t" // x3_r * y3_i , x3_i * y3_r "lxvd2x 47, %9, %2 \n\t" // x3_r, x3_i - "xxswapd 8,4 \n\t" // y0_i, y0_r - "xxswapd 9,5 \n\t" // y1_i, y1_r + XXSWAPD_S(8,4) // y0_i, y0_r + XXSWAPD_S(9,5) // y1_i, y1_r "addi %2, %2, 64 \n\t" "addi %3, %3, 64 \n\t" - "xxswapd 10,6 \n\t" // y2_i, y2_r - "xxswapd 11,7 \n\t" // y3_i, y3_r + XXSWAPD_S(10,6) // y2_i, y2_r + XXSWAPD_S(11,7) // y3_i, y3_r "addic. %1, %1, -8 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i "xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i diff --git a/kernel/power/zgemm_macros_8x2_power8.S b/kernel/power/zgemm_macros_8x2_power8.S index c43a115b2..24a36470c 100644 --- a/kernel/power/zgemm_macros_8x2_power8.S +++ b/kernel/power/zgemm_macros_8x2_power8.S @@ -67,7 +67,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x8_1', ` +#else .macro LOAD2x8_1 +#endif lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B @@ -91,9 +95,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_I1', ` +#else .macro KERNEL2x8_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -151,9 +163,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs63, vs7, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_1', ` +#else .macro KERNEL2x8_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -211,9 +231,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_2', ` +#else .macro KERNEL2x8_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -271,9 +299,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_E2', ` +#else .macro KERNEL2x8_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -311,9 +347,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUBI1', ` +#else .macro KERNEL2x8_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -371,9 +415,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs63, vs7, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUB1', ` +#else .macro KERNEL2x8_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -431,9 +483,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x8', ` +#else .macro SAVE2x8 +#endif mr T1, CO @@ -455,13 +515,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -479,13 +539,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -503,13 +563,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB - xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB @@ -527,13 +587,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB - xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB @@ -551,13 +611,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs41,vs41) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs40 // realA*realB XSFADD_R2 vs0, vs0, vs41 // imagA*imagB - xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs40,vs40) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs41,vs41) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs40 // realA*imagB XSFADD_I2 vs1, vs1, vs41 // imagA*realB @@ -575,13 +635,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs43,vs43) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs42 // realA*realB XSFADD_R2 vs0, vs0, vs43 // imagA*imagB - xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs42,vs42) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs43,vs43) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs42 // realA*imagB XSFADD_I2 vs1, vs1, vs43 // imagA*realB @@ -599,13 +659,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs45,vs45) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs44 // realA*realB XSFADD_R2 vs0, vs0, vs45 // imagA*imagB - xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs44,vs44) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs45,vs45) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs44 // realA*imagB XSFADD_I2 vs1, vs1, vs45 // imagA*realB @@ -623,13 +683,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs47,vs47) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs46 // realA*realB XSFADD_R2 vs0, vs0, vs47 // imagA*imagB - xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs46,vs46) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs47,vs47) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs46 // realA*imagB XSFADD_I2 vs1, vs1, vs47 // imagA*realB @@ -685,13 +745,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs49, vs49 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs49,vs49) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs48 // realA*realB XSFADD_R2 vs0, vs0, vs49 // imagA*imagB - xxswapd vs48, vs48 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs49, vs49 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs48,vs48) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs49,vs49) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs48 // realA*imagB XSFADD_I2 vs1, vs1, vs49 // imagA*realB @@ -709,13 +769,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs51, vs51 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs51,vs51) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs50 // realA*realB XSFADD_R2 vs0, vs0, vs51 // imagA*imagB - xxswapd vs50, vs50 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs51, vs51 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs50,vs50) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs51,vs51) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs50 // realA*imagB XSFADD_I2 vs1, vs1, vs51 // imagA*realB @@ -733,13 +793,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs53, vs53 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs53,vs53) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs52 // realA*realB XSFADD_R2 vs0, vs0, vs53 // imagA*imagB - xxswapd vs52, vs52 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs53, vs53 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs52,vs52) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs53,vs53) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs52 // realA*imagB XSFADD_I2 vs1, vs1, vs53 // imagA*realB @@ -757,13 +817,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs55, vs55 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs55,vs55) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs54 // realA*realB XSFADD_R2 vs0, vs0, vs55 // imagA*imagB - xxswapd vs54, vs54 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs55, vs55 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs54,vs54) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs55,vs55) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs54 // realA*imagB XSFADD_I2 vs1, vs1, vs55 // imagA*realB @@ -781,13 +841,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs57, vs57 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs57,vs57) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs56 // realA*realB XSFADD_R2 vs0, vs0, vs57 // imagA*imagB - xxswapd vs56, vs56 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs57, vs57 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs56,vs56) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs57,vs57) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs56 // realA*imagB XSFADD_I2 vs1, vs1, vs57 // imagA*realB @@ -805,13 +865,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs59, vs59 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs59,vs59) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs58 // realA*realB XSFADD_R2 vs0, vs0, vs59 // imagA*imagB - xxswapd vs58, vs58 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs59, vs59 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs58,vs58) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs59,vs59) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs58 // realA*imagB XSFADD_I2 vs1, vs1, vs59 // imagA*realB @@ -829,13 +889,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs61, vs61 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs61,vs61) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs60 // realA*realB XSFADD_R2 vs0, vs0, vs61 // imagA*imagB - xxswapd vs60, vs60 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs61, vs61 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs60,vs60) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs61,vs61) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs60 // realA*imagB XSFADD_I2 vs1, vs1, vs61 // imagA*realB @@ -853,13 +913,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs63, vs63 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs63,vs63) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs62 // realA*realB XSFADD_R2 vs0, vs0, vs63 // imagA*imagB - xxswapd vs62, vs62 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs63, vs63 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs62,vs62) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs63,vs63) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs62 // realA*imagB XSFADD_I2 vs1, vs1, vs63 // imagA*realB @@ -900,14 +960,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add T2, T2, LDC addi CO, CO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x4_1', ` +#else .macro LOAD2x4_1 +#endif lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B @@ -924,9 +992,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_I1', ` +#else .macro KERNEL2x4_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -961,9 +1037,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs47, vs3, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_1', ` +#else .macro KERNEL2x4_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -998,9 +1082,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_2', ` +#else .macro KERNEL2x4_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1035,9 +1127,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_E2', ` +#else .macro KERNEL2x4_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -1059,9 +1159,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUBI1', ` +#else .macro KERNEL2x4_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1096,9 +1204,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs47, vs3, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUB1', ` +#else .macro KERNEL2x4_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1133,9 +1249,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x4', ` +#else .macro SAVE2x4 +#endif mr T1, CO @@ -1152,13 +1276,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -1176,13 +1300,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -1200,13 +1324,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB - xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB @@ -1224,13 +1348,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB - xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB @@ -1273,13 +1397,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs41,vs41) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs40 // realA*realB XSFADD_R2 vs0, vs0, vs41 // imagA*imagB - xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs40,vs40) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs41,vs41) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs40 // realA*imagB XSFADD_I2 vs1, vs1, vs41 // imagA*realB @@ -1297,13 +1421,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs43,vs43) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs42 // realA*realB XSFADD_R2 vs0, vs0, vs43 // imagA*imagB - xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs42,vs42) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs43,vs43) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs42 // realA*imagB XSFADD_I2 vs1, vs1, vs43 // imagA*realB @@ -1321,13 +1445,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs45,vs45) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs44 // realA*realB XSFADD_R2 vs0, vs0, vs45 // imagA*imagB - xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs44,vs44) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs45,vs45) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs44 // realA*imagB XSFADD_I2 vs1, vs1, vs45 // imagA*realB @@ -1345,13 +1469,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs47,vs47) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs46 // realA*realB XSFADD_R2 vs0, vs0, vs47 // imagA*imagB - xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs46,vs46) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs47,vs47) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs46 // realA*imagB XSFADD_I2 vs1, vs1, vs47 // imagA*realB @@ -1383,14 +1507,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add T1, T1, LDC addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x2_1', ` +#else .macro LOAD2x2_1 +#endif lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B @@ -1405,9 +1537,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_I1', ` +#else .macro KERNEL2x2_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -1432,9 +1572,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs39, vs1, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_1', ` +#else .macro KERNEL2x2_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -1459,9 +1607,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_2', ` +#else .macro KERNEL2x2_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1486,9 +1642,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_E2', ` +#else .macro KERNEL2x2_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -1502,9 +1666,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUBI1', ` +#else .macro KERNEL2x2_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1529,9 +1701,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs39, vs1, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUB1', ` +#else .macro KERNEL2x2_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1556,9 +1736,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x2', ` +#else .macro SAVE2x2 +#endif mr T1, CO @@ -1573,13 +1761,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -1597,13 +1785,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -1640,13 +1828,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB - xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB @@ -1664,13 +1852,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB - xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB @@ -1698,14 +1886,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add T1, T1, LDC addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x1_1', ` +#else .macro LOAD2x1_1 +#endif lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B @@ -1719,9 +1915,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_I1', ` +#else .macro KERNEL2x1_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A @@ -1741,9 +1945,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs35, vs0, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_1', ` +#else .macro KERNEL2x1_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A @@ -1763,9 +1975,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_2', ` +#else .macro KERNEL2x1_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -1785,9 +2005,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_E2', ` +#else .macro KERNEL2x1_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -1797,9 +2025,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUBI1', ` +#else .macro KERNEL2x1_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -1819,9 +2055,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs35, vs0, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUB1', ` +#else .macro KERNEL2x1_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -1841,9 +2085,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x1', ` +#else .macro SAVE2x1 +#endif mr T1, CO @@ -1857,13 +2109,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -1897,13 +2149,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -1929,14 +2181,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add T1, T1, LDC addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x8_1', ` +#else .macro LOAD1x8_1 +#endif lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B @@ -1958,9 +2218,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_I1', ` +#else .macro KERNEL1x8_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -1999,9 +2267,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs47, vs7, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_1', ` +#else .macro KERNEL1x8_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -2040,9 +2316,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_2', ` +#else .macro KERNEL1x8_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2081,9 +2365,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_E2', ` +#else .macro KERNEL1x8_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -2104,9 +2396,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUBI1', ` +#else .macro KERNEL1x8_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2145,9 +2445,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs47, vs7, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUB1', ` +#else .macro KERNEL1x8_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2186,9 +2494,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x8', ` +#else .macro SAVE1x8 +#endif mr T1, CO @@ -2210,13 +2526,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -2234,13 +2550,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -2258,13 +2574,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB - xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB @@ -2282,13 +2598,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB - xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB @@ -2306,13 +2622,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs41,vs41) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs40 // realA*realB XSFADD_R2 vs0, vs0, vs41 // imagA*imagB - xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs40,vs40) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs41,vs41) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs40 // realA*imagB XSFADD_I2 vs1, vs1, vs41 // imagA*realB @@ -2330,13 +2646,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs43,vs43) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs42 // realA*realB XSFADD_R2 vs0, vs0, vs43 // imagA*imagB - xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs42,vs42) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs43,vs43) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs42 // realA*imagB XSFADD_I2 vs1, vs1, vs43 // imagA*realB @@ -2354,13 +2670,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs45,vs45) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs44 // realA*realB XSFADD_R2 vs0, vs0, vs45 // imagA*imagB - xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs44,vs44) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs45,vs45) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs44 // realA*imagB XSFADD_I2 vs1, vs1, vs45 // imagA*realB @@ -2378,13 +2694,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs47,vs47) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs46 // realA*realB XSFADD_R2 vs0, vs0, vs47 // imagA*imagB - xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs46,vs46) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs47,vs47) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs46 // realA*imagB XSFADD_I2 vs1, vs1, vs47 // imagA*realB @@ -2425,14 +2741,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add T2, T2, LDC addi CO, CO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x4_1', ` +#else .macro LOAD1x4_1 +#endif lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B @@ -2447,9 +2771,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_I1', ` +#else .macro KERNEL1x4_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -2473,9 +2805,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs39, vs3, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_1', ` +#else .macro KERNEL1x4_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -2499,9 +2839,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_2', ` +#else .macro KERNEL1x4_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2525,9 +2873,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_E2', ` +#else .macro KERNEL1x4_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -2540,9 +2896,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUBI1', ` +#else .macro KERNEL1x4_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2566,9 +2930,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs39, vs3, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUB1', ` +#else .macro KERNEL1x4_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2592,9 +2964,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x4', ` +#else .macro SAVE1x4 +#endif mr T1, CO @@ -2611,13 +2991,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -2635,13 +3015,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -2659,13 +3039,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB - xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB @@ -2683,13 +3063,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB - xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB @@ -2721,14 +3101,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add T1, T1, LDC addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x2_1', ` +#else .macro LOAD1x2_1 +#endif lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B @@ -2741,9 +3129,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_I1', ` +#else .macro KERNEL1x2_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -2761,9 +3157,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs35, vs1, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_1', ` +#else .macro KERNEL1x2_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -2781,9 +3185,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_2', ` +#else .macro KERNEL1x2_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2801,9 +3213,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_E2', ` +#else .macro KERNEL1x2_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -2812,9 +3232,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUBI1', ` +#else .macro KERNEL1x2_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2832,9 +3260,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs35, vs1, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUB1', ` +#else .macro KERNEL1x2_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2852,9 +3288,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x2', ` +#else .macro SAVE1x2 +#endif mr T1, CO @@ -2869,13 +3313,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -2893,13 +3337,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -2927,14 +3371,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add T1, T1, LDC addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x1_1', ` +#else .macro LOAD1x1_1 +#endif lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B @@ -2946,9 +3398,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_I1', ` +#else .macro KERNEL1x1_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A @@ -2963,9 +3423,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs33, vs0, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_1', ` +#else .macro KERNEL1x1_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A @@ -2980,9 +3448,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_2', ` +#else .macro KERNEL1x1_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -2997,18 +3473,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_E2', ` +#else .macro KERNEL1x1_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUBI1', ` +#else .macro KERNEL1x1_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -3023,9 +3515,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUB1', ` +#else .macro KERNEL1x1_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -3040,9 +3540,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x1', ` +#else .macro SAVE1x1 +#endif mr T1, CO @@ -3056,13 +3564,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -3088,11 +3596,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add T1, T1, LDC addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`ZCOPYB_1x1', ` +#else .macro ZCOPYB_1x1 +#endif lxvdsx vs4, o0, BO // b0_r lxvdsx vs5, o8, BO // b0_i @@ -3101,10 +3617,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs5, o16, BBO addi BBO, BBO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`ZCOPYB_8x1', ` +#else .macro ZCOPYB_8x1 +#endif lxvd2x vs32, o0, BO lxvd2x vs33, o16, BO @@ -3118,23 +3642,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvd2x vs39, o48, BO addi BO, BO, 64 - xxspltd vs40, vs32, 0 - xxspltd vs41, vs32, 1 - xxspltd vs42, vs33, 0 - xxspltd vs43, vs33, 1 - xxspltd vs44, vs34, 0 - xxspltd vs45, vs34, 1 - xxspltd vs46, vs35, 0 - xxspltd vs47, vs35, 1 + XXSPLTD(vs40,vs32,0) + XXSPLTD(vs41,vs32,1) + XXSPLTD(vs42,vs33,0) + XXSPLTD(vs43,vs33,1) + XXSPLTD(vs44,vs34,0) + XXSPLTD(vs45,vs34,1) + XXSPLTD(vs46,vs35,0) + XXSPLTD(vs47,vs35,1) - xxspltd vs48, vs36, 0 - xxspltd vs49, vs36, 1 - xxspltd vs50, vs37, 0 - xxspltd vs51, vs37, 1 - xxspltd vs52, vs38, 0 - xxspltd vs53, vs38, 1 - xxspltd vs54, vs39, 0 - xxspltd vs55, vs39, 1 + XXSPLTD(vs48,vs36,0) + XXSPLTD(vs49,vs36,1) + XXSPLTD(vs50,vs37,0) + XXSPLTD(vs51,vs37,1) + XXSPLTD(vs52,vs38,0) + XXSPLTD(vs53,vs38,1) + XXSPLTD(vs54,vs39,0) + XXSPLTD(vs55,vs39,1) stxvd2x vs40, o0, BBO stxvd2x vs41, o16, BBO @@ -3160,6 +3684,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs55, o48, BBO addi BBO, BBO, 64 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/zgemm_tcopy_macros_8_power8.S b/kernel/power/zgemm_tcopy_macros_8_power8.S index 3f5a5ed03..654332375 100644 --- a/kernel/power/zgemm_tcopy_macros_8_power8.S +++ b/kernel/power/zgemm_tcopy_macros_8_power8.S @@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* Macros for N=4 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x8', ` +#else .macro COPY_4x8 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -144,14 +148,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs12, o32, T1 stxvd2x vs13, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x4', ` +#else .macro COPY_4x4 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -209,14 +221,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs46, o32, T1 stxvd2x vs47, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x2', ` +#else .macro COPY_4x2 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -254,14 +274,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs38, o32, T1 stxvd2x vs39, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x1', ` +#else .macro COPY_4x1 +#endif lxvd2x vs32, o0, A0 addi A0, A0, 16 @@ -289,14 +317,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x8', ` +#else .macro COPY_2x8 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -350,14 +386,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs46, o32, T1 stxvd2x vs47, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x4', ` +#else .macro COPY_2x4 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -387,14 +431,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs38, o32, T1 stxvd2x vs39, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x2', ` +#else .macro COPY_2x2 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -414,14 +466,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x1', ` +#else .macro COPY_2x1 +#endif lxvd2x vs32, o0, A0 addi A0, A0, 16 @@ -437,14 +497,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs33, o16, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x8', ` +#else .macro COPY_1x8 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -472,14 +540,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs38, o32, T1 stxvd2x vs39, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x4', ` +#else .macro COPY_1x4 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -495,14 +571,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x2', ` +#else .macro COPY_1x2 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -514,14 +598,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x1', ` +#else .macro COPY_1x1 +#endif lxvd2x vs32, o0, A0 addi A0, A0, 16 @@ -531,5 +623,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs32, o0, T1 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/zrot.c b/kernel/power/zrot.c index d45468fd5..c6d666178 100644 --- a/kernel/power/zrot.c +++ b/kernel/power/zrot.c @@ -40,8 +40,8 @@ static void zrot_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT si __asm__ ( - "xxspltd 36, %x[cos], 0 \n\t" // load c to both dwords - "xxspltd 37, %x[sin], 0 \n\t" // load s to both dwords + XXSPLTD_S(36,%x[cos],0) // load c to both dwords + XXSPLTD_S(37,%x[sin],0) // load s to both dwords "lxvd2x 32, 0, %[x_ptr] \n\t" // load x "lxvd2x 33, %[i16], %[x_ptr] \n\t" @@ -57,10 +57,10 @@ static void zrot_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT si "addi %[y_ptr], %[y_ptr], 64 \n\t" "addic. 
%[temp_n], %[temp_n], -4 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvmuldp 40, 32, 36 \n\t" // c * x "xvmuldp 41, 33, 36 \n\t" @@ -124,9 +124,9 @@ static void zrot_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT si "addi %[y_ptr], %[y_ptr], 128 \n\t" "addic. %[temp_n], %[temp_n], -4 \n\t" - "bgt+ 1b \n" + "bgt+ one%= \n" - "2: \n\t" + "two%=: \n\t" "xvmuldp 40, 32, 36 \n\t" // c * x "xvmuldp 41, 33, 36 \n\t" diff --git a/kernel/power/zscal_microk_power8.c b/kernel/power/zscal_microk_power8.c index aba9029a0..567331775 100644 --- a/kernel/power/zscal_microk_power8.c +++ b/kernel/power/zscal_microk_power8.c @@ -58,8 +58,8 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) "dcbt 0, %2 \n\t" "xsnegdp 33, %x16 \n\t" // -alpha_i - "xxspltd 32, %x15, 0 \n\t" // alpha_r , alpha_r - "xxmrghd 33, 33, %x16 \n\t" // -alpha_i , alpha_i + XXSPLTD_S(32,%x15,0) // alpha_r , alpha_r + XXMRGHD_S(33,33,%x16) // -alpha_i , alpha_i "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i "lxvd2x 41, %17, %2 \n\t" @@ -73,10 +73,10 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) "addi %2, %2, 128 \n\t" "addic. %1, %1, -8 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r "xvmuldp 49, 41, 32 \n\t" @@ -87,14 +87,14 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) "xvmuldp %x5, 46, 32 \n\t" "xvmuldp %x6, 47, 32 \n\t" - "xxswapd %x7, 40 \n\t" - "xxswapd %x8, 41 \n\t" - "xxswapd %x9, 42 \n\t" - "xxswapd %x10, 43 \n\t" - "xxswapd %x11, 44 \n\t" - "xxswapd %x12, 45 \n\t" - "xxswapd %x13, 46 \n\t" - "xxswapd %x14, 47 \n\t" + XXSWAPD_S(%x7,40) + XXSWAPD_S(%x8,41) + XXSWAPD_S(%x9,42) + XXSWAPD_S(%x10,43) + XXSWAPD_S(%x11,44) + XXSWAPD_S(%x12,45) + XXSWAPD_S(%x13,46) + XXSWAPD_S(%x14,47) "xvmuldp %x7, %x7, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i "xvmuldp %x8, %x8, 33 \n\t" @@ -147,9 +147,9 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) "addi %2, %2, 256 \n\t" "addic. %1, %1, -8 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r "xvmuldp 49, 41, 32 \n\t" @@ -160,14 +160,14 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) "xvmuldp %x5, 46, 32 \n\t" "xvmuldp %x6, 47, 32 \n\t" - "xxswapd %x7, 40 \n\t" - "xxswapd %x8, 41 \n\t" - "xxswapd %x9, 42 \n\t" - "xxswapd %x10, 43 \n\t" - "xxswapd %x11, 44 \n\t" - "xxswapd %x12, 45 \n\t" - "xxswapd %x13, 46 \n\t" - "xxswapd %x14, 47 \n\t" + XXSWAPD_S(%x7,40) + XXSWAPD_S(%x8,41) + XXSWAPD_S(%x9,42) + XXSWAPD_S(%x10,43) + XXSWAPD_S(%x11,44) + XXSWAPD_S(%x12,45) + XXSWAPD_S(%x13,46) + XXSWAPD_S(%x14,47) "addi %2, %2, -128 \n\t" diff --git a/kernel/power/zswap_microk_power8.c b/kernel/power/zswap_microk_power8.c index 54391ba5d..1e9fbe2cf 100644 --- a/kernel/power/zswap_microk_power8.c +++ b/kernel/power/zswap_microk_power8.c @@ -40,8 +40,8 @@ zswap_kernel_16 (long n, double *x, double *y) { __asm__ ( - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "lxvd2x 32, 0, %4 \n\t" "lxvd2x 33, %5, %4 \n\t" "lxvd2x 34, %6, %4 \n\t" @@ -130,7 +130,7 @@ zswap_kernel_16 (long n, double *x, double *y) "addi %4, %4, 128 \n\t" "addic. 
%2, %2, -16 \n\t" - "bgt 1b \n" + "bgt one%= \n" "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" : diff --git a/kernel/power/ztrmm_macros_8x2_power8.S b/kernel/power/ztrmm_macros_8x2_power8.S index 701ec65c8..b3fbcd220 100644 --- a/kernel/power/ztrmm_macros_8x2_power8.S +++ b/kernel/power/ztrmm_macros_8x2_power8.S @@ -68,7 +68,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x8_1', ` +#else .macro LOAD2x8_1 +#endif lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B @@ -92,9 +96,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_I1', ` +#else .macro KERNEL2x8_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -152,9 +164,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs63, vs7, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_1', ` +#else .macro KERNEL2x8_1 +#endif xvmaddadp vs32, vs0, vs16 // real*real, imag*real @@ -221,9 +241,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_2', ` +#else .macro KERNEL2x8_2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -289,9 +317,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_E2', ` +#else .macro KERNEL2x8_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -329,9 +365,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUBI1', ` +#else .macro KERNEL2x8_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -389,9 +433,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs63, vs7, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUB1', ` +#else .macro KERNEL2x8_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -449,9 +501,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x8', ` +#else .macro SAVE2x8 +#endif mr T1, CO @@ -473,13 +533,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -497,13 +557,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -521,13 +581,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB - xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB @@ -545,13 +605,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB - xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB @@ -569,13 +629,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs41,vs41) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs40 // realA*realB XSFADD_R2 vs0, vs0, vs41 // imagA*imagB - xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs40,vs40) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs41,vs41) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs40 // realA*imagB XSFADD_I2 vs1, vs1, vs41 // imagA*realB @@ -593,13 +653,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs43,vs43) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs42 // realA*realB XSFADD_R2 vs0, vs0, vs43 // imagA*imagB - xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs42,vs42) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs43,vs43) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs42 // realA*imagB XSFADD_I2 vs1, vs1, vs43 // imagA*realB @@ -617,13 +677,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs45,vs45) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs44 // realA*realB XSFADD_R2 vs0, vs0, vs45 // imagA*imagB - xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs44,vs44) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs45,vs45) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs44 // realA*imagB XSFADD_I2 vs1, vs1, vs45 // imagA*realB @@ -641,13 +701,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs47,vs47) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs46 // realA*realB XSFADD_R2 vs0, vs0, vs47 // imagA*imagB - xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs46,vs46) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs47,vs47) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs46 // realA*imagB XSFADD_I2 vs1, vs1, vs47 // imagA*realB @@ -703,13 +763,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs49, vs49 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs49,vs49) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs48 // realA*realB XSFADD_R2 vs0, vs0, vs49 // imagA*imagB - xxswapd vs48, vs48 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs49, vs49 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs48,vs48) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs49,vs49) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs48 // realA*imagB XSFADD_I2 vs1, vs1, vs49 // imagA*realB @@ -727,13 +787,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs51, vs51 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs51,vs51) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs50 // realA*realB XSFADD_R2 vs0, vs0, vs51 // imagA*imagB - xxswapd vs50, vs50 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs51, vs51 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs50,vs50) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs51,vs51) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs50 // realA*imagB XSFADD_I2 vs1, vs1, vs51 // imagA*realB @@ -751,13 +811,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs53, vs53 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs53,vs53) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs52 // realA*realB XSFADD_R2 vs0, vs0, vs53 // imagA*imagB - xxswapd vs52, vs52 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs53, vs53 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs52,vs52) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs53,vs53) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs52 // realA*imagB XSFADD_I2 vs1, vs1, vs53 // imagA*realB @@ -775,13 +835,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs55, vs55 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs55,vs55) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs54 // realA*realB XSFADD_R2 vs0, vs0, vs55 // imagA*imagB - xxswapd vs54, vs54 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs55, vs55 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs54,vs54) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs55,vs55) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs54 // realA*imagB XSFADD_I2 vs1, vs1, vs55 // imagA*realB @@ -799,13 +859,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs57, vs57 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs57,vs57) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs56 // realA*realB XSFADD_R2 vs0, vs0, vs57 // imagA*imagB - xxswapd vs56, vs56 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs57, vs57 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs56,vs56) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs57,vs57) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs56 // realA*imagB XSFADD_I2 vs1, vs1, vs57 // imagA*realB @@ -823,13 +883,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs59, vs59 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs59,vs59) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs58 // realA*realB XSFADD_R2 vs0, vs0, vs59 // imagA*imagB - xxswapd vs58, vs58 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs59, vs59 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs58,vs58) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs59,vs59) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs58 // realA*imagB XSFADD_I2 vs1, vs1, vs59 // imagA*realB @@ -847,13 +907,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs61, vs61 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs61,vs61) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs60 // realA*realB XSFADD_R2 vs0, vs0, vs61 // imagA*imagB - xxswapd vs60, vs60 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs61, vs61 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs60,vs60) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs61,vs61) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs60 // realA*imagB XSFADD_I2 vs1, vs1, vs61 // imagA*realB @@ -871,13 +931,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs63, vs63 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs63,vs63) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs62 // realA*realB XSFADD_R2 vs0, vs0, vs63 // imagA*imagB - xxswapd vs62, vs62 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs63, vs63 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs62,vs62) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs63,vs63) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs62 // realA*imagB XSFADD_I2 vs1, vs1, vs63 // imagA*realB @@ -918,14 +978,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add T2, T2, LDC addi CO, CO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x4_1', ` +#else .macro LOAD2x4_1 +#endif lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B @@ -942,9 +1010,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_I1', ` +#else .macro KERNEL2x4_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -979,9 +1055,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs47, vs3, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_1', ` +#else .macro KERNEL2x4_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -1016,9 +1100,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_2', ` +#else .macro KERNEL2x4_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1053,9 +1145,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_E2', ` +#else .macro KERNEL2x4_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -1077,9 +1177,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUBI1', ` +#else .macro KERNEL2x4_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1114,9 +1222,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs47, vs3, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUB1', ` +#else .macro KERNEL2x4_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1151,9 +1267,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x4', ` +#else .macro SAVE2x4 +#endif mr T1, CO @@ -1170,13 +1294,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -1194,13 +1318,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -1218,13 +1342,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB - xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB @@ -1242,13 +1366,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB - xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB @@ -1291,13 +1415,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs41,vs41) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs40 // realA*realB XSFADD_R2 vs0, vs0, vs41 // imagA*imagB - xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs40,vs40) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs41,vs41) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs40 // realA*imagB XSFADD_I2 vs1, vs1, vs41 // imagA*realB @@ -1315,13 +1439,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs43,vs43) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs42 // realA*realB XSFADD_R2 vs0, vs0, vs43 // imagA*imagB - xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs42,vs42) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs43,vs43) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs42 // realA*imagB XSFADD_I2 vs1, vs1, vs43 // imagA*realB @@ -1339,13 +1463,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs45,vs45) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs44 // realA*realB XSFADD_R2 vs0, vs0, vs45 // imagA*imagB - xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs44,vs44) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs45,vs45) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs44 // realA*imagB XSFADD_I2 vs1, vs1, vs45 // imagA*realB @@ -1363,13 +1487,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs47,vs47) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs46 // realA*realB XSFADD_R2 vs0, vs0, vs47 // imagA*imagB - xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs46,vs46) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs47,vs47) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs46 // realA*imagB XSFADD_I2 vs1, vs1, vs47 // imagA*realB @@ -1401,14 +1525,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add T1, T1, LDC addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x2_1', ` +#else .macro LOAD2x2_1 +#endif lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B @@ -1423,9 +1555,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_I1', ` +#else .macro KERNEL2x2_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -1450,9 +1590,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs39, vs1, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_1', ` +#else .macro KERNEL2x2_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -1477,9 +1625,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_2', ` +#else .macro KERNEL2x2_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1504,9 +1660,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_E2', ` +#else .macro KERNEL2x2_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -1520,9 +1684,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUBI1', ` +#else .macro KERNEL2x2_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1547,9 +1719,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs39, vs1, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUB1', ` +#else .macro KERNEL2x2_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1574,9 +1754,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x2', ` +#else .macro SAVE2x2 +#endif mr T1, CO @@ -1591,13 +1779,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -1615,13 +1803,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -1658,13 +1846,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB - xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB @@ -1682,13 +1870,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB - xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB @@ -1716,14 +1904,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add T1, T1, LDC addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x1_1', ` +#else .macro LOAD2x1_1 +#endif lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B @@ -1737,9 +1933,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_I1', ` +#else .macro KERNEL2x1_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A @@ -1759,9 +1963,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs35, vs0, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_1', ` +#else .macro KERNEL2x1_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A @@ -1781,9 +1993,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_2', ` +#else .macro KERNEL2x1_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -1803,9 +2023,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_E2', ` +#else .macro KERNEL2x1_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -1815,9 +2043,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUBI1', ` +#else .macro KERNEL2x1_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -1837,9 +2073,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs35, vs0, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUB1', ` +#else .macro KERNEL2x1_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -1859,9 +2103,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x1', ` +#else .macro SAVE2x1 +#endif mr T1, CO @@ -1875,13 +2127,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -1915,13 +2167,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -1947,14 +2199,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add T1, T1, LDC addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x8_1', ` +#else .macro LOAD1x8_1 +#endif lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B @@ -1976,9 +2236,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_I1', ` +#else .macro KERNEL1x8_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -2017,9 +2285,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs47, vs7, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_1', ` +#else .macro KERNEL1x8_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -2058,9 +2334,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_2', ` +#else .macro KERNEL1x8_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2099,9 +2383,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_E2', ` +#else .macro KERNEL1x8_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -2122,9 +2414,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUBI1', ` +#else .macro KERNEL1x8_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2163,9 +2463,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs47, vs7, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUB1', ` +#else .macro KERNEL1x8_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2204,9 +2512,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x8', ` +#else .macro SAVE1x8 +#endif mr T1, CO @@ -2228,13 +2544,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -2252,13 +2568,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -2276,13 +2592,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB - xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB @@ -2300,13 +2616,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB - xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB @@ -2324,13 +2640,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs41,vs41) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs40 // realA*realB XSFADD_R2 vs0, vs0, vs41 // imagA*imagB - xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs40,vs40) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs41,vs41) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs40 // realA*imagB XSFADD_I2 vs1, vs1, vs41 // imagA*realB @@ -2348,13 +2664,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs43,vs43) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs42 // realA*realB XSFADD_R2 vs0, vs0, vs43 // imagA*imagB - xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs42,vs42) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs43,vs43) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs42 // realA*imagB XSFADD_I2 vs1, vs1, vs43 // imagA*realB @@ -2372,13 +2688,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs45,vs45) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs44 // realA*realB XSFADD_R2 vs0, vs0, vs45 // imagA*imagB - xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs44,vs44) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs45,vs45) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs44 // realA*imagB XSFADD_I2 vs1, vs1, vs45 // imagA*realB @@ -2396,13 +2712,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs47,vs47) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs46 // realA*realB XSFADD_R2 vs0, vs0, vs47 // imagA*imagB - xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs46,vs46) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs47,vs47) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs46 // realA*imagB XSFADD_I2 vs1, vs1, vs47 // imagA*realB @@ -2443,14 +2759,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add T2, T2, LDC addi CO, CO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x4_1', ` +#else .macro LOAD1x4_1 +#endif lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B @@ -2465,9 +2789,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_I1', ` +#else .macro KERNEL1x4_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -2491,9 +2823,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs39, vs3, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_1', ` +#else .macro KERNEL1x4_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -2517,9 +2857,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_2', ` +#else .macro KERNEL1x4_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2543,9 +2891,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_E2', ` +#else .macro KERNEL1x4_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -2558,9 +2914,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUBI1', ` +#else .macro KERNEL1x4_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2584,9 +2948,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs39, vs3, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUB1', ` +#else .macro KERNEL1x4_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2610,9 +2982,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x4', ` +#else .macro SAVE1x4 +#endif mr T1, CO @@ -2629,13 +3009,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -2653,13 +3033,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -2677,13 +3057,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB - xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB @@ -2701,13 +3081,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB - xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB @@ -2739,14 +3119,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add T1, T1, LDC addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x2_1', ` +#else .macro LOAD1x2_1 +#endif lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B @@ -2759,9 +3147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_I1', ` +#else .macro KERNEL1x2_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -2779,9 +3175,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs35, vs1, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_1', ` +#else .macro KERNEL1x2_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -2799,9 +3203,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_2', ` +#else .macro KERNEL1x2_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2819,9 +3231,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_E2', ` +#else .macro KERNEL1x2_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -2830,9 +3250,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUBI1', ` +#else .macro KERNEL1x2_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2850,9 +3278,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs35, vs1, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUB1', ` +#else .macro KERNEL1x2_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2870,9 +3306,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x2', ` +#else .macro SAVE1x2 +#endif mr T1, CO @@ -2887,13 +3331,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -2911,13 +3355,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -2945,14 +3389,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add T1, T1, LDC addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x1_1', ` +#else .macro LOAD1x1_1 +#endif lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B @@ -2964,9 +3416,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_I1', ` +#else .macro KERNEL1x1_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A @@ -2981,9 +3441,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs33, vs0, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_1', ` +#else .macro KERNEL1x1_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A @@ -2998,9 +3466,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_2', ` +#else .macro KERNEL1x1_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -3015,18 +3491,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_E2', ` +#else .macro KERNEL1x1_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUBI1', ` +#else .macro KERNEL1x1_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -3041,9 +3533,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUB1', ` +#else .macro KERNEL1x1_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -3058,9 +3558,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x1', ` +#else .macro SAVE1x1 +#endif mr T1, CO @@ -3074,13 +3582,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -3106,5 +3614,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add T1, T1, LDC addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif From 715f4650d9874badfede90e4bd09451ac8ea1886 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 3 Dec 2019 08:24:10 +0100 Subject: [PATCH 02/27] Delete stray copy of dynamic.c from PR 2228 --- dynamic.c | 897 ------------------------------------------------------ 1 file changed, 897 deletions(-) delete mode 100644 dynamic.c diff --git a/dynamic.c b/dynamic.c deleted file mode 100644 index aa2b87621..000000000 --- a/dynamic.c +++ /dev/null @@ -1,897 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. 
*/ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -#include "common.h" - -#ifdef _MSC_VER -#define strncasecmp _strnicmp -#define strcasecmp _stricmp -#endif - -#ifdef ARCH_X86 -#define EXTERN extern -#else -#define EXTERN -#endif - -#ifdef DYNAMIC_LIST -extern gotoblas_t gotoblas_PRESCOTT; - -#ifdef DYN_ATHLON -extern gotoblas_t gotoblas_ATHLON; -#else -#define gotoblas_ATHLON gotoblas_PRESCOTT -#endif -#ifdef DYN_KATMAI -extern gotoblas_t gotoblas_KATMAI; -#else -#define gotoblas_KATMAI gotoblas_PRESCOTT -#endif -#ifdef DYN_BANIAS -extern gotoblas_t gotoblas_BANIAS; -#else -#define gotoblas_BANIAS gotoblas_PRESCOTT -#endif -#ifdef DYN_COPPERMINE -extern gotoblas_t gotoblas_COPPERMINE; -#else -#define gotoblas_COPPERMINE gotoblas_PRESCOTT -#endif -#ifdef DYN_NORTHWOOD -extern gotoblas_t gotoblas_NORTHWOOD; -#else -#define gotoblas_NORTHWOOD gotoblas_PRESCOTT -#endif -#ifdef DYN_CORE2 -extern gotoblas_t gotoblas_CORE2; -#else -#define gotoblas_CORE2 gotoblas_PRESCOTT -#endif -#ifdef DYN_NEHALEM -extern gotoblas_t gotoblas_NEHALEM; -#else -#define gotoblas_NEHALEM gotoblas_PRESCOTT -#endif -#ifdef DYN_BARCELONA -extern gotoblas_t gotoblas_BARCELONA; -#elif defined(DYN_NEHALEM) -#define gotoblas_BARCELONA gotoblas_NEHALEM -#else -#define gotoblas_BARCELONA gotoblas_PRESCOTT -#endif -#ifdef DYN_ATOM -extern gotoblas_t gotoblas_ATOM; -elif defined(DYN_NEHALEM) -#define gotoblas_ATOM gotoblas_NEHALEM -#else -#define gotoblas_ATOM gotoblas_PRESCOTT -#endif -#ifdef DYN_NANO -extern gotoblas_t gotoblas_NANO; -#else -#define gotoblas_NANO gotoblas_PRESCOTT -#endif -#ifdef DYN_PENRYN -extern gotoblas_t gotoblas_PENRYN; -#else -#define gotoblas_PENRYN gotoblas_PRESCOTT -#endif -#ifdef DYN_DUNNINGTON -extern gotoblas_t gotoblas_DUNNINGTON; -#else -#define gotoblas_DUNNINGTON gotoblas_PRESCOTT -#endif -#ifdef DYN_OPTERON -extern gotoblas_t gotoblas_OPTERON; -#else -#define gotoblas_OPTERON gotoblas_PRESCOTT -#endif -#ifdef DYN_OPTERON_SSE3 -extern gotoblas_t gotoblas_OPTERON_SSE3; -#else -#define gotoblas_OPTERON_SSE3 gotoblas_PRESCOTT -#endif -#ifdef DYN_BOBCAT -extern gotoblas_t gotoblas_BOBCAT; -#elif defined(DYN_NEHALEM) -#define gotoblas_BOBCAT gotoblas_NEHALEM -#else -#define gotoblas_BOBCAT gotoblas_PRESCOTT -#endif -#ifdef DYN_SANDYBRIDGE -extern gotoblas_t gotoblas_SANDYBRIDGE; -#elif defined(DYN_NEHALEM) -#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM -#else -#define gotoblas_SANDYBRIDGE gotoblas_PRESCOTT -#endif -#ifdef DYN_BULLDOZER -extern gotoblas_t gotoblas_BULLDOZER; -#elif defined(DYN_SANDYBRIDGE) -#define gotoblas_BULLDOZER gotoblas_SANDYBRIDGE -#elif defined(DYN_NEHALEM) -#define gotoblas_BULLDOZER gotoblas_NEHALEM -#else -#define gotoblas_BULLDOZER gotoblas_PRESCOTT -#endif -#ifdef DYN_PILEDRIVER -extern gotoblas_t gotoblas_PILEDRIVER; -#elif defined(DYN_SANDYBRIDGE) -#define gotoblas_PILEDRIVER gotoblas_SANDYBRIDGE -#elif defined(DYN_NEHALEM) -#define gotoblas_PILEDRIVER gotoblas_NEHALEM -#else -#define gotoblas_PILEDRIVER gotoblas_PRESCOTT -#endif -#ifdef DYN_STEAMROLLER -extern gotoblas_t gotoblas_STEAMROLLER; -#elif defined(DYN_SANDYBRIDGE) -#define gotoblas_STEAMROLLER gotoblas_SANDYBRIDGE -#elif defined(DYN_NEHALEM) -#define gotoblas_STEAMROLLER gotoblas_NEHALEM -#else -#define 
gotoblas_STEAMROLLER gotoblas_PRESCOTT -#endif -#ifdef DYN_EXCAVATOR -extern gotoblas_t gotoblas_EXCAVATOR; -#elif defined(DYN_SANDYBRIDGE) -#define gotoblas_EXCAVATOR gotoblas_SANDYBRIDGE -#elif defined(DYN_NEHALEM) -#define gotoblas_EXCAVATOR gotoblas_NEHALEM -#else -#define gotoblas_EXCAVATOR gotoblas_PRESCOTT -#endif -#ifdef DYN_HASWELL -extern gotoblas_t gotoblas_HASWELL; -#elif defined(DYN_SANDYBRIDGE) -#define gotoblas_HASWELL gotoblas_SANDYBRIDGE -#elif defined(DYN_NEHALEM) -#define gotoblas_HASWELL gotoblas_NEHALEM -#else -#define gotoblas_HASWELL gotoblas_PRESCOTT -#endif -#ifdef DYN_ZEN -extern gotoblas_t gotoblas_ZEN; -#elif defined(DYN_HASWELL) -#define gotoblas_ZEN gotoblas_HASWELL -#elif defined(DYN_SANDYBRIDGE) -#define gotoblas_ZEN gotoblas_SANDYBRIDGE -#elif defined(DYN_NEHALEM) -#define gotoblas_ZEN gotoblas_NEHALEM -#else -#define gotoblas_ZEN gotoblas_PRESCOTT -#endif -#ifdef DYN_SKYLAKEX -extern gotoblas_t gotoblas_SKYLAKEX; -#elif defined(DYN_HASWELL) -#define gotoblas_SKYLAKEX gotoblas_HASWELL -#elif defined(DYN_SANDYBRIDGE) -#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE -#elif defined(DYN_NEHALEM) -#define gotoblas_SKYLAKEX gotoblas_NEHALEM -#else -#define gotoblas_SKYLAKEX gotoblas_PRESCOTT -#endif - - -#else // not DYNAMIC_LIST -EXTERN gotoblas_t gotoblas_KATMAI; -EXTERN gotoblas_t gotoblas_COPPERMINE; -EXTERN gotoblas_t gotoblas_NORTHWOOD; -EXTERN gotoblas_t gotoblas_BANIAS; -EXTERN gotoblas_t gotoblas_ATHLON; - -extern gotoblas_t gotoblas_PRESCOTT; -extern gotoblas_t gotoblas_CORE2; -extern gotoblas_t gotoblas_NEHALEM; -extern gotoblas_t gotoblas_BARCELONA; -#ifdef DYNAMIC_OLDER -extern gotoblas_t gotoblas_ATOM; -extern gotoblas_t gotoblas_NANO; -extern gotoblas_t gotoblas_PENRYN; -extern gotoblas_t gotoblas_DUNNINGTON; -extern gotoblas_t gotoblas_OPTERON; -extern gotoblas_t gotoblas_OPTERON_SSE3; -extern gotoblas_t gotoblas_BOBCAT; -#else -#define gotoblas_ATOM gotoblas_NEHALEM -#define gotoblas_NANO gotoblas_NEHALEM -#define gotoblas_PENRYN gotoblas_CORE2 -#define gotoblas_DUNNINGTON gotoblas_CORE2 -#define gotoblas_OPTERON gotoblas_CORE2 -#define gotoblas_OPTERON_SSE3 gotoblas_CORE2 -#define gotoblas_BOBCAT gotoblas_CORE2 -#endif - -#ifndef NO_AVX -extern gotoblas_t gotoblas_SANDYBRIDGE; -extern gotoblas_t gotoblas_BULLDOZER; -extern gotoblas_t gotoblas_PILEDRIVER; -extern gotoblas_t gotoblas_STEAMROLLER; -extern gotoblas_t gotoblas_EXCAVATOR; -#ifdef NO_AVX2 -#define gotoblas_HASWELL gotoblas_SANDYBRIDGE -#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE -#define gotoblas_ZEN gotoblas_SANDYBRIDGE -#else -extern gotoblas_t gotoblas_HASWELL; -extern gotoblas_t gotoblas_ZEN; -#ifndef NO_AVX512 -extern gotoblas_t gotoblas_SKYLAKEX; -#else -#define gotoblas_SKYLAKEX gotoblas_HASWELL -#endif -#endif -#else -//Use NEHALEM kernels for sandy bridge -#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM -#define gotoblas_HASWELL gotoblas_NEHALEM -#define gotoblas_SKYLAKEX gotoblas_NEHALEM -#define gotoblas_BULLDOZER gotoblas_BARCELONA -#define gotoblas_PILEDRIVER gotoblas_BARCELONA -#define gotoblas_STEAMROLLER gotoblas_BARCELONA -#define gotoblas_EXCAVATOR gotoblas_BARCELONA -#define gotoblas_ZEN gotoblas_BARCELONA -#endif - -#endif // DYNAMIC_LIST - -#define VENDOR_INTEL 1 -#define VENDOR_AMD 2 -#define VENDOR_CENTAUR 3 -#define VENDOR_HYGON 4 -#define VENDOR_UNKNOWN 99 - -#define BITMASK(a, b, c) ((((a) >> (b)) & (c))) - -#ifndef NO_AVX -static inline void xgetbv(int op, int * eax, int * edx){ - //Use binary code for xgetbv - __asm__ __volatile__ - (".byte 0x0f, 0x01, 
0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); -} -#endif - -int support_avx(){ -#ifndef NO_AVX - int eax, ebx, ecx, edx; - int ret=0; - - cpuid(1, &eax, &ebx, &ecx, &edx); - if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){ - xgetbv(0, &eax, &edx); - if((eax & 6) == 6){ - ret=1; //OS support AVX - } - } - return ret; -#else - return 0; -#endif -} - -int support_avx2(){ -#ifndef NO_AVX2 - int eax, ebx, ecx=0, edx; - int ret=0; - - if (!support_avx()) - return 0; - cpuid(7, &eax, &ebx, &ecx, &edx); - if((ebx & (1<<7)) != 0) - ret=1; //OS supports AVX2 - return ret; -#else - return 0; -#endif -} - -int support_avx512(){ -#if !defined(NO_AVX) && !defined(NO_AVX512) - int eax, ebx, ecx, edx; - int ret=0; - - if (!support_avx()) - return 0; - cpuid(7, &eax, &ebx, &ecx, &edx); - if((ebx & (1<<7)) != 1){ - ret=0; //OS does not even support AVX2 - } - if((ebx & (1<<31)) != 0){ - xgetbv(0, &eax, &edx); - if((eax & 0xe0) == 0xe0) - ret=1; //OS supports AVX512VL - } - return ret; -#else - return 0; -#endif -} - -extern void openblas_warning(int verbose, const char * msg); -#define FALLBACK_VERBOSE 1 -#define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n" -#define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n" -#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512VL instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n" -#define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n" - -static int get_vendor(void){ - int eax, ebx, ecx, edx; - - union - { - char vchar[16]; - int vint[4]; - } vendor; - - cpuid(0, &eax, &ebx, &ecx, &edx); - - *(&vendor.vint[0]) = ebx; - *(&vendor.vint[1]) = edx; - *(&vendor.vint[2]) = ecx; - - vendor.vchar[12] = '\0'; - - if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL; - if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD; - if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR; - if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON; - - if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; - - return VENDOR_UNKNOWN; -} - -static gotoblas_t *get_coretype(void){ - - int eax, ebx, ecx, edx; - int family, exfamily, model, vendor, exmodel; - - cpuid(1, &eax, &ebx, &ecx, &edx); - - family = BITMASK(eax, 8, 0x0f); - exfamily = BITMASK(eax, 20, 0xff); - model = BITMASK(eax, 4, 0x0f); - exmodel = BITMASK(eax, 16, 0x0f); - - vendor = get_vendor(); - - if (vendor == VENDOR_INTEL){ - switch (family) { - case 0x6: - switch (exmodel) { - case 0: - if (model <= 0x7) return &gotoblas_KATMAI; - if ((model == 0x8) || (model == 0xa) || (model == 0xb)) return &gotoblas_COPPERMINE; - if ((model == 0x9) || (model == 0xd)) return &gotoblas_BANIAS; - if (model == 14) return &gotoblas_BANIAS; - if (model == 15) return &gotoblas_CORE2; - return NULL; - - case 1: - if (model == 6) return &gotoblas_CORE2; - if (model == 7) return &gotoblas_PENRYN; - if (model == 13) return &gotoblas_DUNNINGTON; - if ((model == 10) || (model == 11) || (model == 14) || (model == 15)) return &gotoblas_NEHALEM; - if (model == 12) return &gotoblas_ATOM; - return NULL; - - case 2: - //Intel Core (Clarkdale) / Core (Arrandale) - // Pentium 
(Clarkdale) / Pentium Mobile (Arrandale) - // Xeon (Clarkdale), 32nm - if (model == 5) return &gotoblas_NEHALEM; - - //Intel Xeon Processor 5600 (Westmere-EP) - //Xeon Processor E7 (Westmere-EX) - //Xeon E7540 - if (model == 12 || model == 14 || model == 15) return &gotoblas_NEHALEM; - - //Intel Core i5-2000 /i7-2000 (Sandy Bridge) - //Intel Core i7-3000 / Xeon E5 - if (model == 10 || model == 13) { - if(support_avx()) - return &gotoblas_SANDYBRIDGE; - else{ - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. - } - } - return NULL; - case 3: - //Intel Sandy Bridge 22nm (Ivy Bridge?) - if (model == 10 || model == 14) { - if(support_avx()) - return &gotoblas_SANDYBRIDGE; - else{ - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. - } - } - //Intel Haswell - if (model == 12 || model == 15) { - if(support_avx2()) - return &gotoblas_HASWELL; - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. - } - } - //Intel Broadwell - if (model == 13) { - if(support_avx2()) - return &gotoblas_HASWELL; - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. - } - } - if (model == 7) return &gotoblas_ATOM; //Bay Trail - return NULL; - case 4: - //Intel Haswell - if (model == 5 || model == 6) { - if(support_avx2()) - return &gotoblas_HASWELL; - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. - } - } - //Intel Broadwell - if (model == 7 || model == 15) { - if(support_avx2()) - return &gotoblas_HASWELL; - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. - } - } - //Intel Skylake - if (model == 14) { - if(support_avx2()) - return &gotoblas_HASWELL; - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. - } - } - //Intel Braswell / Avoton - if (model == 12 || model == 13) { - return &gotoblas_NEHALEM; - } - return NULL; - case 5: - //Intel Broadwell - if (model == 6) { - if(support_avx2()) - return &gotoblas_HASWELL; - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. 
- } - } - if (model == 5) { - // Intel Skylake X - if (support_avx512()) - return &gotoblas_SKYLAKEX; - if(support_avx2()){ - openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); - return &gotoblas_HASWELL; - } - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; - } - } - //Intel Skylake - if (model == 14) { - if(support_avx2()) - return &gotoblas_HASWELL; - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. - } - } - //Intel Phi Knights Landing - if (model == 7) { - if(support_avx2()){ - openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); - return &gotoblas_HASWELL; - } - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. - } - } - //Apollo Lake or Denverton - if (model == 12 || model == 15) { - return &gotoblas_NEHALEM; - } - return NULL; - case 6: - if (model == 6) { - // Cannon Lake - if(support_avx2()) - return &gotoblas_HASWELL; - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; - } - } - return NULL; - case 7: - if (model == 10) // Goldmont plus - return &gotoblas_NEHALEM; - if (model == 14) { - // Ice Lake - if (support_avx512()) - return &gotoblas_SKYLAKEX; - if(support_avx2()){ - openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); - return &gotoblas_HASWELL; - } - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; - } - } - return NULL; - case 9: - case 8: - if (model == 14 ) { // Kaby Lake, Coffee Lake - if(support_avx2()) - return &gotoblas_HASWELL; - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. - } - } - return NULL; - } - case 0xf: - if (model <= 0x2) return &gotoblas_NORTHWOOD; - return &gotoblas_PRESCOTT; - } - } - - if (vendor == VENDOR_AMD || vendor == VENDOR_HYGON){ - if (family <= 0xe) { - // Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon - cpuid(0x80000000, &eax, &ebx, &ecx, &edx); - if ( (eax & 0xffff) >= 0x01) { - cpuid(0x80000001, &eax, &ebx, &ecx, &edx); - if ((edx & (1 << 30)) == 0 || (edx & (1 << 31)) == 0) - return NULL; - } - else - return NULL; - - return &gotoblas_ATHLON; - } - if (family == 0xf){ - if ((exfamily == 0) || (exfamily == 2)) { - if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3; - else return &gotoblas_OPTERON; - } else if (exfamily == 5) { - return &gotoblas_BOBCAT; - } else if (exfamily == 6) { - if(model == 1){ - //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series - if(support_avx()) - return &gotoblas_BULLDOZER; - else{ - openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); - return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. 
- } - }else if(model == 2 || model == 3){ - //AMD Bulldozer Opteron 6300 / Opteron 4300 / Opteron 3300 - if(support_avx()) - return &gotoblas_PILEDRIVER; - else{ - openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); - return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. - } - }else if(model == 5){ - if(support_avx()) - return &gotoblas_EXCAVATOR; - else{ - openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); - return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. - } - }else if(model == 0 || model == 8){ - if (exmodel == 1) { - //AMD Trinity - if(support_avx()) - return &gotoblas_PILEDRIVER; - else{ - openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); - return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. - } - }else if (exmodel == 3) { - //AMD STEAMROLLER - if(support_avx()) - return &gotoblas_STEAMROLLER; - else{ - openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); - return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. - } - }else if (exmodel == 6) { - if(support_avx()) - return &gotoblas_EXCAVATOR; - else{ - openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); - return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. - } - - } - } - } else if (exfamily == 8) { - if (model == 1 || model == 8) { - if(support_avx()) - return &gotoblas_ZEN; - else{ - openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); - return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. - } - } - } else if (exfamily == 9) { - if(support_avx()) - return &gotoblas_ZEN; - else{ - openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); - return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. - } - }else { - return &gotoblas_BARCELONA; - } - } - } - - if (vendor == VENDOR_CENTAUR) { - switch (family) { - case 0x6: - return &gotoblas_NANO; - } - } - - return NULL; -} - -static char *corename[] = { - "Unknown", - "Katmai", - "Coppermine", - "Northwood", - "Prescott", - "Banias", - "Atom", - "Core2", - "Penryn", - "Dunnington", - "Nehalem", - "Athlon", - "Opteron", - "Opteron_SSE3", - "Barcelona", - "Nano", - "Sandybridge", - "Bobcat", - "Bulldozer", - "Piledriver", - "Haswell", - "Steamroller", - "Excavator", - "Zen", - "SkylakeX" -}; - -char *gotoblas_corename(void) { - - if (gotoblas == &gotoblas_KATMAI) return corename[ 1]; - if (gotoblas == &gotoblas_COPPERMINE) return corename[ 2]; - if (gotoblas == &gotoblas_NORTHWOOD) return corename[ 3]; - if (gotoblas == &gotoblas_PRESCOTT) return corename[ 4]; - if (gotoblas == &gotoblas_BANIAS) return corename[ 5]; - if (gotoblas == &gotoblas_ATOM) return corename[ 6]; - if (gotoblas == &gotoblas_CORE2) return corename[ 7]; - if (gotoblas == &gotoblas_PENRYN) return corename[ 8]; - if (gotoblas == &gotoblas_DUNNINGTON) return corename[ 9]; - if (gotoblas == &gotoblas_NEHALEM) return corename[10]; - if (gotoblas == &gotoblas_ATHLON) return corename[11]; - if (gotoblas == &gotoblas_OPTERON_SSE3) return corename[12]; - if (gotoblas == &gotoblas_OPTERON) return corename[13]; - if (gotoblas == &gotoblas_BARCELONA) return corename[14]; - if (gotoblas == &gotoblas_NANO) return corename[15]; - if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; - if (gotoblas == &gotoblas_BOBCAT) return corename[17]; - if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; - if (gotoblas == &gotoblas_PILEDRIVER) return corename[19]; - if (gotoblas == &gotoblas_HASWELL) return corename[20]; - if (gotoblas == &gotoblas_STEAMROLLER) return corename[21]; - if 
(gotoblas == &gotoblas_EXCAVATOR) return corename[22]; - if (gotoblas == &gotoblas_ZEN) return corename[23]; - if (gotoblas == &gotoblas_SKYLAKEX) return corename[24]; - return corename[0]; -} - - -static gotoblas_t *force_coretype(char *coretype){ - - int i ; - int found = -1; - char message[128]; - //char mname[20]; - - for ( i=1 ; i <= 24; i++) - { - if (!strncasecmp(coretype,corename[i],20)) - { - found = i; - break; - } - } - if (found < 0) - { - //strncpy(mname,coretype,20); - snprintf(message, 128, "Core not found: %s\n",coretype); - openblas_warning(1, message); - return(NULL); - } - - switch (found) - { - case 24: return (&gotoblas_SKYLAKEX); - case 23: return (&gotoblas_ZEN); - case 22: return (&gotoblas_EXCAVATOR); - case 21: return (&gotoblas_STEAMROLLER); - case 20: return (&gotoblas_HASWELL); - case 19: return (&gotoblas_PILEDRIVER); - case 18: return (&gotoblas_BULLDOZER); - case 17: return (&gotoblas_BOBCAT); - case 16: return (&gotoblas_SANDYBRIDGE); - case 15: return (&gotoblas_NANO); - case 14: return (&gotoblas_BARCELONA); - case 13: return (&gotoblas_OPTERON); - case 12: return (&gotoblas_OPTERON_SSE3); - case 11: return (&gotoblas_ATHLON); - case 10: return (&gotoblas_NEHALEM); - case 9: return (&gotoblas_DUNNINGTON); - case 8: return (&gotoblas_PENRYN); - case 7: return (&gotoblas_CORE2); - case 6: return (&gotoblas_ATOM); - case 5: return (&gotoblas_BANIAS); - case 4: return (&gotoblas_PRESCOTT); - case 3: return (&gotoblas_NORTHWOOD); - case 2: return (&gotoblas_COPPERMINE); - case 1: return (&gotoblas_KATMAI); - } - return(NULL); - -} - - - - -void gotoblas_dynamic_init(void) { - - char coremsg[128]; - char coren[22]; - char *p; - - - if (gotoblas) return; - - p = getenv("OPENBLAS_CORETYPE"); - if ( p ) - { - gotoblas = force_coretype(p); - } - else - { - gotoblas = get_coretype(); - } - -#ifdef ARCH_X86 - if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI; -#else - if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT; - /* sanity check, if 64bit pointer we can't have a 32 bit cpu */ - if (sizeof(void*) == 8) { - if (gotoblas == &gotoblas_KATMAI || - gotoblas == &gotoblas_COPPERMINE || - gotoblas == &gotoblas_NORTHWOOD || - gotoblas == &gotoblas_BANIAS || - gotoblas == &gotoblas_ATHLON) - gotoblas = &gotoblas_PRESCOTT; - } -#endif - - if (gotoblas && gotoblas -> init) { - strncpy(coren,gotoblas_corename(),20); - sprintf(coremsg, "Core: %s\n",coren); - openblas_warning(2, coremsg); - gotoblas -> init(); - } else { - openblas_warning(0, "OpenBLAS : Architecture Initialization failed. 
No initialization function found.\n"); - exit(1); - } - -} - -void gotoblas_dynamic_quit(void) { - - gotoblas = NULL; - -} From 3518617f5b7118286db9ab86a85cc078c00d6046 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 3 Dec 2019 08:32:29 +0100 Subject: [PATCH 03/27] Add Intel Goldmont+ cpuid was originally in #2228 but that PR had misplaced the file in the toplevel directory --- driver/others/dynamic.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index a4ff0e086..2e87e186a 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -586,6 +586,8 @@ static gotoblas_t *get_coretype(void){ } return NULL; case 7: + if (model == 10) // Goldmont Plus + return &gotoblas_NEHALEM; if (model == 14) { // Ice Lake if (support_avx512()) From 3938e59569cd44634dce06c832f0db12968cd7fc Mon Sep 17 00:00:00 2001 From: Kavana Bhat Date: Wed, 4 Dec 2019 00:23:46 -0600 Subject: [PATCH 04/27] AIX changes for Power8 --- kernel/Makefile.L3 | 193 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 180 insertions(+), 13 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index ed8ae406f..4decfbd20 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -1,4 +1,5 @@ USE_GEMM3M = 0 +OS := $(shell uname) ifeq ($(ARCH), x86) USE_GEMM3M = 1 @@ -434,10 +435,15 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmotcopy.s m4 sgemmotcopy.s > sgemmotcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@ rm sgemmotcopy.s sgemmotcopy_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +endif + ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) @@ -445,17 +451,26 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmitcopy.s m4 sgemmitcopy.s > sgemmitcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@ rm sgemmitcopy.s sgemmitcopy_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +endif + endif $(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_ncopy.s m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@ rm dgemm_ncopy.s dgemm_ncopy_nomacros.s +else + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ +endif $(KDIR)$(DGEMMOTCOPYOBJ) : $(KERNELDIR)/$(DGEMMOTCOPY) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ @@ -466,10 +481,14 @@ $(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_itcopy.s m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@ rm dgemm_itcopy.s dgemm_itcopy_nomacros.s +else + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ +endif endif @@ -494,16 +513,10 @@ endif endif $(KDIR)$(CGEMMONCOPYOBJ) : $(KERNELDIR)/$(CGEMMONCOPY) -# $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o cgemm_oncopy.s -# m4 cgemm_oncopy.s > cgemm_oncopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -# rm cgemm_oncopy.s cgemm_oncopy_nomacros.s 
$(KDIR)$(CGEMMOTCOPYOBJ) : $(KERNELDIR)/$(CGEMMOTCOPY) -# $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o cgemm_otcopy.s -# m4 cgemm_otcopy.s > cgemm_otcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -# rm cgemm_otcopy.s cgemm_otcopy_nomacros.s ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) @@ -511,10 +524,14 @@ $(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -E $< -o cgemm_itcopy.s m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@ rm cgemm_itcopy.s cgemm_itcopy_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +endif endif @@ -530,10 +547,14 @@ $(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o zgemm_itcopy.s m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@ rm zgemm_itcopy.s zgemm_itcopy_nomacros.s +else + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ +endif endif @@ -558,67 +579,107 @@ endif endif $(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemm_kernel$(TSUFFIX).s m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@ rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +endif $(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_kernel$(TSUFFIX).s m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@ rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s +else + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ +endif $(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEPEND) $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ $(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNN $< -o cgemm_kernel_n.s m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@ rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ +endif $(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCN $< -o cgemm_kernel_l.s m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@ rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ +endif $(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ +endif 
$(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCC $< -o cgemm_kernel_b.s m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@ rm cgemm_kernel_b.s cgemm_kernel_b_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ +endif $(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNN $< -o zgemm_kernel_n.s m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@ rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s +else + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ +endif $(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCN $< -o zgemm_kernel_l.s m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@ rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s +else + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ +endif $(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNC $< -o zgemm_kernel_r.s m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@ rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s +else + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ +endif $(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCC $< -o zgemm_kernel_b.s m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@ rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s +else + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ +endif $(KDIR)xgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@ @@ -635,56 +696,84 @@ $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD ifdef USE_TRMM $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o strmmkernel_ln.s m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@ rm strmmkernel_ln.s strmmkernel_ln_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ +endif $(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o strmmkernel_lt.s m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@ rm strmmkernel_lt.s strmmkernel_lt_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ +endif $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o strmmkernel_rn.s m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s 
-o $@ rm strmmkernel_rn.s strmmkernel_rn_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ +endif $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ +endif $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o dtrmm_kernel_ln.s -# $(CC) $(CFLAGS) -E $< -o dtrmm_kernel_ln.s m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@ rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ +endif $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o dtrmm_kernel_lt.s -# $(CC) $(CFLAGS) -E $< -o dtrmm_kernel_lt.s m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@ rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ +endif $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o dtrmm_kernel_rn.s -# $(CC) $(CFLAGS) -E $< -o dtrmm_kernel_rn.s m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@ rm dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ +endif $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o dtrmm_kernel_rt.s -# $(CC) $(CFLAGS) -E $< -o dtrmm_kernel_rt.s m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@ rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ +endif $(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ @@ -699,100 +788,165 @@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_ln.s m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@ rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ +endif 
$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_lt.s m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@ rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ +endif $(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lr.s m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ctrmm_kernel_lr_nomacros.s -o $@ rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ +endif $(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lc.s m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@ rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ +endif $(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rn.s m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@ rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ +endif $(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rt.s m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@ rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ +endif $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ctrmm_kernel_rr.s m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@ rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ +endif $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ctrmm_kernel_RC.s m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@ rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ 
+endif $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_ln.s m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@ rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ +endif $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_lt.s m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@ rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ +endif $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lr.s m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@ rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ +endif $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lc.s m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@ rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ +endif $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rn.s m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@ rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ +endif $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rt.s m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@ rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ +endif $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rr.s m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@ rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ 
+endif $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rc.s m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@ rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ +endif + else $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ @@ -804,10 +958,14 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ +endif $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ @@ -931,16 +1089,17 @@ $(KDIR)strsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RT) $(ST $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@ $(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DTRSMDEPEND) -# $(CC) $(CFLAGS) -E $< -o dtrsm_kernel_ln.s -# m4 dtrsm_kernel_ln.s > dtrsm_kernel_ln_nomacros.s $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@ -# rm dtrsm_kernel_ln.s dtrsm_kernel_ln_nomacros.s $(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o dtrsm_kernel_lt.s m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@ rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s +else + $(CC) $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@ +endif $(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@ @@ -2180,10 +2339,14 @@ $(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMM $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ $(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) +ifeq ($(OS), AIX) $(CC) $(PFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ +endif $(KDIR)cgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ @@ -2222,10 +2385,14 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ 
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) +ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s m4 strmmkernel_rn.s > strmm_kernel_rt_nomacros.s $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ +endif $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ From a4896b5538e5a3299acd6857b055e58fc3cce398 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 4 Dec 2019 11:06:03 +0100 Subject: [PATCH 05/27] Update DYNAMIC_ARCH support for ARM64 and PPC (#2332) * Update DYNAMIC_ARCH list of ARM64 targets for gmake * Update arm64 cpu list for runtime detection * Update DYNAMIC_ARCH list of ARM64 targets for cmake and add POWERPC targets --- Makefile.arm64 | 3 ++ Makefile.system | 6 +++ cmake/arch.cmake | 6 ++- cmake/prebuild.cmake | 77 +++++++++++++++++++++++++++++++++++ driver/others/dynamic_arm64.c | 56 +++++++++++++++++++++---- 5 files changed, 138 insertions(+), 10 deletions(-) diff --git a/Makefile.arm64 b/Makefile.arm64 index 4d10ff684..c17ea7938 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -39,7 +39,10 @@ CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 endif +ifeq ($(GCCVERSIONGTEQ9), 1) ifeq ($(CORE), TSV110) CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 endif +endif + diff --git a/Makefile.system b/Makefile.system index 4cb4dc954..ab2ffca52 100644 --- a/Makefile.system +++ b/Makefile.system @@ -326,6 +326,7 @@ ifeq ($(C_COMPILER), GCC) GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) +GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) ifeq ($(GCCVERSIONGT4), 1) # GCC Major version > 4 @@ -547,9 +548,14 @@ endif ifeq ($(ARCH), arm64) DYNAMIC_CORE = ARMV8 +DYNAMIC_CORE += CORTEXA53 DYNAMIC_CORE += CORTEXA57 +DYNAMIC_CORE += CORTEXA72 +DYNAMIC_CORE += CORTEXA73 +DYNAMIC_CORE += FALKOR DYNAMIC_CORE += THUNDERX DYNAMIC_CORE += THUNDERX2T99 +DYNAMIC_CORE += TSV110 endif ifeq ($(ARCH), power) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index f3ae84fe0..8280d6274 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -45,7 +45,11 @@ endif () if (DYNAMIC_ARCH) if (ARM64) - set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99) + set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110) + endif () + + if (POWER) + set(DYNAMIC_CORE POWER6 POWER8 POWER9) endif () if (X86) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 086df1943..c6d109356 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -309,6 +309,83 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "TSV110") + file(APPEND ${TARGET_CONF_TEMP} + "#define ARMV8\n" + "#define L1_CODE_SIZE\t65536\n" + "#define L1_CODE_LINESIZE\t64\n" + "#define L1_CODE_ASSOCIATIVE\t4\n" + "#define 
L1_DATA_SIZE\t65536\n" + "#define L1_DATA_LINESIZE\t64\n" + "#define L1_DATA_ASSOCIATIVE\t4\n" + "#define L2_SIZE\t524288\n" + "#define L2_LINESIZE\t64\n" + "#define L2_ASSOCIATIVE\t8\n" + "#define DTB_DEFAULT_ENTRIES\t64\n" + "#define DTB_SIZE\t4096\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 4) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "POWER6") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_DATA_SIZE 32768\n" + "#define L1_DATA_LINESIZE 128\n" + "#define L2_SIZE 524288\n" + "#define L2_LINESIZE 128 \n" + "#define DTB_DEFAULT_ENTRIES 128\n" + "#define DTB_SIZE 4096\n" + "#define L2_ASSOCIATIVE 8\n") + set(SGEMM_UNROLL_M 4) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 4) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 2) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 2) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 8) + elseif ("${TCORE}" STREQUAL "POWER8") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_DATA_SIZE 32768\n" + "#define L1_DATA_LINESIZE 128\n" + "#define L2_SIZE 524288\n" + "#define L2_LINESIZE 128 \n" + "#define DTB_DEFAULT_ENTRIES 128\n" + "#define DTB_SIZE 4096\n" + "#define L2_ASSOCIATIVE 8\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 8) + set(DGEMM_UNROLL_M 16) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 8) + set(ZGEMM_UNROLL_N 2) + set(SYMV_P 8) + elseif ("${TCORE}" STREQUAL "POWER9") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_DATA_SIZE 32768\n" + "#define L1_DATA_LINESIZE 128\n" + "#define L2_SIZE 524288\n" + "#define L2_LINESIZE 128 \n" + "#define DTB_DEFAULT_ENTRIES 128\n" + "#define DTB_SIZE 4096\n" + "#define L2_ASSOCIATIVE 8\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 8) + set(DGEMM_UNROLL_M 16) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 8) + set(ZGEMM_UNROLL_N 2) + set(SYMV_P 8) endif() # Or should this actually be NUM_CORES? 
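The dynamic_arm64.c changes below extend the runtime core detection, which branches on the implementer and part-number fields of the CPU identification register. As a minimal sketch of that mapping (illustrative only, not part of the patch; the field offsets follow the ARMv8 MIDR_EL1 layout, and the implementer/part values are the ones handled in the updated get_coretype()):

/* Illustrative sketch: decode a MIDR_EL1 value into the core names used by
 * the dynamic dispatch table. Not part of the patch; helper name is made up. */
#include <stdint.h>

static const char *midr_to_core(uint32_t midr) {
    uint32_t implementer = (midr >> 24) & 0xff;   /* MIDR bits [31:24] */
    uint32_t part        = (midr >>  4) & 0xfff;  /* MIDR bits [15:4]  */
    switch (implementer) {
    case 0x41:                                    /* ARM */
        if (part == 0xd03) return "cortexa53";
        if (part == 0xd07) return "cortexa57";
        if (part == 0xd08) return "cortexa72";
        if (part == 0xd09) return "cortexa73";
        break;
    case 0x48:                                    /* HiSilicon */
        if (part == 0xd01) return "tsv110";
        break;
    case 0x51:                                    /* Qualcomm */
        if (part == 0xc00) return "falkor";
        break;
    }
    return "armv8";                               /* generic ARMV8 fallback */
}
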
diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 9db9ba17d..72f5fcca2 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -43,13 +43,18 @@ #endif extern gotoblas_t gotoblas_ARMV8; +extern gotoblas_t gotoblas_CORTEXA53; extern gotoblas_t gotoblas_CORTEXA57; +extern gotoblas_t gotoblas_CORTEXA72; +extern gotoblas_t gotoblas_CORTEXA73; +extern gotoblas_t gotoblas_FALKOR; extern gotoblas_t gotoblas_THUNDERX; extern gotoblas_t gotoblas_THUNDERX2T99; +extern gotoblas_t gotoblas_TSV110; extern void openblas_warning(int verbose, const char * msg); -#define NUM_CORETYPES 4 +#define NUM_CORETYPES 9 /* * In case asm/hwcap.h is outdated on the build system, make sure @@ -65,17 +70,27 @@ extern void openblas_warning(int verbose, const char * msg); static char *corename[] = { "armv8", + "cortexa53", "cortexa57", + "cortexa72", + "cortexa73", + "falkor", "thunderx", "thunderx2t99", + "tsv110", "unknown" }; char *gotoblas_corename(void) { if (gotoblas == &gotoblas_ARMV8) return corename[ 0]; - if (gotoblas == &gotoblas_CORTEXA57) return corename[ 1]; - if (gotoblas == &gotoblas_THUNDERX) return corename[ 2]; - if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 3]; + if (gotoblas == &gotoblas_CORTEXA53) return corename[ 1]; + if (gotoblas == &gotoblas_CORTEXA57) return corename[ 2]; + if (gotoblas == &gotoblas_CORTEXA72) return corename[ 3]; + if (gotoblas == &gotoblas_CORTEXA73) return corename[ 4]; + if (gotoblas == &gotoblas_FALKOR) return corename[ 5]; + if (gotoblas == &gotoblas_THUNDERX) return corename[ 6]; + if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 7]; + if (gotoblas == &gotoblas_TSV110) return corename[ 8]; return corename[NUM_CORETYPES]; } @@ -96,9 +111,14 @@ static gotoblas_t *force_coretype(char *coretype) { switch (found) { case 0: return (&gotoblas_ARMV8); - case 1: return (&gotoblas_CORTEXA57); - case 2: return (&gotoblas_THUNDERX); - case 3: return (&gotoblas_THUNDERX2T99); + case 1: return (&gotoblas_CORTEXA53); + case 2: return (&gotoblas_CORTEXA57); + case 3: return (&gotoblas_CORTEXA72); + case 4: return (&gotoblas_CORTEXA73); + case 5: return (&gotoblas_FALKOR); + case 6: return (&gotoblas_THUNDERX); + case 7: return (&gotoblas_THUNDERX2T99); + case 8: return (&gotoblas_TSV110); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); @@ -136,10 +156,14 @@ static gotoblas_t *get_coretype(void) { case 0x41: // ARM switch (part) { - case 0xd07: // Cortex A57 - case 0xd08: // Cortex A72 case 0xd03: // Cortex A53 + return &gotoblas_CORTEXA53; + case 0xd07: // Cortex A57 return &gotoblas_CORTEXA57; + case 0xd08: // Cortex A72 + return &gotoblas_CORTEXA72; + case 0xd09: // Cortex A73 + return &gotoblas_CORTEXA73; } break; case 0x42: // Broadcom @@ -158,6 +182,20 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_THUNDERX2T99; } break; + case 0x48: // HiSilicon + switch (part) + { + case 0xd01: // tsv110 + return &gotoblas_TSV110; + } + break; + case 0x51: // Qualcomm + switch (part) + { + case 0xc00: // Falkor + return &gotoblas_FALKOR; + } + break; } return NULL; } From 6baa9b07d7e88f93ef42db4e96fa3d2be035c3d4 Mon Sep 17 00:00:00 2001 From: Kavana Bhat Date: Fri, 6 Dec 2019 04:33:32 -0600 Subject: [PATCH 06/27] AIX changes for Power8 --- common_power.h | 8 ++++---- kernel/Makefile.L3 | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/common_power.h b/common_power.h index 76b9f0f32..9df398266 100644 --- a/common_power.h +++ b/common_power.h @@ -60,10 
+60,10 @@ #define XXSWAPD(T,A) xxswapd T, A #define XVMOVDP(T,A) xvmovdp T, A -#define XXSPLTD_S(T,A,z) "xxspltd T, A, z \n\t" -#define XXMRGHD_S(T,A,B) "xxmrghd T, A, B \n\t" -#define XXMRGLD_S(T,A,B) "xxmrgld T, A, B \n\t" -#define XXSWAPD_S(T,A) "xxswapd T, A" +#define XXSPLTD_S(T,A,z) "xxspltd " str(T) ", " str(A) ", " str(z)" \n\t" +#define XXMRGHD_S(T,A,B) "xxmrghd " str(T) ", " str(A) ", " str(B)" \n\t" +#define XXMRGLD_S(T,A,B) "xxmrgld " str(T) ", " str(A) ", " str(B)" \n\t" +#define XXSWAPD_S(T,A) "xxswapd " str(T) ", " str(A) " \n\t" #endif diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 4decfbd20..c36a44f20 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -1098,7 +1098,7 @@ ifeq ($(OS), AIX) $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@ rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s else - $(CC) $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@ + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@ endif $(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND) From b28db31429d9b3b6a57a182d79e63aafdd2843f1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 6 Dec 2019 21:23:56 +0100 Subject: [PATCH 07/27] Support two-digit version numbers in gcc version check fixes #2336 (non-recognition of gcc 10) with patch provided by JeffreyALaw. --- f_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/f_check b/f_check index 993ad9a35..79b24e2dc 100644 --- a/f_check +++ b/f_check @@ -71,7 +71,7 @@ if ($compiler eq "") { if ($data =~ /GNU/) { - $data =~ /(\d)\.(\d).(\d)/; + $data =~ /(\d+)\.(\d+).(\d+)/; $major = $1; $minor = $2; From 13226e310195c7dd5e051c791c4f0839f2f606c4 Mon Sep 17 00:00:00 2001 From: Jehan Date: Wed, 11 Dec 2019 17:51:42 +0100 Subject: [PATCH 08/27] driver: more reasonable thread wait timeout on Windows. It used to be 5ms, which might not be long enough in some cases for the thread to exit well, but then when set to 5000 (5s), it would slow down any program depending on OpenBlas. Let's just set it to 50ms, which is at least 10 times longer than originally, but still reasonable in case of failed thread termination. 
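The pattern being tuned is a bounded join: wait briefly for each worker thread to exit on its own, and only fall back to forced termination where the desktop WINAPI allows it. A minimal sketch of that step (illustrative only; the real loop lives in blas_server_win32.c and the helper name here is made up):

/* Illustrative sketch of the bounded shutdown wait, not part of the patch. */
#include <windows.h>

static void join_worker(HANDLE thread) {
    DWORD rc = WaitForSingleObject(thread, 50);   /* 50 ms, the new timeout */
#ifndef OS_WINDOWSSTORE
    if (rc == WAIT_TIMEOUT)
        TerminateThread(thread, 0);               /* last resort, desktop/system APIs only */
#endif
    CloseHandle(thread);
}
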
--- driver/others/blas_server_win32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index e27725baf..5ecc4428b 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -462,7 +462,7 @@ int BLASFUNC(blas_thread_shutdown)(void){ for(i = 0; i < blas_num_threads - 1; i++){ // Could also just use WaitForMultipleObjects - DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 5000); + DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50); #ifndef OS_WINDOWSSTORE // TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP From aeef942c4f2a17099d82307d482abdec53bd3fbd Mon Sep 17 00:00:00 2001 From: w00421467 Date: Tue, 17 Dec 2019 10:00:13 +0800 Subject: [PATCH 09/27] use arm neon instructions to optimize gemm beta operation --- kernel/arm64/KERNEL | 2 +- kernel/arm64/dgemm_beta.S | 178 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 179 insertions(+), 1 deletion(-) create mode 100644 kernel/arm64/dgemm_beta.S diff --git a/kernel/arm64/KERNEL b/kernel/arm64/KERNEL index f936cdf47..440257196 100644 --- a/kernel/arm64/KERNEL +++ b/kernel/arm64/KERNEL @@ -34,7 +34,7 @@ ifndef SGEMM_BETA SGEMM_BETA = ../generic/gemm_beta.c endif ifndef DGEMM_BETA -DGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../arm64/dgemm_beta.S endif ifndef CGEMM_BETA CGEMM_BETA = ../generic/zgemm_beta.c diff --git a/kernel/arm64/dgemm_beta.S b/kernel/arm64/dgemm_beta.S new file mode 100644 index 000000000..636954695 --- /dev/null +++ b/kernel/arm64/dgemm_beta.S @@ -0,0 +1,178 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M x0 +#define N x1 +#define BETA d0 +#define LDC x6 +#define C00 x7 + +#define A01 x8 +#define A02 x9 +#define A03 x10 +#define A04 x11 + +#define beta0 d11 +#define betaV0 v11.d[0] +#define I x16 + +#define size 128 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + ldr LDC, [sp] + SAVE_REGS + +.Lgemm_beta_BEGIN: + + fmov beta0, BETA + cmp N, #0 + ble .Lgemm_beta_L999 + +.Lgemm_beta_01: + + lsl LDC, LDC, #3 + + .align 5 +.Lgemm_beta_02: + + mov A01, C00 + add C00, C00, LDC + asr I, M, #4 + cmp I, #0 + ble .Lgemm_beta_04 + add A02, A01, #32 + add A03, A02, #32 + add A04, A03, #32 + + .align 5 +.Lgemm_beta_03: + + ldp q0, q1, [A01] + ldp q2, q3, [A02] + ldp q4, q5, [A03] + ldp q6, q7, [A04] + + fmul v0.2d, v0.2d, betaV0 + fmul v1.2d, v1.2d, betaV0 + + fmul v2.2d, v2.2d, betaV0 + fmul v3.2d, v3.2d, betaV0 + + fmul v4.2d, v4.2d, betaV0 + fmul v5.2d, v5.2d, betaV0 + + fmul v6.2d, v6.2d, betaV0 + fmul v7.2d, v7.2d, betaV0 + + st1 {v0.2d, v1.2d}, [A01] + add A01, A01, size + st1 {v2.2d, v3.2d}, [A02] + add A02, A02, size + st1 {v4.2d, v5.2d}, [A03] + add A03, A03, size + st1 {v6.2d, v7.2d}, [A04] + add A04, A04, size + + subs I , I , #1 + bne .Lgemm_beta_03 + + .align 5 +.Lgemm_beta_04: + + and I, M , #15 // M%16 + cmp I, #0 + ble .Lgemm_beta_06 + + .align 5 +.Lgemm_beta_05: + + ldr d12, [A01] + fmul d12, d12, beta0 + str d12, [A01] + add A01, A01, #8 + + subs I , I , #1 + bne .Lgemm_beta_05 + + .align 5 +.Lgemm_beta_06: + + subs N , N, #1 // N-- + bne .Lgemm_beta_02 + + .align 5 +.Lgemm_beta_L999: + + mov x0, #0 + RESTORE_REGS + ret + + EPILOGUE From b7cc69ee622fed9039ab755b87eee9279d27d541 Mon Sep 17 00:00:00 2001 From: w00421467 Date: Fri, 20 Dec 2019 10:11:50 +0800 Subject: [PATCH 10/27] declare DGEMM_BETA in KERNEL.ARMV8 rather than the generic KERNEL --- kernel/arm64/KERNEL | 2 +- kernel/arm64/KERNEL.ARMV8 | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/arm64/KERNEL b/kernel/arm64/KERNEL index 440257196..f936cdf47 100644 --- a/kernel/arm64/KERNEL +++ b/kernel/arm64/KERNEL @@ -34,7 +34,7 @@ ifndef SGEMM_BETA SGEMM_BETA = ../generic/gemm_beta.c endif ifndef DGEMM_BETA -DGEMM_BETA = ../arm64/dgemm_beta.S +DGEMM_BETA = ../generic/gemm_beta.c endif ifndef CGEMM_BETA 
CGEMM_BETA = ../generic/zgemm_beta.c diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 index efc1ec8bc..b90dd228b 100644 --- a/kernel/arm64/KERNEL.ARMV8 +++ b/kernel/arm64/KERNEL.ARMV8 @@ -102,6 +102,8 @@ CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S DSDOTKERNEL = dot.S +DGEMM_BETA = dgemm_beta.S + SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) From d573d24de7cda411edbf0675c7c2e2dd8cdb896f Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sat, 21 Dec 2019 14:35:15 +0800 Subject: [PATCH 11/27] Fast Haswell ZGEMM kernel --- kernel/x86_64/zgemm_kernel_4x2_haswell.c | 240 +++++++++++++++++++++++ 1 file changed, 240 insertions(+) create mode 100644 kernel/x86_64/zgemm_kernel_4x2_haswell.c diff --git a/kernel/x86_64/zgemm_kernel_4x2_haswell.c b/kernel/x86_64/zgemm_kernel_4x2_haswell.c new file mode 100644 index 000000000..3279b8b8c --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_4x2_haswell.c @@ -0,0 +1,240 @@ +#include "common.h" +#include + +/* recommended settings: GEMM_P = 192, GEMM_Q = 192 */ + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + #define A_CONJ 0 + #define B_CONJ 0 +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + #define A_CONJ 1 + #define B_CONJ 0 +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + #define A_CONJ 0 + #define B_CONJ 1 +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + #define A_CONJ 1 + #define B_CONJ 1 +#endif + +/* %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc(bytes), %5 = k_counter, %6 = &alpha, %7 = m_counter, %8 = b_pref */ +/* r11 = m, r12 = k << 5, r13 = k, r14 = b_head, r15 = temp */ + +/* m=4, ymm 0-3 temp, ymm 4-15 acc */ +#if A_CONJ == B_CONJ + #define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231pd %%ymm"#ar",%%ymm"#b2",%%ymm"#cl"; vfmadd231pd %%ymm"#ai",%%ymm"#b2",%%ymm"#cr";" + #define acc_m4n1_con(ua,la,b1,uc,lc) "vfmaddsub231pd %%ymm"#ua",%%ymm"#b1",%%ymm"#uc"; vfmaddsub231pd %%ymm"#la",%%ymm"#b1",%%ymm"#lc";" +#else + #define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231pd %%ymm"#ar",%%ymm"#b2",%%ymm"#cl"; vfnmadd231pd %%ymm"#ai",%%ymm"#b2",%%ymm"#cr";" + #define acc_m4n1_con(ua,la,b1,uc,lc) "vfmsubadd231pd %%ymm"#ua",%%ymm"#b1",%%ymm"#uc"; vfmsubadd231pd %%ymm"#la",%%ymm"#b1",%%ymm"#lc";" +#endif +/* expanded accumulators for m4n1 and m4n2 */ +#define KERNEL_k1m4n1 \ + "vbroadcastf128 (%1),%%ymm0; addq $16,%1;"\ + "vmovddup (%0),%%ymm1; vmovddup 8(%0),%%ymm2;" acc_m2n1_exp(1,2,0,4,5)\ + "vmovddup 32(%0),%%ymm1; vmovddup 40(%0),%%ymm2;" acc_m2n1_exp(1,2,0,6,7)\ + "addq $64,%0;" +#define KERNEL_k1m4n2 \ + "vbroadcastf128 (%1),%%ymm0; vbroadcastf128 16(%1),%%ymm1; addq $32,%1;"\ + "vmovddup (%0),%%ymm2; vmovddup 8(%0),%%ymm3;" acc_m2n1_exp(2,3,0,4,5) acc_m2n1_exp(2,3,1,8,9)\ + "vmovddup 32(%0),%%ymm2; vmovddup 40(%0),%%ymm3;" acc_m2n1_exp(2,3,0,6,7) acc_m2n1_exp(2,3,1,10,11)\ + "addq $64,%0;" +/* contracted accumulators for m4n4 and m4n6 */ +#define acc_m4n2_con(ua,la,luc,llc,ruc,rlc,lboff,rboff,...) 
\ + "vbroadcastsd "#lboff"("#__VA_ARGS__"),%%ymm2;" acc_m4n1_con(ua,la,2,luc,llc)\ + "vbroadcastsd "#rboff"("#__VA_ARGS__"),%%ymm3;" acc_m4n1_con(ua,la,3,ruc,rlc) +#define KERNEL_1_k1m4n4 \ + "vmovupd (%0),%%ymm0; vmovupd 32(%0),%%ymm1; prefetcht0 512(%0); addq $64,%0;"\ + acc_m4n2_con(0,1,4,5,6,7,0,16,%1) acc_m4n2_con(0,1,8,9,10,11,0,16,%1,%%r12,1) +#define KERNEL_2_k1m4n4 \ + "vpermilpd $5,%%ymm0,%%ymm0; vpermilpd $5,%%ymm1,%%ymm1;"\ + acc_m4n2_con(0,1,4,5,6,7,8,24,%1) acc_m4n2_con(0,1,8,9,10,11,8,24,%1,%%r12,1) +#define KERNEL_1_k1m4n6 KERNEL_1_k1m4n4 acc_m4n2_con(0,1,12,13,14,15,0,16,%1,%%r12,2) +#define KERNEL_2_k1m4n6 KERNEL_2_k1m4n4 acc_m4n2_con(0,1,12,13,14,15,8,24,%1,%%r12,2) +#define KERNEL_k1m4n4 KERNEL_1_k1m4n4 KERNEL_2_k1m4n4 "addq $32,%1;" +#define KERNEL_k1m4n6 KERNEL_1_k1m4n6 KERNEL_2_k1m4n6 "addq $32,%1;" +#define zero_4ymm(no1,no2,no3,no4) \ + "vpxor %%ymm"#no1",%%ymm"#no1",%%ymm"#no1"; vpxor %%ymm"#no2",%%ymm"#no2",%%ymm"#no2";"\ + "vpxor %%ymm"#no3",%%ymm"#no3",%%ymm"#no3"; vpxor %%ymm"#no4",%%ymm"#no4",%%ymm"#no4";" +/* initialization and storage macros */ +#define INIT_m4n1 zero_4ymm(4,5,6,7) +#define INIT_m4n2 zero_4ymm(4,5,6,7) zero_4ymm(8,9,10,11) +#define INIT_m4n4 zero_4ymm(4,5,6,7) zero_4ymm(8,9,10,11) +#define INIT_m4n6 INIT_m4n4 zero_4ymm(12,13,14,15) +#if A_CONJ == B_CONJ + #define cont_expacc(cl,cr,dst) "vpermilpd $5,%%ymm"#cr",%%ymm"#cr"; vaddsubpd %%ymm"#cl",%%ymm"#cr",%%ymm"#dst";" +#else + #define cont_expacc(cl,cr,dst) "vpermilpd $5,%%ymm"#cr",%%ymm"#cr"; vaddsubpd %%ymm"#cr",%%ymm"#cl",%%ymm"#dst";" +#endif +#if A_CONJ == 0 + #define save_1ymm(c,tmp,off,alpr,alpi,...) \ + "vpermilpd $5,%%ymm"#c",%%ymm"#tmp"; vfmsubadd213pd "#off"("#__VA_ARGS__"),%%ymm"#alpr",%%ymm"#c";"\ + "vfmsubadd231pd %%ymm"#tmp",%%ymm"#alpi",%%ymm"#c"; vmovupd %%ymm"#c","#off"("#__VA_ARGS__");" +#else + #define save_1ymm(c,tmp,off,alpr,alpi,...) 
\ + "vpermilpd $5,%%ymm"#c",%%ymm"#tmp"; vfmaddsub213pd "#off"("#__VA_ARGS__"),%%ymm"#alpi",%%ymm"#tmp";"\ + "vfmaddsub231pd %%ymm"#c",%%ymm"#alpr",%%ymm"#tmp"; vmovupd %%ymm"#tmp","#off"("#__VA_ARGS__");" +#endif +#define save_init_m4 "movq %2,%3; addq $64,%2; vbroadcastsd (%6),%%ymm0; vbroadcastsd 8(%6),%%ymm1;" +#define SAVE_m4n1 save_init_m4 cont_expacc(4,5,4) cont_expacc(6,7,6) save_1ymm(4,2,0,0,1,%3) save_1ymm(6,3,32,0,1,%3) +#define SAVE_m4n2 SAVE_m4n1\ + cont_expacc(8,9,8) cont_expacc(10,11,10) save_1ymm(8,2,0,0,1,%3,%4,1) save_1ymm(10,3,32,0,1,%3,%4,1) +#define SAVE_m4n4 save_init_m4\ + save_1ymm(4,2,0,0,1,%3) save_1ymm(5,3,32,0,1,%3) save_1ymm(6,2,0,0,1,%3,%4,1) save_1ymm(7,3,32,0,1,%3,%4,1) "leaq (%3,%4,2),%3;"\ + save_1ymm(8,2,0,0,1,%3) save_1ymm(9,3,32,0,1,%3) save_1ymm(10,2,0,0,1,%3,%4,1) save_1ymm(11,3,32,0,1,%3,%4,1) +#define SAVE_m4n6 SAVE_m4n4 "leaq (%3,%4,2),%3;"\ + save_1ymm(12,2,0,0,1,%3) save_1ymm(13,3,32,0,1,%3) save_1ymm(14,2,0,0,1,%3,%4,1) save_1ymm(15,3,32,0,1,%3,%4,1) +#define COMPUTE_m4(ndim) \ + "movq %%r14,%1;" INIT_m4n##ndim "movq %2,%3; movq %%r13,%5;"\ + "testq %5,%5; jz "#ndim"4443f; cmpq $10,%5; jb "#ndim"4442f;"\ + "movq $10,%5; movq $84,%%r15;"\ + #ndim"4441:\n\t"\ + "prefetcht1 (%3); subq $63,%3; addq %%r15,%3;"\ + "prefetcht0 96(%1); prefetcht0 96(%1,%%r12,1); prefetcht0 96(%1,%%r12,2);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\ + "testq $12,%5; movq $84,%%r15; cmovz %4,%%r15; prefetcht1 (%8); addq $16,%8;"\ + "prefetcht0 96(%1); prefetcht0 96(%1,%%r12,1); prefetcht0 96(%1,%%r12,2);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\ + "addq $4,%5; cmpq %5,%%r13; jnb "#ndim"4441b;"\ + "movq %2,%3; negq %5; leaq 10(%%r13,%5,1),%5; prefetcht0 (%6); prefetcht0 15(%6);"\ + #ndim"4442:\n\t"\ + "prefetcht0 (%3); prefetcht0 63(%3); addq %4,%3;"\ + KERNEL_k1m4n##ndim "decq %5; jnz "#ndim"4442b;"\ + #ndim"4443:\n\t"\ + "prefetcht0 (%%r14); prefetcht0 64(%%r14);" SAVE_m4n##ndim + +/* m=2, ymm 0-3 temp, ymm 4-15 acc, expanded accumulators */ +#define KERNEL_k1m2n1 \ + "vmovddup (%0),%%ymm1; vmovddup 8(%0),%%ymm2; addq $32,%0;"\ + "vbroadcastf128 (%1),%%ymm0;" acc_m2n1_exp(1,2,0,4,5) "addq $16,%1;" +#define acc_m2n2_exp(c1l,c1r,c2l,c2r,...) 
\ + "vbroadcastf128 ("#__VA_ARGS__"),%%ymm2;" acc_m2n1_exp(0,1,2,c1l,c1r)\ + "vbroadcastf128 16("#__VA_ARGS__"),%%ymm3;" acc_m2n1_exp(0,1,3,c2l,c2r) +#define KERNEL_h_k1m2n2 \ + "vmovddup (%0),%%ymm0; vmovddup 8(%0),%%ymm1; addq $32,%0;" acc_m2n2_exp(4,5,6,7,%1) +#define KERNEL_h_k1m2n4 KERNEL_h_k1m2n2 acc_m2n2_exp(8,9,10,11,%1,%%r12,1) +#define KERNEL_h_k1m2n6 KERNEL_h_k1m2n4 acc_m2n2_exp(12,13,14,15,%1,%%r12,2) +#define KERNEL_k1m2n2 KERNEL_h_k1m2n2 "addq $32,%1;" +#define KERNEL_k1m2n4 KERNEL_h_k1m2n4 "addq $32,%1;" +#define KERNEL_k1m2n6 KERNEL_h_k1m2n6 "addq $32,%1;" +#define INIT_m2n1 "vpxor %%ymm4,%%ymm4,%%ymm4; vpxor %%ymm5,%%ymm5,%%ymm5;" +#define INIT_m2n2 zero_4ymm(4,5,6,7) +#define INIT_m2n4 INIT_m2n2 zero_4ymm(8,9,10,11) +#define INIT_m2n6 INIT_m2n4 zero_4ymm(12,13,14,15) +#define save_init_m2 "movq %2,%3; addq $32,%2; vbroadcastsd (%6),%%ymm0; vbroadcastsd 8(%6),%%ymm1;" +#define SAVE_m2n1 save_init_m2 cont_expacc(4,5,4) save_1ymm(4,2,0,0,1,%3) +#define SAVE_m2n2 SAVE_m2n1 cont_expacc(6,7,6) save_1ymm(6,3,0,0,1,%3,%4,1) +#define SAVE_m2n4 SAVE_m2n2 "leaq (%3,%4,2),%3;"\ + cont_expacc(8,9,8) cont_expacc(10,11,10) save_1ymm(8,2,0,0,1,%3) save_1ymm(10,3,0,0,1,%3,%4,1) +#define SAVE_m2n6 SAVE_m2n4 "leaq (%3,%4,2),%3;"\ + cont_expacc(12,13,12) cont_expacc(14,15,14) save_1ymm(12,2,0,0,1,%3) save_1ymm(14,3,0,0,1,%3,%4,1) +#define COMPUTE_m2(ndim) \ + "movq %%r14,%1;" INIT_m2n##ndim "movq %%r13,%5;"\ + "testq %5,%5; jz "#ndim"2222f;"\ + #ndim"2221:\n\t"\ + KERNEL_k1m2n##ndim\ + "decq %5; jnz "#ndim"2221b;"\ + #ndim"2222:\n\t"\ + SAVE_m2n##ndim + +/* m=1, vmm 0-3 temp, vmm 4-15 acc, expanded accumulators */ +#if A_CONJ == B_CONJ + #define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231pd %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231pd %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";" + #define acc_m1n2_exp(arb,aib,b4,cl,cr) "vfmadd231pd %%ymm"#arb",%%ymm"#b4",%%ymm"#cl"; vfmadd231pd %%ymm"#aib",%%ymm"#b4",%%ymm"#cr";" +#else + #define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231pd %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfnmadd231pd %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";" + #define acc_m1n2_exp(arb,aib,b4,cl,cr) "vfmadd231pd %%ymm"#arb",%%ymm"#b4",%%ymm"#cl"; vfnmadd231pd %%ymm"#aib",%%ymm"#b4",%%ymm"#cr";" +#endif +#define KERNEL_k1m1n1 \ + "vmovddup (%0),%%xmm0; vmovddup 8(%0),%%xmm1; addq $16,%0;"\ + "vmovupd (%1),%%xmm2; addq $16,%1;" acc_m1n1_exp(0,1,2,4,5) +#define KERNEL_h_k1m1n2 \ + "vbroadcastsd (%0),%%ymm0; vbroadcastsd 8(%0),%%ymm1; addq $16,%0;"\ + "vmovupd (%1),%%ymm2;" acc_m1n2_exp(0,1,2,4,5) +#define KERNEL_h_k1m1n4 KERNEL_h_k1m1n2 "vmovupd (%1,%%r12,1),%%ymm2;" acc_m1n2_exp(0,1,2,6,7) +#define KERNEL_h_k1m1n6 KERNEL_h_k1m1n4 "vmovupd (%1,%%r12,2),%%ymm2;" acc_m1n2_exp(0,1,2,8,9) +#define KERNEL_k1m1n2 KERNEL_h_k1m1n2 "addq $32,%1;" +#define KERNEL_k1m1n4 KERNEL_h_k1m1n4 "addq $32,%1;" +#define KERNEL_k1m1n6 KERNEL_h_k1m1n6 "addq $32,%1;" +#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4; vpxor %%xmm5,%%xmm5,%%xmm5;" +#define INIT_m1n2 "vpxor %%ymm4,%%ymm4,%%ymm4; vpxor %%ymm5,%%ymm5,%%ymm5;" +#define INIT_m1n4 INIT_m1n2 "vpxor %%ymm6,%%ymm6,%%ymm6; vpxor %%ymm7,%%ymm7,%%ymm7;" +#define INIT_m1n6 INIT_m1n4 "vpxor %%ymm8,%%ymm8,%%ymm8; vpxor %%ymm9,%%ymm9,%%ymm9;" +#if A_CONJ == B_CONJ + #define cont_expxmmacc(cl,cr,dst) "vpermilpd $5,%%xmm"#cr",%%xmm"#cr"; vaddsubpd %%xmm"#cl",%%xmm"#cr",%%xmm"#dst";" +#else + #define cont_expxmmacc(cl,cr,dst) "vpermilpd $5,%%xmm"#cr",%%xmm"#cr"; vaddsubpd %%xmm"#cr",%%xmm"#cl",%%xmm"#dst";" +#endif +#if A_CONJ == 0 + #define save_m1n1(c,tmp,alpr,alpi) \ + "vpermilpd 
$5,%%xmm"#c",%%xmm"#tmp"; vfmsubadd213pd (%3),%%xmm"#alpr",%%xmm"#c";"\ + "vfmsubadd231pd %%xmm"#tmp",%%xmm"#alpi",%%xmm"#c"; vmovupd %%xmm"#c",(%3);" + #define save_m1n2(c,tmp1,tmp2,alpr,alpi) \ + "vpermilpd $5,%%ymm"#c",%%ymm"#tmp1"; vmovupd (%3),%%xmm"#tmp2"; vinsertf128 $1,(%3,%4,1),%%ymm"#tmp2",%%ymm"#tmp2";"\ + "vfmsubadd213pd %%ymm"#tmp2",%%ymm"#alpr",%%ymm"#c"; vfmsubadd231pd %%ymm"#tmp1",%%ymm"#alpi",%%ymm"#c";"\ + "vmovupd %%xmm"#c",(%3); vextractf128 $1,%%ymm"#c",(%3,%4,1); leaq (%3,%4,2),%3;" +#else + #define save_m1n1(c,tmp,alpr,alpi) \ + "vpermilpd $5,%%xmm"#c",%%xmm"#tmp"; vfmaddsub213pd (%3),%%xmm"#alpi",%%xmm"#tmp";"\ + "vfmaddsub231pd %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp"; vmovupd %%xmm"#tmp",(%3);" + #define save_m1n2(c,tmp1,tmp2,alpr,alpi) \ + "vpermilpd $5,%%ymm"#c",%%ymm"#tmp1"; vmovupd (%3),%%xmm"#tmp2"; vinsertf128 $1,(%3,%4,1),%%ymm"#tmp2",%%ymm"#tmp2";"\ + "vfmaddsub213pd %%ymm"#tmp2",%%ymm"#alpi",%%ymm"#tmp1"; vfmaddsub231pd %%ymm"#c",%%ymm"#alpr",%%ymm"#tmp1";"\ + "vmovupd %%xmm"#tmp1",(%3); vextractf128 $1,%%ymm"#tmp1",(%3,%4,1); leaq (%3,%4,2),%3;" +#endif +#define save_init_m1 "movq %2,%3; addq $16,%2; vbroadcastsd (%6),%%ymm0; vbroadcastsd 8(%6),%%ymm1;" +#define SAVE_m1n1 save_init_m1 cont_expxmmacc(4,5,4) save_m1n1(4,2,0,1) +#define SAVE_m1n2 save_init_m1 cont_expacc(4,5,4) save_m1n2(4,2,3,0,1) +#define SAVE_m1n4 SAVE_m1n2 cont_expacc(6,7,6) save_m1n2(6,2,3,0,1) +#define SAVE_m1n6 SAVE_m1n4 cont_expacc(8,9,8) save_m1n2(8,2,3,0,1) +#define COMPUTE_m1(ndim) \ + "movq %%r14,%1;" INIT_m1n##ndim "movq %%r13,%5;"\ + "testq %5,%5; jz "#ndim"1112f;"\ + #ndim"1111:\n\t"\ + KERNEL_k1m1n##ndim\ + "decq %5; jnz "#ndim"1111b;"\ + #ndim"1112:\n\t"\ + SAVE_m1n##ndim + +#define COMPUTE(ndim) {\ + b_pref = b_ptr + ndim * K *2;\ + __asm__ __volatile__ (\ + "movq %1,%%r14; movq %5,%%r13; movq %5,%%r12; salq $5,%%r12; movq %7,%%r11;"\ + "cmpq $4,%7; jb "#ndim"9992f;"\ + #ndim"9991:\n\t"\ + COMPUTE_m4(ndim)\ + "subq $4,%7; cmpq $4,%7; jnb "#ndim"9991b;"\ + #ndim"9992:\n\t"\ + "cmpq $2,%7; jb "#ndim"9993f;"\ + COMPUTE_m2(ndim) "subq $2,%7;"\ + #ndim"9993:\n\t"\ + "testq %7,%7; jz "#ndim"9994f;"\ + COMPUTE_m1(ndim)\ + #ndim"9994:\n\t"\ + "movq %%r14,%1; movq %%r13,%5; movq %%r11,%7; vzeroupper;"\ + :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_in_bytes),"+r"(K),"+r"(alp),"+r"(M),"+r"(b_pref)\ + ::"cc","memory","r11","r12","r13","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5",\ + "xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ + a_ptr -= M * K *2; b_ptr += ndim * K *2; c_ptr += (ndim * LDC - M) * 2;\ +} + +int __attribute__ ((noinline)) +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alphar, double alphai, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG LDC) +{ + if(m==0||n==0||k==0||(alphar==0.0 && alphai==0.0)) return 0; + int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double) * 2; +#if A_CONJ == B_CONJ + double const_val[2] = {-alphar, -alphai}; +#else + double const_val[2] = {alphar, alphai}; +#endif + int64_t M = (int64_t)m, K = (int64_t)k; + BLASLONG n_count = n; + double *a_ptr = A,*b_ptr = B,*c_ptr = C,*c_tmp = C,*alp = const_val,*b_pref = B; + for(;n_count>5;n_count-=6) COMPUTE(6) + for(;n_count>3;n_count-=4) COMPUTE(4) + for(;n_count>1;n_count-=2) COMPUTE(2) + if(n_count>0) COMPUTE(1) + return 0; +} From f41d52665d589440dd5227b52025ea492bea4c6e Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sat, 21 Dec 2019 14:37:06 +0800 Subject: [PATCH 12/27] Fast 
Haswell ZGEMM kernel --- kernel/x86_64/KERNEL.HASWELL | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index f98728a41..5c11ced1d 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -67,7 +67,7 @@ CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZTRMMKERNEL = zgemm_kernel_4x2_haswell.S -ZGEMMKERNEL = zgemm_kernel_4x2_haswell.S +ZGEMMKERNEL = zgemm_kernel_4x2_haswell.c ZGEMMINCOPY = ../generic/zgemm_ncopy_4.c ZGEMMITCOPY = ../generic/zgemm_tcopy_4.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c From 105e26e12ac2283ec2bee50d03d02d77a2c92780 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sat, 21 Dec 2019 14:38:51 +0800 Subject: [PATCH 13/27] Adjust Haswell ZGEMM blocking parameters --- param.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/param.h b/param.h index d39fc4a1d..5fb0868b2 100644 --- a/param.h +++ b/param.h @@ -1572,7 +1572,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_P 768 #define DGEMM_DEFAULT_P 512 #define CGEMM_DEFAULT_P 384 -#define ZGEMM_DEFAULT_P 256 +#define ZGEMM_DEFAULT_P 192 #ifdef WINDOWS_ABI #define SGEMM_DEFAULT_Q 320 @@ -1582,7 +1582,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define DGEMM_DEFAULT_Q 256 #endif #define CGEMM_DEFAULT_Q 192 -#define ZGEMM_DEFAULT_Q 128 +#define ZGEMM_DEFAULT_Q 192 #define SGEMM_DEFAULT_R sgemm_r #define DGEMM_DEFAULT_R 13824 From 025741f16aeaafe0080b9065dbf2315762b286e4 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 23 Dec 2019 23:40:03 +0800 Subject: [PATCH 14/27] Fast Haswell CGEMM kernel --- kernel/x86_64/cgemm_kernel_8x2_haswell.c | 287 +++++++++++++++++++++++ 1 file changed, 287 insertions(+) create mode 100644 kernel/x86_64/cgemm_kernel_8x2_haswell.c diff --git a/kernel/x86_64/cgemm_kernel_8x2_haswell.c b/kernel/x86_64/cgemm_kernel_8x2_haswell.c new file mode 100644 index 000000000..49fef90db --- /dev/null +++ b/kernel/x86_64/cgemm_kernel_8x2_haswell.c @@ -0,0 +1,287 @@ +#include "common.h" +#include + +/* recommended settings: GEMM_P = 256, GEMM_Q = 256 */ + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + #define A_CONJ 0 + #define B_CONJ 0 +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + #define A_CONJ 1 + #define B_CONJ 0 +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + #define A_CONJ 0 + #define B_CONJ 1 +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + #define A_CONJ 1 + #define B_CONJ 1 +#endif + +/* %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc(bytes), %5 = k_counter, %6 = &alpha, %7 = m_counter, %8 = b_pref */ +/* r11 = m, r12 = k << 4, r13 = k, r14 = b_head, r15 = temp */ + +/* m=8, ymm 0-3 temp, ymm 4-15 acc */ +#if A_CONJ == B_CONJ + #define acc_m4n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%ymm"#ar",%%ymm"#b2",%%ymm"#cl"; vfmadd231ps %%ymm"#ai",%%ymm"#b2",%%ymm"#cr";" + #define acc_m8n1_con(ua,la,b1,uc,lc) "vfmaddsub231ps %%ymm"#ua",%%ymm"#b1",%%ymm"#uc"; vfmaddsub231ps %%ymm"#la",%%ymm"#b1",%%ymm"#lc";" +#else + #define acc_m4n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%ymm"#ar",%%ymm"#b2",%%ymm"#cl"; vfnmadd231ps %%ymm"#ai",%%ymm"#b2",%%ymm"#cr";" + #define acc_m8n1_con(ua,la,b1,uc,lc) "vfmsubadd231ps %%ymm"#ua",%%ymm"#b1",%%ymm"#uc"; vfmsubadd231ps %%ymm"#la",%%ymm"#b1",%%ymm"#lc";" +#endif +/* 
expanded accumulators for m8n1 and m8n2 */ +#define KERNEL_k1m8n1 \ + "vbroadcastsd (%1),%%ymm0; addq $8,%1;"\ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;" acc_m4n1_exp(1,2,0,4,5)\ + "vmovsldup 32(%0),%%ymm1; vmovshdup 32(%0),%%ymm2;" acc_m4n1_exp(1,2,0,6,7)\ + "addq $64,%0;" +#define KERNEL_k1m8n2 \ + "vbroadcastsd (%1),%%ymm0; vbroadcastsd 8(%1),%%ymm1; addq $16,%1;"\ + "vmovsldup (%0),%%ymm2; vmovshdup (%0),%%ymm3;" acc_m4n1_exp(2,3,0,4,5) acc_m4n1_exp(2,3,1,8,9)\ + "vmovsldup 32(%0),%%ymm2; vmovshdup 32(%0),%%ymm3;" acc_m4n1_exp(2,3,0,6,7) acc_m4n1_exp(2,3,1,10,11)\ + "addq $64,%0;" +/* contracted accumulators for m8n4 and m8n6 */ +#define acc_m8n2_con(ua,la,luc,llc,ruc,rlc,lboff,rboff,...) \ + "vbroadcastss "#lboff"("#__VA_ARGS__"),%%ymm2;" acc_m8n1_con(ua,la,2,luc,llc)\ + "vbroadcastss "#rboff"("#__VA_ARGS__"),%%ymm3;" acc_m8n1_con(ua,la,3,ruc,rlc) +#define KERNEL_1_k1m8n4 \ + "vmovups (%0),%%ymm0; vmovups 32(%0),%%ymm1; prefetcht0 512(%0); addq $64,%0;"\ + acc_m8n2_con(0,1,4,5,6,7,0,8,%1) acc_m8n2_con(0,1,8,9,10,11,0,8,%1,%%r12,1) +#define KERNEL_2_k1m8n4 \ + "vpermilps $177,%%ymm0,%%ymm0; vpermilps $177,%%ymm1,%%ymm1;"\ + acc_m8n2_con(0,1,4,5,6,7,4,12,%1) acc_m8n2_con(0,1,8,9,10,11,4,12,%1,%%r12,1) +#define KERNEL_1_k1m8n6 KERNEL_1_k1m8n4 acc_m8n2_con(0,1,12,13,14,15,0,8,%1,%%r12,2) +#define KERNEL_2_k1m8n6 KERNEL_2_k1m8n4 acc_m8n2_con(0,1,12,13,14,15,4,12,%1,%%r12,2) +#define KERNEL_k1m8n4 KERNEL_1_k1m8n4 KERNEL_2_k1m8n4 "addq $16,%1;" +#define KERNEL_k1m8n6 KERNEL_1_k1m8n6 KERNEL_2_k1m8n6 "addq $16,%1;" +#define zero_4ymm(no1,no2,no3,no4) \ + "vpxor %%ymm"#no1",%%ymm"#no1",%%ymm"#no1"; vpxor %%ymm"#no2",%%ymm"#no2",%%ymm"#no2";"\ + "vpxor %%ymm"#no3",%%ymm"#no3",%%ymm"#no3"; vpxor %%ymm"#no4",%%ymm"#no4",%%ymm"#no4";" +/* initialization and storage macros */ +#define INIT_m8n1 zero_4ymm(4,5,6,7) +#define INIT_m8n2 zero_4ymm(4,5,6,7) zero_4ymm(8,9,10,11) +#define INIT_m8n4 zero_4ymm(4,5,6,7) zero_4ymm(8,9,10,11) +#define INIT_m8n6 INIT_m8n4 zero_4ymm(12,13,14,15) +#if A_CONJ == B_CONJ + #define cont_expacc(cl,cr,dst) "vpermilps $177,%%ymm"#cr",%%ymm"#cr"; vaddsubps %%ymm"#cl",%%ymm"#cr",%%ymm"#dst";" +#else + #define cont_expacc(cl,cr,dst) "vpermilps $177,%%ymm"#cr",%%ymm"#cr"; vaddsubps %%ymm"#cr",%%ymm"#cl",%%ymm"#dst";" +#endif +#if A_CONJ == 0 + #define save_1ymm(c,tmp,off,alpr,alpi,...) \ + "vpermilps $177,%%ymm"#c",%%ymm"#tmp"; vfmsubadd213ps "#off"("#__VA_ARGS__"),%%ymm"#alpr",%%ymm"#c";"\ + "vfmsubadd231ps %%ymm"#tmp",%%ymm"#alpi",%%ymm"#c"; vmovups %%ymm"#c","#off"("#__VA_ARGS__");" +#else + #define save_1ymm(c,tmp,off,alpr,alpi,...) 
\ + "vpermilps $177,%%ymm"#c",%%ymm"#tmp"; vfmaddsub213ps "#off"("#__VA_ARGS__"),%%ymm"#alpi",%%ymm"#tmp";"\ + "vfmaddsub231ps %%ymm"#c",%%ymm"#alpr",%%ymm"#tmp"; vmovups %%ymm"#tmp","#off"("#__VA_ARGS__");" +#endif +#define save_init_m8 "movq %2,%3; addq $64,%2; vbroadcastss (%6),%%ymm0; vbroadcastss 4(%6),%%ymm1;" +#define SAVE_m8n1 save_init_m8 cont_expacc(4,5,4) cont_expacc(6,7,6) save_1ymm(4,2,0,0,1,%3) save_1ymm(6,3,32,0,1,%3) +#define SAVE_m8n2 SAVE_m8n1\ + cont_expacc(8,9,8) cont_expacc(10,11,10) save_1ymm(8,2,0,0,1,%3,%4,1) save_1ymm(10,3,32,0,1,%3,%4,1) +#define SAVE_m8n4 save_init_m8\ + save_1ymm(4,2,0,0,1,%3) save_1ymm(5,3,32,0,1,%3) save_1ymm(6,2,0,0,1,%3,%4,1) save_1ymm(7,3,32,0,1,%3,%4,1) "leaq (%3,%4,2),%3;"\ + save_1ymm(8,2,0,0,1,%3) save_1ymm(9,3,32,0,1,%3) save_1ymm(10,2,0,0,1,%3,%4,1) save_1ymm(11,3,32,0,1,%3,%4,1) +#define SAVE_m8n6 SAVE_m8n4 "leaq (%3,%4,2),%3;"\ + save_1ymm(12,2,0,0,1,%3) save_1ymm(13,3,32,0,1,%3) save_1ymm(14,2,0,0,1,%3,%4,1) save_1ymm(15,3,32,0,1,%3,%4,1) +#define COMPUTE_m8(ndim) \ + "movq %%r14,%1;" INIT_m8n##ndim "movq %2,%3; movq %%r13,%5;"\ + "testq %5,%5; jz "#ndim"8883f; cmpq $10,%5; jb "#ndim"8882f;"\ + "movq $10,%5; movq $84,%%r15;"\ + #ndim"8881:\n\t"\ + "prefetcht1 (%3); subq $63,%3; addq %%r15,%3;"\ + "prefetcht0 64(%1); prefetcht0 64(%1,%%r12,1); prefetcht0 64(%1,%%r12,2);"\ + KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\ + "testq $12,%5; movq $84,%%r15; cmovz %4,%%r15; prefetcht1 (%8); addq $16,%8;"\ + KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\ + "addq $4,%5; cmpq %5,%%r13; jnb "#ndim"8881b;"\ + "movq %2,%3; negq %5; leaq 10(%%r13,%5,1),%5; prefetcht0 (%6); prefetcht0 7(%6);"\ + #ndim"8882:\n\t"\ + "prefetcht0 (%3); prefetcht0 63(%3); addq %4,%3;"\ + KERNEL_k1m8n##ndim "decq %5; jnz "#ndim"8882b;"\ + #ndim"8883:\n\t"\ + "prefetcht0 (%%r14); prefetcht0 64(%%r14);" SAVE_m8n##ndim +/* m=4, ymm 0-3 temp, ymm 4-15 acc, expanded accumulators */ +#define KERNEL_k1m4n1 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\ + "vbroadcastsd (%1),%%ymm0;" acc_m4n1_exp(1,2,0,4,5) "addq $8,%1;" +#define acc_m4n2_exp(c1l,c1r,c2l,c2r,...) 
\ + "vbroadcastsd ("#__VA_ARGS__"),%%ymm2;" acc_m4n1_exp(0,1,2,c1l,c1r)\ + "vbroadcastsd 8("#__VA_ARGS__"),%%ymm3;" acc_m4n1_exp(0,1,3,c2l,c2r) +#define KERNEL_h_k1m4n2 \ + "vmovsldup (%0),%%ymm0; vmovshdup (%0),%%ymm1; addq $32,%0;" acc_m4n2_exp(4,5,6,7,%1) +#define KERNEL_h_k1m4n4 KERNEL_h_k1m4n2 acc_m4n2_exp(8,9,10,11,%1,%%r12,1) +#define KERNEL_h_k1m4n6 KERNEL_h_k1m4n4 acc_m4n2_exp(12,13,14,15,%1,%%r12,2) +#define KERNEL_k1m4n2 KERNEL_h_k1m4n2 "addq $16,%1;" +#define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;" +#define KERNEL_k1m4n6 KERNEL_h_k1m4n6 "addq $16,%1;" +#define INIT_m4n1 "vpxor %%ymm4,%%ymm4,%%ymm4; vpxor %%ymm5,%%ymm5,%%ymm5;" +#define INIT_m4n2 zero_4ymm(4,5,6,7) +#define INIT_m4n4 INIT_m4n2 zero_4ymm(8,9,10,11) +#define INIT_m4n6 INIT_m4n4 zero_4ymm(12,13,14,15) +#define save_init_m4 "movq %2,%3; addq $32,%2; vbroadcastss (%6),%%ymm0; vbroadcastss 4(%6),%%ymm1;" +#define SAVE_m4n1 save_init_m4 cont_expacc(4,5,4) save_1ymm(4,2,0,0,1,%3) +#define SAVE_m4n2 SAVE_m4n1 cont_expacc(6,7,6) save_1ymm(6,3,0,0,1,%3,%4,1) +#define SAVE_m4n4 SAVE_m4n2 "leaq (%3,%4,2),%3;"\ + cont_expacc(8,9,8) cont_expacc(10,11,10) save_1ymm(8,2,0,0,1,%3) save_1ymm(10,3,0,0,1,%3,%4,1) +#define SAVE_m4n6 SAVE_m4n4 "leaq (%3,%4,2),%3;"\ + cont_expacc(12,13,12) cont_expacc(14,15,14) save_1ymm(12,2,0,0,1,%3) save_1ymm(14,3,0,0,1,%3,%4,1) +#define COMPUTE_m4(ndim) \ + "movq %%r14,%1;" INIT_m4n##ndim "movq %%r13,%5;"\ + "testq %5,%5; jz "#ndim"4442f;"\ + #ndim"4441:\n\t"\ + KERNEL_k1m4n##ndim\ + "decq %5; jnz "#ndim"4441b;"\ + #ndim"4442:\n\t"\ + SAVE_m4n##ndim +/* m=2, xmm 0-3 temp, xmm 4-15 acc, expanded accumulators */ +#if A_CONJ == B_CONJ + #define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";" +#else + #define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfnmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";" +#endif +#define KERNEL_h_k1m2n1 \ + "vmovsldup (%0),%%xmm0; vmovshdup (%0),%%xmm1; addq $16,%0;"\ + "vmovddup (%1),%%xmm2;" acc_m2n1_exp(0,1,2,4,5) +#define KERNEL_h_k1m2n2 KERNEL_h_k1m2n1\ + "vmovddup 8(%1),%%xmm3;" acc_m2n1_exp(0,1,3,6,7) +#define acc_m2n2_exp(c1,c2,c3,c4,...)\ + "vmovddup ("#__VA_ARGS__"),%%xmm2;" acc_m2n1_exp(0,1,2,c1,c2)\ + "vmovddup 8("#__VA_ARGS__"),%%xmm3;" acc_m2n1_exp(0,1,3,c3,c4) +#define KERNEL_h_k1m2n4 KERNEL_h_k1m2n2 acc_m2n2_exp(8,9,10,11,%1,%%r12,1) +#define KERNEL_h_k1m2n6 KERNEL_h_k1m2n4 acc_m2n2_exp(12,13,14,15,%1,%%r12,2) +#define KERNEL_k1m2n1 KERNEL_h_k1m2n1 "addq $8,%1;" +#define KERNEL_k1m2n2 KERNEL_h_k1m2n2 "addq $16,%1;" +#define KERNEL_k1m2n4 KERNEL_h_k1m2n4 "addq $16,%1;" +#define KERNEL_k1m2n6 KERNEL_h_k1m2n6 "addq $16,%1;" +#define zero_2xmm(no1,no2) "vpxor %%xmm"#no1",%%xmm"#no1",%%xmm"#no1"; vpxor %%xmm"#no2",%%xmm"#no2",%%xmm"#no2";" +#define INIT_m2n1 zero_2xmm(4,5) +#define INIT_m2n2 INIT_m2n1 zero_2xmm(6,7) +#define INIT_m2n4 INIT_m2n2 zero_2xmm(8,9) zero_2xmm(10,11) +#define INIT_m2n6 INIT_m2n4 zero_2xmm(12,13) zero_2xmm(14,15) +#if A_CONJ == B_CONJ + #define cont_expxmmacc(cl,cr,dst) "vpermilps $177,%%xmm"#cr",%%xmm"#cr"; vaddsubps %%xmm"#cl",%%xmm"#cr",%%xmm"#dst";" +#else + #define cont_expxmmacc(cl,cr,dst) "vpermilps $177,%%xmm"#cr",%%xmm"#cr"; vaddsubps %%xmm"#cr",%%xmm"#cl",%%xmm"#dst";" +#endif +#if A_CONJ == 0 + #define save_1xmm(c,tmp,alpr,alpi) \ + "vpermilps $177,%%xmm"#c",%%xmm"#tmp"; vfmsubadd213ps (%3),%%xmm"#alpr",%%xmm"#c";"\ + "vfmsubadd231ps %%xmm"#tmp",%%xmm"#alpi",%%xmm"#c"; vmovups %%xmm"#c",(%3); addq %4,%3;" +#else + 
#define save_1xmm(c,tmp,alpr,alpi) \ + "vpermilps $177,%%xmm"#c",%%xmm"#tmp"; vfmaddsub213ps (%3),%%xmm"#alpi",%%xmm"#tmp";"\ + "vfmaddsub231ps %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp"; vmovups %%xmm"#tmp",(%3); addq %4,%3;" +#endif +#define save_init_m2 "movq %2,%3; addq $16,%2; vbroadcastss (%6),%%xmm0; vbroadcastss 4(%6),%%xmm1;" +#define SAVE_m2n1 save_init_m2 cont_expxmmacc(4,5,4) save_1xmm(4,2,0,1) +#define SAVE_m2n2 SAVE_m2n1 cont_expacc(6,7,6) save_1xmm(6,3,0,1) +#define SAVE_m2n4 SAVE_m2n2 cont_expacc(8,9,8) save_1xmm(8,2,0,1) cont_expacc(10,11,10) save_1xmm(10,3,0,1) +#define SAVE_m2n6 SAVE_m2n4 cont_expacc(12,13,12) save_1xmm(12,2,0,1) cont_expacc(14,15,14) save_1xmm(14,3,0,1) +#define COMPUTE_m2(ndim) \ + "movq %%r14,%1;" INIT_m2n##ndim "movq %%r13,%5;"\ + "testq %5,%5; jz "#ndim"2222f;"\ + #ndim"2221:\n\t"\ + KERNEL_k1m2n##ndim\ + "decq %5; jnz "#ndim"2221b;"\ + #ndim"2222:\n\t"\ + SAVE_m2n##ndim +/* m=1, xmm 0-3 temp, xmm 4-9 acc, expanded accumulators */ +#if A_CONJ == B_CONJ + #define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";" + #define acc_m1n2_exp(arb,aib,b4,cl,cr) "vfmadd231ps %%xmm"#arb",%%xmm"#b4",%%xmm"#cl"; vfmadd231ps %%xmm"#aib",%%xmm"#b4",%%xmm"#cr";" +#else + #define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfnmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";" + #define acc_m1n2_exp(arb,aib,b4,cl,cr) "vfmadd231ps %%xmm"#arb",%%xmm"#b4",%%xmm"#cl"; vfnmadd231ps %%xmm"#aib",%%xmm"#b4",%%xmm"#cr";" +#endif +#define KERNEL_k1m1n1 \ + "vbroadcastss (%0),%%xmm0; vbroadcastss 4(%0),%%xmm1; addq $8,%0;"\ + "vmovsd (%1),%%xmm2; addq $8,%1;" acc_m1n1_exp(0,1,2,4,5) +#define KERNEL_h_k1m1n2 \ + "vbroadcastss (%0),%%xmm0; vbroadcastss 4(%0),%%xmm1; addq $8,%0;"\ + "vmovups (%1),%%xmm2;" acc_m1n2_exp(0,1,2,4,5) +#define KERNEL_h_k1m1n4 KERNEL_h_k1m1n2 "vmovups (%1,%%r12,1),%%xmm2;" acc_m1n2_exp(0,1,2,6,7) +#define KERNEL_h_k1m1n6 KERNEL_h_k1m1n4 "vmovups (%1,%%r12,2),%%xmm2;" acc_m1n2_exp(0,1,2,8,9) +#define KERNEL_k1m1n2 KERNEL_h_k1m1n2 "addq $16,%1;" +#define KERNEL_k1m1n4 KERNEL_h_k1m1n4 "addq $16,%1;" +#define KERNEL_k1m1n6 KERNEL_h_k1m1n6 "addq $16,%1;" +#define INIT_m1n1 zero_2xmm(4,5) +#define INIT_m1n2 zero_2xmm(4,5) +#define INIT_m1n4 INIT_m1n2 zero_2xmm(6,7) +#define INIT_m1n6 INIT_m1n4 zero_2xmm(8,9) +#if A_CONJ == 0 + #define save_m1n1(c,tmp1,tmp2,alpr,alpi) \ + "vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vfmsubadd213ps %%xmm"#tmp2",%%xmm"#alpr",%%xmm"#c";"\ + "vfmsubadd231ps %%xmm"#tmp1",%%xmm"#alpi",%%xmm"#c"; vmovsd %%xmm"#c",(%3);" + #define save_m1n2(c,tmp1,tmp2,alpr,alpi) \ + "vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vmovhpd (%3,%4,1),%%xmm"#tmp2",%%xmm"#tmp2";"\ + "vfmsubadd213ps %%xmm"#tmp2",%%xmm"#alpr",%%xmm"#c"; vfmsubadd231ps %%xmm"#tmp1",%%xmm"#alpi",%%xmm"#c";"\ + "vmovsd %%xmm"#c",(%3); vmovhpd %%xmm"#c",(%3,%4,1); leaq (%3,%4,2),%3;" +#else + #define save_m1n1(c,tmp1,tmp2,alpr,alpi) \ + "vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vfmaddsub213ps %%xmm"#tmp2",%%xmm"#alpi",%%xmm"#tmp1";"\ + "vfmaddsub231ps %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp1"; vmovsd %%xmm"#tmp1",(%3);" + #define save_m1n2(c,tmp1,tmp2,alpr,alpi) \ + "vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vmovhpd (%3,%4,1),%%xmm"#tmp2",%%xmm"#tmp2";"\ + "vfmaddsub213ps %%xmm"#tmp2",%%xmm"#alpi",%%xmm"#tmp1"; vfmaddsub231ps %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp1";"\ + "vmovsd %%xmm"#tmp1",(%3); vmovhpd 
%%xmm"#tmp1",(%3,%4,1); leaq (%3,%4,2),%3;" +#endif +#define save_init_m1 "movq %2,%3; addq $8,%2; vbroadcastss (%6),%%xmm0; vbroadcastss 4(%6),%%xmm1;" +#define SAVE_m1n1 save_init_m1 cont_expxmmacc(4,5,4) save_m1n1(4,2,3,0,1) +#define SAVE_m1n2 save_init_m1 cont_expxmmacc(4,5,4) save_m1n2(4,2,3,0,1) +#define SAVE_m1n4 SAVE_m1n2 cont_expxmmacc(6,7,6) save_m1n2(6,2,3,0,1) +#define SAVE_m1n6 SAVE_m1n4 cont_expxmmacc(8,9,8) save_m1n2(8,2,3,0,1) +#define COMPUTE_m1(ndim) \ + "movq %%r14,%1;" INIT_m1n##ndim "movq %%r13,%5;"\ + "testq %5,%5; jz "#ndim"1112f;"\ + #ndim"1111:\n\t"\ + KERNEL_k1m1n##ndim\ + "decq %5; jnz "#ndim"1111b;"\ + #ndim"1112:\n\t"\ + SAVE_m1n##ndim +#define COMPUTE(ndim) {\ + b_pref = b_ptr + ndim * K *2;\ + __asm__ __volatile__ (\ + "movq %1,%%r14; movq %5,%%r13; movq %5,%%r12; salq $4,%%r12; movq %7,%%r11;"\ + "cmpq $8,%7; jb "#ndim"9992f;"\ + #ndim"9991:\n\t"\ + COMPUTE_m8(ndim)\ + "subq $8,%7; cmpq $8,%7; jnb "#ndim"9991b;"\ + #ndim"9992:\n\t"\ + "cmpq $4,%7; jb "#ndim"9993f;"\ + COMPUTE_m4(ndim) "subq $4,%7;"\ + #ndim"9993:\n\t"\ + "cmpq $2,%7; jb "#ndim"9994f;"\ + COMPUTE_m2(ndim) "subq $2,%7;"\ + #ndim"9994:\n\t"\ + "testq %7,%7; jz "#ndim"9995f;"\ + COMPUTE_m1(ndim)\ + #ndim"9995:\n\t"\ + "movq %%r14,%1; movq %%r13,%5; movq %%r11,%7; vzeroupper;"\ + :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_in_bytes),"+r"(K),"+r"(alp),"+r"(M),"+r"(b_pref)\ + ::"cc","memory","r11","r12","r13","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5",\ + "xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ + a_ptr -= M * K *2; b_ptr += ndim * K *2; c_ptr += (ndim * LDC - M) * 2;\ +} +int __attribute__ ((noinline)) +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alphar, float alphai, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC) +{ + if(m==0||n==0||k==0||(alphar==0.0 && alphai==0.0)) return 0; + int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float) * 2; +#if A_CONJ == B_CONJ + float const_val[2] = {-alphar, -alphai}; +#else + float const_val[2] = {alphar, alphai}; +#endif + int64_t M = (int64_t)m, K = (int64_t)k; + BLASLONG n_count = n; + float *a_ptr = A,*b_ptr = B,*c_ptr = C,*c_tmp = C,*alp = const_val,*b_pref = B; + for(;n_count>5;n_count-=6) COMPUTE(6) + for(;n_count>3;n_count-=4) COMPUTE(4) + for(;n_count>1;n_count-=2) COMPUTE(2) + if(n_count>0) COMPUTE(1) + return 0; +} From c418c81224b56e2a99b5f3e7a159b30bfd8f8d8b Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 23 Dec 2019 23:41:44 +0800 Subject: [PATCH 15/27] Update KERNEL.HASWELL --- kernel/x86_64/KERNEL.HASWELL | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 5c11ced1d..9bd34f1e3 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -56,7 +56,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CTRMMKERNEL = cgemm_kernel_8x2_haswell.S -CGEMMKERNEL = cgemm_kernel_8x2_haswell.S +CGEMMKERNEL = cgemm_kernel_8x2_haswell.c CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMITCOPY = ../generic/zgemm_tcopy_8.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c From 2cd9306bb5138f8ec796964fa578b2ea1b73e921 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 23 Dec 2019 23:42:30 +0800 Subject: [PATCH 16/27] Update KERNEL.ZEN --- kernel/x86_64/KERNEL.ZEN | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/kernel/x86_64/KERNEL.ZEN b/kernel/x86_64/KERNEL.ZEN index be4503d47..aa4ba4834 100644 --- a/kernel/x86_64/KERNEL.ZEN +++ b/kernel/x86_64/KERNEL.ZEN @@ -53,7 +53,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CTRMMKERNEL = cgemm_kernel_8x2_haswell.S -CGEMMKERNEL = cgemm_kernel_8x2_haswell.S +CGEMMKERNEL = cgemm_kernel_8x2_haswell.c CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMITCOPY = ../generic/zgemm_tcopy_8.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c @@ -64,7 +64,7 @@ CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZTRMMKERNEL = zgemm_kernel_4x2_haswell.S -ZGEMMKERNEL = zgemm_kernel_4x2_haswell.S +ZGEMMKERNEL = zgemm_kernel_4x2_haswell.c ZGEMMINCOPY = ../generic/zgemm_ncopy_4.c ZGEMMITCOPY = ../generic/zgemm_tcopy_4.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c From 611445c7f8d136ce66bb8a825b3383fc8eb028bd Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 23 Dec 2019 23:44:55 +0800 Subject: [PATCH 17/27] Update param.h --- param.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/param.h b/param.h index 5fb0868b2..d80bbf4f2 100644 --- a/param.h +++ b/param.h @@ -668,8 +668,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_P 768 #define DGEMM_DEFAULT_P 512 -#define CGEMM_DEFAULT_P 384 -#define ZGEMM_DEFAULT_P 256 +#define CGEMM_DEFAULT_P 256 +#define ZGEMM_DEFAULT_P 192 #ifdef WINDOWS_ABI #define SGEMM_DEFAULT_Q 320 @@ -678,8 +678,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_Q 384 #define DGEMM_DEFAULT_Q 256 #endif -#define CGEMM_DEFAULT_Q 192 -#define ZGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 192 #define SGEMM_DEFAULT_R sgemm_r #define DGEMM_DEFAULT_R 13824 @@ -1571,7 +1571,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_P 768 #define DGEMM_DEFAULT_P 512 -#define CGEMM_DEFAULT_P 384 +#define CGEMM_DEFAULT_P 256 #define ZGEMM_DEFAULT_P 192 #ifdef WINDOWS_ABI @@ -1581,7 +1581,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SGEMM_DEFAULT_Q 384 #define DGEMM_DEFAULT_Q 256 #endif -#define CGEMM_DEFAULT_Q 192 +#define CGEMM_DEFAULT_Q 256 #define ZGEMM_DEFAULT_Q 192 #define SGEMM_DEFAULT_R sgemm_r From 6fbe51072bed086b71d18ed77ee7b8cc79e63dd6 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Tue, 24 Dec 2019 00:24:40 +0800 Subject: [PATCH 18/27] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 3859a9c19..99f82df9d 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -171,3 +171,9 @@ In chronological order: * [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes * [2019-03-14] power9 dgemm/dtrmm kernel * [2019-04-29] power9 sgemm/strmm kernel + +* Jiachen Wang + * [2018.07] optimize AVX2 DGEMM + * [2018-11] optimize AVX512 SGEMM and DGEMM + * [2018-11] AVX512 CGEMM & ZGEMM kernels + * [2018-12] optimize AVX2 CGEMM and ZGEMM From 3ce6bcdb5f61fad716703b9facf26087aade7ae2 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Tue, 24 Dec 2019 00:30:16 +0800 Subject: [PATCH 19/27] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 99f82df9d..6d30ee942 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -173,7 +173,8 @@ In chronological order: * [2019-04-29] power9 sgemm/strmm kernel * Jiachen Wang - * [2018.07] optimize AVX2 DGEMM - * [2018-11] optimize AVX512 SGEMM and DGEMM - * [2018-11] AVX512 CGEMM & ZGEMM kernels - * [2018-12] optimize AVX2 CGEMM and ZGEMM + * [2019-07-29] optimize AVX2 DGEMM + * [2019-10-20] AVX512 DGEMM kernel (4x8) + * [2019-11-06] optimize AVX512 SGEMM + * [2019-11-12] AVX512 CGEMM & ZGEMM kernels + * [2019-12-23] optimize AVX2 CGEMM and ZGEMM From eeecd623d85e90c75172b610e4ecb11f4c04650e Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Tue, 24 Dec 2019 00:40:16 +0800 Subject: [PATCH 20/27] Update cgemm_kernel_8x2_haswell.c --- kernel/x86_64/cgemm_kernel_8x2_haswell.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/x86_64/cgemm_kernel_8x2_haswell.c b/kernel/x86_64/cgemm_kernel_8x2_haswell.c index 49fef90db..eab8c9ea5 100644 --- a/kernel/x86_64/cgemm_kernel_8x2_haswell.c +++ b/kernel/x86_64/cgemm_kernel_8x2_haswell.c @@ -104,6 +104,7 @@ KERNEL_k1m8n##ndim "decq %5; jnz "#ndim"8882b;"\ #ndim"8883:\n\t"\ "prefetcht0 (%%r14); prefetcht0 64(%%r14);" SAVE_m8n##ndim + /* m=4, ymm 0-3 temp, ymm 4-15 acc, expanded accumulators */ #define KERNEL_k1m4n1 \ "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\ @@ -137,6 +138,7 @@ "decq %5; jnz "#ndim"4441b;"\ #ndim"4442:\n\t"\ SAVE_m4n##ndim + /* m=2, xmm 0-3 temp, xmm 4-15 acc, expanded accumulators */ #if A_CONJ == B_CONJ #define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";" @@ -189,6 +191,7 @@ "decq %5; jnz "#ndim"2221b;"\ #ndim"2222:\n\t"\ SAVE_m2n##ndim + /* m=1, xmm 0-3 temp, xmm 4-9 acc, expanded accumulators */ #if A_CONJ == B_CONJ #define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";" @@ -242,6 +245,7 @@ "decq %5; jnz "#ndim"1111b;"\ #ndim"1112:\n\t"\ SAVE_m1n##ndim + #define COMPUTE(ndim) {\ b_pref = b_ptr + ndim * K *2;\ __asm__ __volatile__ (\ @@ -266,6 +270,7 @@ "xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ 
a_ptr -= M * K *2; b_ptr += ndim * K *2; c_ptr += (ndim * LDC - M) * 2;\ } + int __attribute__ ((noinline)) CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alphar, float alphai, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC) { From 5fd1edead95b86df0e92fd2be1e0435d746af56d Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Fri, 27 Dec 2019 18:00:55 +0800 Subject: [PATCH 21/27] Create cgemm3m_kernel_8x4_haswell.c --- kernel/x86_64/cgemm3m_kernel_8x4_haswell.c | 279 +++++++++++++++++++++ 1 file changed, 279 insertions(+) create mode 100644 kernel/x86_64/cgemm3m_kernel_8x4_haswell.c diff --git a/kernel/x86_64/cgemm3m_kernel_8x4_haswell.c b/kernel/x86_64/cgemm3m_kernel_8x4_haswell.c new file mode 100644 index 000000000..831f25483 --- /dev/null +++ b/kernel/x86_64/cgemm3m_kernel_8x4_haswell.c @@ -0,0 +1,279 @@ +/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store */ +/* r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const), r15 = tmp */ + +#include "common.h" +#include <stdint.h> + +//recommended settings: GEMM_P = 320, GEMM_Q = 320. + +/* m = 8 *//* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */ +#define KERNEL_k1m8n1 \ + "vmovups (%0),%%ymm1; addq $32,%0;"\ + "vbroadcastss (%1),%%ymm2; vfmadd231ps %%ymm1,%%ymm2,%%ymm4;"\ + "addq $4,%1;" +#define KERNEL_h_k1m8n2 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\ + "vbroadcastsd (%1),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;" +#define KERNEL_k1m8n2 KERNEL_h_k1m8n2 "addq $8,%1;" +#define KERNEL_h_k1m8n4 \ + KERNEL_h_k1m8n2 "vbroadcastsd 8(%1),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;" +#define KERNEL_k1m8n4 KERNEL_h_k1m8n4 "addq $16,%1;" +#define unit_kernel_k1m8n4(c1,c2,c3,c4,...) 
\ + "vbroadcastsd ("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c1"; vfmadd231ps %%ymm2,%%ymm3,"#c2";"\ + "vbroadcastsd 8("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c3"; vfmadd231ps %%ymm2,%%ymm3,"#c4";" +#define KERNEL_h_k1m8n8 KERNEL_h_k1m8n4 unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,%1,%%r12,1) +#define KERNEL_k1m8n8 KERNEL_h_k1m8n8 "addq $16,%1;" +#define KERNEL_h_k1m8n12 KERNEL_h_k1m8n8 unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,%1,%%r12,2) +#define KERNEL_k1m8n12 KERNEL_h_k1m8n12 "addq $16,%1;" +#define INIT_m8n1 "vpxor %%ymm4,%%ymm4,%%ymm4;" +#define INIT_m8n2 INIT_m8n1 "vpxor %%ymm5,%%ymm5,%%ymm5;" +#define INIT_m8n4 INIT_m8n2 "vpxor %%ymm6,%%ymm6,%%ymm6;vpxor %%ymm7,%%ymm7,%%ymm7;" +#define unit_init_m8n4(c1,c2,c3,c4) \ + "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";" +#define INIT_m8n8 INIT_m8n4 unit_init_m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11) +#define INIT_m8n12 INIT_m8n8 unit_init_m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15) +#define SAVE_m8n1 \ + "vunpcklps %%ymm4,%%ymm4,%%ymm2; vunpckhps %%ymm4,%%ymm4,%%ymm3;"\ + "vperm2f128 $2,%%ymm2,%%ymm3,%%ymm1; vperm2f128 $19,%%ymm2,%%ymm3,%%ymm2;"\ + "vfmadd213ps (%2),%%ymm0,%%ymm1; vfmadd213ps 32(%2),%%ymm0,%%ymm2; vmovups %%ymm1,(%2); vmovups %%ymm2,32(%2);" +#define unit_save_m8n2(c1,c2) \ + "vunpcklpd "#c2","#c1",%%ymm2; vunpckhpd "#c2","#c1",%%ymm3;"\ + "vperm2f128 $2,%%ymm2,%%ymm3,"#c1"; vperm2f128 $19,%%ymm2,%%ymm3,"#c2";"\ + "vmovsldup "#c1",%%ymm2; vmovsldup "#c2",%%ymm3;"\ + "vfmadd213ps (%5),%%ymm0,%%ymm2; vfmadd213ps 32(%5),%%ymm0,%%ymm3; vmovups %%ymm2,(%5); vmovups %%ymm3,32(%5);"\ + "vmovshdup "#c1",%%ymm2; vmovshdup "#c2",%%ymm3;"\ + "vfmadd213ps (%5,%3,1),%%ymm0,%%ymm2; vfmadd213ps 32(%5,%3,1),%%ymm0,%%ymm3; vmovups %%ymm2,(%5,%3,1); vmovups %%ymm3,32(%5,%3,1);"\ + "leaq (%5,%3,2),%5;" +#define SAVE_m8n2 "movq %2,%5;" unit_save_m8n2(%%ymm4,%%ymm5) +#define SAVE_m8n4 SAVE_m8n2 unit_save_m8n2(%%ymm6,%%ymm7) +#define SAVE_m8n8 SAVE_m8n4 unit_save_m8n2(%%ymm8,%%ymm9) unit_save_m8n2(%%ymm10,%%ymm11) +#define SAVE_m8n12 SAVE_m8n8 unit_save_m8n2(%%ymm12,%%ymm13) unit_save_m8n2(%%ymm14,%%ymm15) +#define COMPUTE_m8(ndim) \ + INIT_m8n##ndim\ + "movq %%r13,%4; movq %%r14,%1; movq %2,%5; xorq %%r15,%%r15;"\ + "cmpq $24,%4; jb "#ndim"882f;"\ + #ndim"881:\n\t"\ + "cmpq $126,%%r15; movq $126,%%r15; cmoveq %3,%%r15;"\ + "prefetcht0 64(%1); prefetcht0 64(%1,%%r12,1); prefetcht0 64(%1,%%r12,2);"\ + "prefetcht0 512(%0);" KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\ + "prefetcht0 512(%0);" KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\ + "prefetcht1 (%5); leaq -63(%5,%%r15,1),%5;"\ + "prefetcht0 64(%1); prefetcht0 64(%1,%%r12,1); prefetcht0 64(%1,%%r12,2);"\ + "prefetcht0 512(%0);" KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\ + "prefetcht0 512(%0);" KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\ + "prefetcht1 (%8); addq $16,%8;"\ + "subq $8,%4; cmpq $24,%4; jnb "#ndim"881b;"\ + "movq %2,%5;"\ + #ndim"882:\n\t"\ + "testq %4,%4; jz "#ndim"883f;"\ + "prefetcht0 (%5); prefetcht0 63(%5); addq %3,%5;"\ + KERNEL_k1m8n##ndim\ + "decq %4; jmp "#ndim"882b;"\ + #ndim"883:\n\t"\ + "prefetcht0 (%%r14); prefetcht0 64(%%r14);"\ + SAVE_m8n##ndim "addq $64,%2;" + +/* m = 4 *//* xmm0 for alpha, xmm1-xmm3 for temporary use, xmm4-xmm15 for accumulators */ +#define KERNEL_k1m4n1 \ + "vmovups (%0),%%xmm1; addq $16,%0;"\ + "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ + "addq $4,%1;" +#define KERNEL_h_k1m4n2 \ + "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2; addq $16,%0;"\ + "vmovddup (%1),%%xmm3; 
vfmadd231ps %%xmm1,%%xmm3,%%xmm4; vfmadd231ps %%xmm2,%%xmm3,%%xmm5;" +#define KERNEL_k1m4n2 KERNEL_h_k1m4n2 "addq $8,%1;" +#define KERNEL_h_k1m4n4 \ + KERNEL_h_k1m4n2 "vmovddup 8(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm6; vfmadd231ps %%xmm2,%%xmm3,%%xmm7;" +#define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;" +#define unit_kernel_k1m4n4(c1,c2,c3,c4,...) \ + "vmovddup ("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c1"; vfmadd231ps %%xmm2,%%xmm3,"#c2";"\ + "vmovddup 8("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c3"; vfmadd231ps %%xmm2,%%xmm3,"#c4";" +#define KERNEL_h_k1m4n8 KERNEL_h_k1m4n4 unit_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,%1,%%r12,1) +#define KERNEL_k1m4n8 KERNEL_h_k1m4n8 "addq $16,%1;" +#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n8 unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,%1,%%r12,2) +#define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%1;" +#define INIT_m4n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define INIT_m4n2 INIT_m4n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" +#define INIT_m4n4 INIT_m4n2 "vpxor %%xmm6,%%xmm6,%%xmm6;vpxor %%xmm7,%%xmm7,%%xmm7;" +#define unit_init_m4n4(c1,c2,c3,c4) \ + "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";" +#define INIT_m4n8 INIT_m4n4 unit_init_m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11) +#define INIT_m4n12 INIT_m4n8 unit_init_m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15) +#define SAVE_m4n1 \ + "vunpcklps %%xmm4,%%xmm4,%%xmm2; vunpckhps %%xmm4,%%xmm4,%%xmm3;"\ + "vfmadd213ps (%2),%%xmm0,%%xmm2; vfmadd213ps 16(%2),%%xmm0,%%xmm3; vmovups %%xmm2,(%2); vmovups %%xmm3,16(%2);" +#define unit_save_m4n2(c1,c2) \ + "vunpcklpd "#c2","#c1",%%xmm2; vunpckhpd "#c2","#c1","#c2"; vmovapd %%xmm2,"#c1";"\ + "vmovsldup "#c1",%%xmm2; vmovsldup "#c2",%%xmm3;"\ + "vfmadd213ps (%5),%%xmm0,%%xmm2; vfmadd213ps 16(%5),%%xmm0,%%xmm3; vmovups %%xmm2,(%5); vmovups %%xmm3,16(%5);"\ + "vmovshdup "#c1",%%xmm2; vmovshdup "#c2",%%xmm3;"\ + "vfmadd213ps (%5,%3,1),%%xmm0,%%xmm2; vfmadd213ps 16(%5,%3,1),%%xmm0,%%xmm3; vmovups %%xmm2,(%5,%3,1); vmovups %%xmm3,16(%5,%3,1);"\ + "leaq (%5,%3,2),%5;" +#define SAVE_m4n2 "movq %2,%5;" unit_save_m4n2(%%xmm4,%%xmm5) +#define SAVE_m4n4 SAVE_m4n2 unit_save_m4n2(%%xmm6,%%xmm7) +#define SAVE_m4n8 SAVE_m4n4 unit_save_m4n2(%%xmm8,%%xmm9) unit_save_m4n2(%%xmm10,%%xmm11) +#define SAVE_m4n12 SAVE_m4n8 unit_save_m4n2(%%xmm12,%%xmm13) unit_save_m4n2(%%xmm14,%%xmm15) +#define COMPUTE_m4(ndim) \ + INIT_m4n##ndim\ + "movq %%r13,%4; movq %%r14,%1;"\ + #ndim"442:\n\t"\ + "testq %4,%4; jz "#ndim"443f;"\ + KERNEL_k1m4n##ndim\ + "decq %4; jmp "#ndim"442b;"\ + #ndim"443:\n\t"\ + SAVE_m4n##ndim "addq $32,%2;" + +/* m = 2 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm9 for accumulators */ +#define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define KERNEL_k1m2n1 \ + "vmovsd (%0),%%xmm1; addq $8,%0;"\ + "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ + "addq $4,%1;" +#define SAVE_m2n1 \ + "vunpcklps %%xmm4,%%xmm4,%%xmm1; vfmadd213ps (%2),%%xmm0,%%xmm1; vmovups %%xmm1,(%2);" +#define INIT_m2n2 INIT_m2n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" +#define KERNEL_k1m2n2 \ + "vmovsd (%0),%%xmm1; addq $8,%0;"\ + "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ + "vbroadcastss 4(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm5;"\ + "addq $8,%1;" +#define SAVE_m2n2 SAVE_m2n1 \ + "vunpcklps %%xmm5,%%xmm5,%%xmm1; vfmadd213ps (%2,%3,1),%%xmm0,%%xmm1; vmovups %%xmm1,(%2,%3,1);" +#define INIT_m2n4 INIT_m2n2 +#define INIT_m2n8 INIT_m2n4 "vpxor %%xmm6,%%xmm6,%%xmm6; vpxor %%xmm7,%%xmm7,%%xmm7;" +#define 
INIT_m2n12 INIT_m2n8 "vpxor %%xmm8,%%xmm8,%%xmm8; vpxor %%xmm9,%%xmm9,%%xmm9;" +#define KERNEL_k1m2n4 \ + "vmovups (%1),%%xmm3; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ + "vbroadcastss 4(%0),%%xmm2; vfmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ + "addq $8,%0;" +#define KERNEL_k1m2n8 \ + "vmovups (%1),%%xmm3; vmovups (%1,%%r12,1),%%xmm2; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm6;"\ + "vbroadcastss 4(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm5; vfmadd231ps %%xmm2,%%xmm1,%%xmm7;"\ + "addq $8,%0;" +#define KERNEL_k1m2n12 \ + "vmovups (%1),%%xmm3; vmovups (%1,%%r12,1),%%xmm2; vmovups (%1,%%r12,2),%%xmm1; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm6; vfmadd231ps %%xmm1,%%xmm10,%%xmm8;"\ + "vbroadcastss 4(%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm5; vfmadd231ps %%xmm2,%%xmm10,%%xmm7; vfmadd231ps %%xmm1,%%xmm10,%%xmm9;"\ + "addq $8,%0;" +#define unit_save_m2n4(c1,c2) \ + "vunpcklpd "#c2","#c1",%%xmm1; vunpckhpd "#c2","#c1",%%xmm2;"\ + "vmovsldup %%xmm1,%%xmm3; vfmadd213ps (%5),%%xmm0,%%xmm3; vmovups %%xmm3,(%5);"\ + "vmovshdup %%xmm1,%%xmm3; vfmadd213ps (%5,%3,1),%%xmm0,%%xmm3; vmovups %%xmm3,(%5,%3,1);"\ + "leaq (%5,%3,2),%5;"\ + "vmovsldup %%xmm2,%%xmm3; vfmadd213ps (%5),%%xmm0,%%xmm3; vmovups %%xmm3,(%5);"\ + "vmovshdup %%xmm2,%%xmm3; vfmadd213ps (%5,%3,1),%%xmm0,%%xmm3; vmovups %%xmm3,(%5,%3,1);"\ + "leaq (%5,%3,2),%5;" +#define SAVE_m2n4 "movq %2,%5;" unit_save_m2n4(%%xmm4,%%xmm5) +#define SAVE_m2n8 SAVE_m2n4 unit_save_m2n4(%%xmm6,%%xmm7) +#define SAVE_m2n12 SAVE_m2n8 unit_save_m2n4(%%xmm8,%%xmm9) +#define COMPUTE_m2(ndim) \ + INIT_m2n##ndim\ + "movq %%r13,%4; movq %%r14,%1;"\ + #ndim"222:\n\t"\ + "testq %4,%4; jz "#ndim"223f;"\ + KERNEL_k1m2n##ndim\ + "decq %4; jmp "#ndim"222b;"\ + #ndim"223:\n\t"\ + SAVE_m2n##ndim "addq $16,%2;" + +/* m = 1 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm6 for accumulators */ +#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define KERNEL_k1m1n1 \ + "vmovss (%1),%%xmm3; addq $4,%1;"\ + "vmovss (%0),%%xmm1; vfmadd231ss %%xmm3,%%xmm1,%%xmm4;"\ + "addq $4,%0;" +#define SAVE_m1n1 \ + "vunpcklps %%xmm4,%%xmm4,%%xmm4; vmovsd (%2),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);" +#define INIT_m1n2 INIT_m1n1 +#define KERNEL_k1m1n2 \ + "vmovsd (%1),%%xmm3; addq $8,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ + "addq $4,%0;" +#define SAVE_m1n2 \ + "vunpcklps %%xmm4,%%xmm4,%%xmm4; vmovsd (%2),%%xmm3; vmovhpd (%2,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm4;"\ + "vmovsd %%xmm4,(%2); vmovhpd %%xmm4,(%2,%3,1);" +#define INIT_m1n4 INIT_m1n2 +#define INIT_m1n8 INIT_m1n4 "vpxor %%xmm5,%%xmm5,%%xmm5;" +#define INIT_m1n12 INIT_m1n8 "vpxor %%xmm6,%%xmm6,%%xmm6;" +#define KERNEL_k1m1n4 \ + "vmovups (%1),%%xmm3; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ + "addq $4,%0;" +#define KERNEL_k1m1n8 \ + "vmovups (%1),%%xmm3; vmovups (%1,%%r12,1),%%xmm2; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm5;"\ + "addq $4,%0;" +#define KERNEL_k1m1n12 \ + "vmovups (%1),%%xmm3; vmovups (%1,%%r12,1),%%xmm2; vmovups (%1,%%r12,2),%%xmm1; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm5; vfmadd231ps %%xmm1,%%xmm10,%%xmm6;"\ + "addq $4,%0;" +#define unit_save_m1n4(c1) \ + "vunpcklps "#c1","#c1",%%xmm1; 
vunpckhps "#c1","#c1",%%xmm2;"\ + "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\ + "vmovsd %%xmm1,(%5); vmovhpd %%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;"\ + "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\ + "vmovsd %%xmm2,(%5); vmovhpd %%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;" +#define SAVE_m1n4 "movq %2,%5;" unit_save_m1n4(%%xmm4) +#define SAVE_m1n8 SAVE_m1n4 unit_save_m1n4(%%xmm5) +#define SAVE_m1n12 SAVE_m1n8 unit_save_m1n4(%%xmm6) +#define COMPUTE_m1(ndim) \ + INIT_m1n##ndim\ + "movq %%r13,%4; movq %%r14,%1;"\ + #ndim"112:\n\t"\ + "testq %4,%4; jz "#ndim"113f;"\ + KERNEL_k1m1n##ndim\ + "decq %4; jmp "#ndim"112b;"\ + #ndim"113:\n\t"\ + SAVE_m1n##ndim "addq $8,%2;" + +/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 = "+r"(K), %5 = "+r"(ctemp) */ +/* %6 = "+r"(&alpha), %7 = "+r"(M), %8 = "+r"(next_b) */ +/* r11 = m(const), r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const),r15 = tmp */ + +#define COMPUTE(ndim) {\ + next_b = b_pointer + ndim * K;\ + __asm__ __volatile__(\ + "vbroadcastsd (%6),%%ymm0;"\ + "movq %4,%%r13; movq %4,%%r12; salq $4,%%r12; movq %1,%%r14; movq %7,%%r11;"\ + "cmpq $8,%7;jb 33101"#ndim"f;"\ + "33109"#ndim":\n\t"\ + COMPUTE_m8(ndim)\ + "subq $8,%7;cmpq $8,%7;jnb 33109"#ndim"b;"\ + "33101"#ndim":\n\t"\ + "cmpq $4,%7;jb 33103"#ndim"f;"\ + COMPUTE_m4(ndim)\ + "subq $4,%7;"\ + "33103"#ndim":\n\t"\ + "cmpq $2,%7;jb 33104"#ndim"f;"\ + COMPUTE_m2(ndim)\ + "subq $2,%7;"\ + "33104"#ndim":\n\t"\ + "testq %7,%7;jz 33105"#ndim"f;"\ + COMPUTE_m1(ndim)\ + "33105"#ndim":\n\t"\ + "movq %%r13,%4; movq %%r14,%1; movq %%r11,%7;"\ + :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(const_val),"+r"(M),"+r"(next_b)\ + ::"r11","r12","r13","r14","r15"\ + "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","cc","memory");\ + a_pointer -= M * K; b_pointer += ndim * K; c_pointer += 2*(LDC * ndim - M);\ +} + +int __attribute__ ((noinline)) +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alphar, float alphai, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC) +{ + if(m==0||n==0||k==0) return 0; + int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float) * 2; + float constval[2]; constval[0] = alphar; constval[1] = alphai; + float *const_val=constval; + int64_t M = (int64_t)m, K = (int64_t)k; + BLASLONG n_count = n; + float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B; + for(;n_count>11;n_count-=12) COMPUTE(12) + for(;n_count>7;n_count-=8) COMPUTE(8) + for(;n_count>3;n_count-=4) COMPUTE(4) + for(;n_count>1;n_count-=2) COMPUTE(2) + if(n_count>0) COMPUTE(1) + return 0; +} From ed9af2f7dae61a23a18aab11025e1b4e586f5a51 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Fri, 27 Dec 2019 18:01:38 +0800 Subject: [PATCH 22/27] Update KERNEL.HASWELL --- kernel/x86_64/KERNEL.HASWELL | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 9bd34f1e3..bdebd22b9 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -97,6 +97,6 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S +CGEMM3MKERNEL = cgemm3m_kernel_8x4_haswell.c ZGEMM3MKERNEL = 
zgemm3m_kernel_2x8_nehalem.S From 4c35b8dbaacfe23f76c48517674da8cf01cd2828 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Fri, 27 Dec 2019 18:03:01 +0800 Subject: [PATCH 23/27] Update gemm3m_level3.c --- driver/level3/gemm3m_level3.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/driver/level3/gemm3m_level3.c b/driver/level3/gemm3m_level3.c index bbde7e5d1..d037e72cd 100644 --- a/driver/level3/gemm3m_level3.c +++ b/driver/level3/gemm3m_level3.c @@ -338,7 +338,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; - if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; + if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3; START_RPCC(); @@ -398,7 +398,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; - if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; + if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3; START_RPCC(); @@ -463,7 +463,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; - if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; + if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3; START_RPCC(); From 3a66c8cac18dbbc172fc703feab22f53755a52c9 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Fri, 27 Dec 2019 18:04:08 +0800 Subject: [PATCH 24/27] Update KERNEL.ZEN --- kernel/x86_64/KERNEL.ZEN | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.ZEN b/kernel/x86_64/KERNEL.ZEN index aa4ba4834..025db515e 100644 --- a/kernel/x86_64/KERNEL.ZEN +++ b/kernel/x86_64/KERNEL.ZEN @@ -94,6 +94,6 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S +CGEMM3MKERNEL = cgemm3m_kernel_8x4_haswell.c ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S From 64639f440f7e9cf630100e4b03999e9321018876 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Fri, 27 Dec 2019 18:06:42 +0800 Subject: [PATCH 25/27] Update param.h --- param.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/param.h b/param.h index d80bbf4f2..4084c781d 100644 --- a/param.h +++ b/param.h @@ -693,15 +693,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define XGEMM_DEFAULT_R xgemm_r #define XGEMM_DEFAULT_Q 128 -#define CGEMM3M_DEFAULT_UNROLL_N 8 -#define CGEMM3M_DEFAULT_UNROLL_M 4 +#define CGEMM3M_DEFAULT_UNROLL_N 4 +#define CGEMM3M_DEFAULT_UNROLL_M 8 #define ZGEMM3M_DEFAULT_UNROLL_N 8 #define ZGEMM3M_DEFAULT_UNROLL_M 2 -#define CGEMM3M_DEFAULT_P 448 +#define CGEMM3M_DEFAULT_P 320 #define ZGEMM3M_DEFAULT_P 224 #define XGEMM3M_DEFAULT_P 112 -#define CGEMM3M_DEFAULT_Q 224 +#define CGEMM3M_DEFAULT_Q 320 #define ZGEMM3M_DEFAULT_Q 224 #define XGEMM3M_DEFAULT_Q 224 #define CGEMM3M_DEFAULT_R 12288 @@ -1596,15 +1596,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define XGEMM_DEFAULT_R xgemm_r #define XGEMM_DEFAULT_Q 128 -#define CGEMM3M_DEFAULT_UNROLL_N 8 -#define CGEMM3M_DEFAULT_UNROLL_M 4 +#define CGEMM3M_DEFAULT_UNROLL_N 4 +#define CGEMM3M_DEFAULT_UNROLL_M 8 #define ZGEMM3M_DEFAULT_UNROLL_N 8 #define ZGEMM3M_DEFAULT_UNROLL_M 2 -#define CGEMM3M_DEFAULT_P 448 +#define CGEMM3M_DEFAULT_P 320 #define ZGEMM3M_DEFAULT_P 224 #define XGEMM3M_DEFAULT_P 112 -#define CGEMM3M_DEFAULT_Q 224 +#define CGEMM3M_DEFAULT_Q 320 #define ZGEMM3M_DEFAULT_Q 224 #define XGEMM3M_DEFAULT_Q 224 #define CGEMM3M_DEFAULT_R 12288 From cd765f094b52bc010091f4782e232706a854ea90 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Fri, 27 Dec 2019 18:23:29 +0800 Subject: [PATCH 26/27] Update cgemm3m_kernel_8x4_haswell.c --- kernel/x86_64/cgemm3m_kernel_8x4_haswell.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/cgemm3m_kernel_8x4_haswell.c b/kernel/x86_64/cgemm3m_kernel_8x4_haswell.c index 831f25483..01fbf3064 100644 --- a/kernel/x86_64/cgemm3m_kernel_8x4_haswell.c +++ b/kernel/x86_64/cgemm3m_kernel_8x4_haswell.c @@ -255,7 +255,7 @@ "33105"#ndim":\n\t"\ "movq %%r13,%4; movq %%r14,%1; movq %%r11,%7;"\ :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(const_val),"+r"(M),"+r"(next_b)\ - ::"r11","r12","r13","r14","r15"\ + ::"r11","r12","r13","r14","r15",\ "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","cc","memory");\ a_pointer -= M * K; b_pointer += ndim * K; c_pointer += 2*(LDC * ndim - M);\ } From 312060d0d6b720eeb9bafbddaeedf3ba968a3732 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Fri, 27 Dec 2019 23:36:13 +0800 Subject: [PATCH 27/27] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 6d30ee942..fd759913d 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -178,3 +178,4 @@ In chronological order: * [2019-11-06] optimize AVX512 SGEMM * [2019-11-12] AVX512 CGEMM & ZGEMM kernels * [2019-12-23] optimize AVX2 CGEMM and ZGEMM + * [2019-12-27] AVX2 CGEMM3M kernel
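For context on the CGEMM3M patches above (21 through 25): the "3m" scheme forms a complex product from three real multiplications per element instead of four, which is why cgemm3m_kernel_8x4_haswell.c can run a plain real vfmadd231ps inner loop and apply the complex alpha only in its SAVE macros. Below is a minimal scalar sketch of that identity, for illustration only; the function and variable names are assumptions of this note and do not appear in OpenBLAS.

#include <stdio.h>

/* 3m complex multiply: (ar + i*ai) * (br + i*bi) with three real products. */
static void cmul_3m(float ar, float ai, float br, float bi, float *cr, float *ci)
{
    float m1 = ar * br;               /* real * real              */
    float m2 = ai * bi;               /* imag * imag              */
    float m3 = (ar + ai) * (br + bi); /* combined cross term      */
    *cr = m1 - m2;                    /* Re(a*b) = m1 - m2        */
    *ci = m3 - m1 - m2;               /* Im(a*b) = m3 - m1 - m2   */
}

int main(void)
{
    float cr, ci;
    cmul_3m(1.0f, 2.0f, 3.0f, 4.0f, &cr, &ci); /* (1+2i)*(3+4i) = -5+10i */
    printf("%g + %gi\n", cr, ci);
    return 0;
}

Applied to matrices, the three products become three real GEMM passes over packed panels of A and B, handled by driver/level3/gemm3m_level3.c; the kernel itself only sees real arithmetic in its k-loop and mixes the real result into the complex C with the broadcast (alphar, alphai) pair at store time.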