Merge branch 'develop' into bulldozer
Conflicts: kernel/x86_64/KERNEL.BULLDOZER
This commit is contained in:
@@ -388,7 +388,7 @@ $(KDIR)xgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerv_k$(TSUFFIX).$(PSUFFIX) : $(KER
|
||||
$(CC) -c $(CFLAGS) -DXDOUBLE -UCONJ -DXCONJ $< -o $@
|
||||
|
||||
$(KDIR)xgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERCKERNEL) $(XGERPARAM)
|
||||
$(CC) -c $(CFLAGS) -DXDOUBLE -DCONJ-DXCONJ $< -o $@
|
||||
$(CC) -c $(CFLAGS) -DXDOUBLE -DCONJ -DXCONJ $< -o $@
|
||||
|
||||
$(KDIR)chemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_U_KERNEL) $(CHEMV_U_PARAM)
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV $< -o $@
|
||||
|
||||
@@ -1206,328 +1206,328 @@ $(KDIR)xhemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_M
|
||||
$(KDIR)xhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@
|
||||
|
||||
$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c
|
||||
@@ -2608,328 +2608,328 @@ $(KDIR)xhemm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_
|
||||
$(KDIR)xhemm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c
|
||||
$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@
|
||||
|
||||
$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c
|
||||
$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c
|
||||
$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c
|
||||
$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c
|
||||
$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c
|
||||
$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||
|
||||
$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c
|
||||
$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
$(KDIR)strsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c
|
||||
|
||||
@@ -826,6 +826,22 @@ static void init_parameter(void) {
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef PILEDRIVER
|
||||
|
||||
#ifdef DEBUG
|
||||
fprintf(stderr, "Piledriver\n");
|
||||
#endif
|
||||
|
||||
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
|
||||
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
|
||||
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
|
||||
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
|
||||
#ifdef EXPRECISION
|
||||
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
|
||||
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef NANO
|
||||
|
||||
#ifdef DEBUG
|
||||
|
||||
59
kernel/x86/KERNEL.PILEDRIVER
Normal file
59
kernel/x86/KERNEL.PILEDRIVER
Normal file
@@ -0,0 +1,59 @@
|
||||
# SGEMM selection: the kernel is the 4x4 Barcelona assembly file. The IN/IT
# copy sources and their object names are deliberately left empty; only the
# ON/OT copies are set, both taken from the generic *_4.c sources.
SGEMMKERNEL = gemm_kernel_4x4_barcelona.S
|
||||
SGEMMINCOPY =
|
||||
SGEMMITCOPY =
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
SGEMMINCOPYOBJ =
|
||||
SGEMMITCOPYOBJ =
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
# DGEMM selection: the kernel is the 2x4 Barcelona assembly file. The IN/IT
# copies use the generic *_2.c sources and the ON/OT copies the generic
# *_4.c sources; all four copy objects are named.
DGEMMKERNEL = gemm_kernel_2x4_barcelona.S
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_2.c
|
||||
DGEMMITCOPY = ../generic/gemm_tcopy_2.c
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
# CGEMM selection: the kernel is the 2x2 Barcelona zgemm assembly file. The
# IN/IT copy sources and their object names are deliberately left empty; only
# the ON/OT copies are set, both taken from the generic zgemm *_2.c sources.
CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
|
||||
CGEMMINCOPY =
|
||||
CGEMMITCOPY =
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMINCOPYOBJ =
|
||||
CGEMMITCOPYOBJ =
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
# ZGEMM selection: the kernel is the 1x2 Barcelona zgemm assembly file. The
# IN/IT copies use the generic zgemm *_1.c sources and the ON/OT copies the
# generic zgemm *_2.c sources; all four copy objects are named.
ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c
|
||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
# STRSM selection: 4x4 SSE assembly files. LN and RT each have their own
# file; RN is assigned the same LT file as LT.
STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S
|
||||
STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S
|
||||
STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S
|
||||
STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S
|
||||
|
||||
# DTRSM selection: 2x4 SSE2 assembly files. LN and RT each have their own
# file; RN is assigned the same LT file as LT.
DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S
|
||||
DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S
|
||||
DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S
|
||||
DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S
|
||||
|
||||
# CTRSM selection: 2x2 SSE ztrsm assembly files. LN and RT each have their
# own file; RN is assigned the same LT file as LT.
CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S
|
||||
CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S
|
||||
CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S
|
||||
CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S
|
||||
|
||||
# ZTRSM selection: 1x2 SSE2 ztrsm assembly files. LN, LT and RN are all
# assigned the LT file; only RT has its own file.
# NOTE(review): the STRSM/DTRSM/CTRSM groups in this file give LN a dedicated
# LN file - confirm that mapping LN to the LT file is intended here.
ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S
|
||||
ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S
|
||||
ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S
|
||||
ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S
|
||||
|
||||
# GEMM3M selection: CGEMM3M uses the 4x4 and ZGEMM3M the 2x4 Barcelona
# zgemm3m assembly file.
CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
|
||||
ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S
|
||||
@@ -101,10 +101,10 @@
|
||||
#define Y 36 + STACKSIZE+ARGS(%esp)
|
||||
#define STACK_INCY 40 + STACKSIZE+ARGS(%esp)
|
||||
#define BUFFER 44 + STACKSIZE+ARGS(%esp)
|
||||
|
||||
#define MMM 0+ARGS(%esp)
|
||||
#define YY 4+ARGS(%esp)
|
||||
#define AA 8+ARGS(%esp)
|
||||
#define LDAX 12+ARGS(%esp)
|
||||
|
||||
#define I %eax
|
||||
#define J %ebx
|
||||
@@ -153,8 +153,8 @@
|
||||
|
||||
movl YY,J
|
||||
movl J,Y
|
||||
movl STACK_LDA, LDA
|
||||
|
||||
movl STACK_LDA, LDA
|
||||
movl STACK_X, X
|
||||
movl STACK_INCX, INCX
|
||||
|
||||
@@ -688,9 +688,9 @@
|
||||
movl M,J
|
||||
leal (,J,SIZE),%eax
|
||||
addl %eax,AA
|
||||
movl YY,J
|
||||
addl %eax,J
|
||||
movl J,YY
|
||||
movl STACK_INCY,INCY
|
||||
imull INCY,%eax
|
||||
addl %eax,YY
|
||||
jmp .L0t
|
||||
ALIGN_4
|
||||
|
||||
|
||||
@@ -714,9 +714,9 @@
|
||||
movl M,J
|
||||
leal (,J,SIZE),%eax
|
||||
addl %eax,AA
|
||||
movl YY,J
|
||||
addl %eax,J
|
||||
movl J,YY
|
||||
movl STACK_INCY,INCY
|
||||
imull INCY,%eax
|
||||
addl %eax,YY
|
||||
jmp .L0t
|
||||
ALIGN_4
|
||||
|
||||
|
||||
@@ -102,11 +102,9 @@
|
||||
#define STACK_INCY 40 + STACKSIZE+ARGS(%esp)
|
||||
#define BUFFER 44 + STACKSIZE+ARGS(%esp)
|
||||
|
||||
#define MMM 0+STACKSIZE(%esp)
|
||||
#define NN 4+STACKSIZE(%esp)
|
||||
#define AA 8+STACKSIZE(%esp)
|
||||
#define LDAX 12+STACKSIZE(%esp)
|
||||
#define XX 16+STACKSIZE(%esp)
|
||||
#define MMM 0+ARGS(%esp)
|
||||
#define AA 4+ARGS(%esp)
|
||||
#define XX 8+ARGS(%esp)
|
||||
|
||||
#define I %eax
|
||||
#define J %ebx
|
||||
@@ -129,12 +127,8 @@
|
||||
|
||||
PROFCODE
|
||||
|
||||
movl STACK_LDA, LDA
|
||||
movl LDA,LDAX # backup LDA
|
||||
movl STACK_X, X
|
||||
movl X,XX
|
||||
movl N,J
|
||||
movl J,NN # backup N
|
||||
movl A,J
|
||||
movl J,AA # backup A
|
||||
movl M,J
|
||||
@@ -144,7 +138,6 @@
|
||||
addl $1,J
|
||||
sall $22,J # J=2^22 (shift by 22); 2^22*sizeof(float)=buffer size(16MB)
|
||||
subl $8, J # Don't use last 8 float in the buffer.
|
||||
# Now, split M by block J
|
||||
subl J,MMM # MMM=MMM-J
|
||||
movl J,M
|
||||
jge .L00t
|
||||
@@ -159,13 +152,10 @@
|
||||
movl AA,%eax
|
||||
movl %eax,A # mov AA to A
|
||||
|
||||
movl NN,%eax
|
||||
movl %eax,N # reset N
|
||||
|
||||
|
||||
movl LDAX, LDA # reset LDA
|
||||
movl XX,X
|
||||
movl XX,%eax
|
||||
movl %eax,X
|
||||
|
||||
movl STACK_LDA, LDA
|
||||
movl STACK_INCX, INCX
|
||||
movl STACK_INCY, INCY
|
||||
|
||||
@@ -688,9 +678,9 @@
|
||||
movl M,J
|
||||
leal (,J,SIZE),%eax
|
||||
addl %eax,AA
|
||||
movl XX,J
|
||||
addl %eax,J
|
||||
movl J,XX
|
||||
movl STACK_INCX,INCX
|
||||
imull INCX,%eax
|
||||
addl %eax,XX
|
||||
jmp .L0t
|
||||
ALIGN_4
|
||||
|
||||
|
||||
@@ -76,7 +76,7 @@
|
||||
#endif
|
||||
|
||||
#define STACKSIZE 16
|
||||
#define ARGS 16
|
||||
#define ARGS 20
|
||||
|
||||
#define M 4 + STACKSIZE+ARGS(%esp)
|
||||
#define N 8 + STACKSIZE+ARGS(%esp)
|
||||
@@ -89,10 +89,9 @@
|
||||
#define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
|
||||
#define BUFFER 48 + STACKSIZE+ARGS(%esp)
|
||||
|
||||
#define MMM 0+STACKSIZE(%esp)
|
||||
#define AA 4+STACKSIZE(%esp)
|
||||
#define LDAX 8+STACKSIZE(%esp)
|
||||
#define NN 12+STACKSIZE(%esp)
|
||||
#define MMM 0+ARGS(%esp)
|
||||
#define AA 4+ARGS(%esp)
|
||||
#define XX 8+ARGS(%esp)
|
||||
|
||||
#define I %eax
|
||||
#define J %ebx
|
||||
@@ -117,10 +116,8 @@
|
||||
PROFCODE
|
||||
|
||||
|
||||
movl STACK_LDA, LDA
|
||||
movl LDA,LDAX # backup LDA
|
||||
movl N,J
|
||||
movl J,NN # backup N
|
||||
movl STACK_X, X
|
||||
movl X,XX
|
||||
movl A,J
|
||||
movl J,AA # backup A
|
||||
movl M,J
|
||||
@@ -130,7 +127,6 @@
|
||||
addl $1,J
|
||||
sall $21,J # J=2^21*sizeof(double)=buffer size(16MB)
|
||||
subl $4, J # Don't use last 4 double in the buffer.
|
||||
# Now, split M by block J
|
||||
subl J,MMM # MMM=MMM-J
|
||||
movl J,M
|
||||
jge .L00t
|
||||
@@ -142,15 +138,13 @@
|
||||
movl %eax,M
|
||||
|
||||
.L00t:
|
||||
movl XX,%eax
|
||||
movl %eax, X
|
||||
|
||||
movl AA,%eax
|
||||
movl %eax,A # mov AA to A
|
||||
|
||||
movl NN,%eax
|
||||
movl %eax,N # reset N
|
||||
|
||||
|
||||
movl LDAX, LDA # reset LDA
|
||||
movl STACK_X, X
|
||||
movl STACK_LDA, LDA
|
||||
movl STACK_INCX, INCX
|
||||
movl STACK_INCY, INCY
|
||||
|
||||
@@ -605,6 +599,9 @@
|
||||
movl M,J
|
||||
leal (,J,SIZE),%eax
|
||||
addl %eax,AA
|
||||
movl STACK_INCX,INCX
|
||||
imull INCX,%eax
|
||||
addl %eax,XX
|
||||
jmp .L0t
|
||||
ALIGN_4
|
||||
|
||||
|
||||
@@ -74,11 +74,11 @@
|
||||
#else
|
||||
movl %eax, %ecx
|
||||
subl $32, %ecx
|
||||
cmovg %ecx, %eax
|
||||
cmovge %ecx, %eax
|
||||
|
||||
movl %edx, %ecx
|
||||
subl $32, %ecx
|
||||
cmovg %ecx, %edx
|
||||
cmovge %ecx, %edx
|
||||
|
||||
subl %eax, %edx
|
||||
movl $0, %eax
|
||||
|
||||
@@ -69,7 +69,7 @@
|
||||
#define STACK_ALIGN 4096
|
||||
#define STACK_OFFSET 1024
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHSIZE (8 * 10 + 4)
|
||||
#endif
|
||||
@@ -439,7 +439,7 @@
|
||||
.L22:
|
||||
mulsd %xmm0, %xmm2
|
||||
addsd %xmm2, %xmm4
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movlpd 2 * SIZE(BB), %xmm2
|
||||
@@ -488,7 +488,7 @@
|
||||
movlpd 40 * SIZE(BB), %xmm3
|
||||
addsd %xmm0, %xmm7
|
||||
movlpd 8 * SIZE(AA), %xmm0
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
|
||||
#endif
|
||||
mulsd %xmm1, %xmm2
|
||||
@@ -1697,7 +1697,7 @@
|
||||
|
||||
.L42:
|
||||
mulpd %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
mulpd 2 * SIZE(BB), %xmm0
|
||||
@@ -1727,7 +1727,7 @@
|
||||
addpd %xmm0, %xmm7
|
||||
movapd 16 * SIZE(AA), %xmm0
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
|
||||
#endif
|
||||
mulpd %xmm1, %xmm2
|
||||
|
||||
@@ -64,7 +64,7 @@
|
||||
#define BORIG 60(%esp)
|
||||
#define BUFFER 128(%esp)
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHW prefetchw
|
||||
#define PREFETCHSIZE (16 * 10 + 8)
|
||||
@@ -437,7 +437,7 @@
|
||||
.L32:
|
||||
mulss %xmm0, %xmm2
|
||||
addss %xmm2, %xmm4
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movss 4 * SIZE(BB), %xmm2
|
||||
@@ -833,7 +833,7 @@
|
||||
.L22:
|
||||
mulps %xmm0, %xmm2
|
||||
addps %xmm2, %xmm4
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movaps 4 * SIZE(BB), %xmm2
|
||||
@@ -1848,7 +1848,7 @@
|
||||
|
||||
.L72:
|
||||
mulss %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
mulss 4 * SIZE(BB), %xmm0
|
||||
@@ -2109,7 +2109,7 @@
|
||||
ALIGN_4
|
||||
|
||||
.L62:
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
|
||||
@@ -2429,7 +2429,7 @@
|
||||
|
||||
.L52:
|
||||
mulps %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
mulps 4 * SIZE(BB), %xmm0
|
||||
@@ -2459,7 +2459,7 @@
|
||||
addps %xmm0, %xmm5
|
||||
movaps 32 * SIZE(AA), %xmm0
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||
#endif
|
||||
mulps %xmm1, %xmm2
|
||||
@@ -2952,7 +2952,7 @@
|
||||
|
||||
.L112:
|
||||
mulss %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movss 1 * SIZE(AA), %xmm0
|
||||
@@ -3148,7 +3148,7 @@
|
||||
|
||||
.L102:
|
||||
mulps %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movsd 2 * SIZE(AA), %xmm0
|
||||
@@ -3389,7 +3389,7 @@
|
||||
|
||||
.L92:
|
||||
mulps %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movaps 4 * SIZE(AA), %xmm0
|
||||
@@ -3404,7 +3404,7 @@
|
||||
mulps 12 * SIZE(BB), %xmm0
|
||||
addps %xmm0, %xmm7
|
||||
movaps 32 * SIZE(AA), %xmm0
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||
#endif
|
||||
mulps %xmm1, %xmm3
|
||||
|
||||
@@ -69,7 +69,7 @@
|
||||
#define STACK_ALIGN 4096
|
||||
#define STACK_OFFSET 1024
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHSIZE (8 * 10 + 4)
|
||||
#endif
|
||||
@@ -910,7 +910,7 @@
|
||||
.L22:
|
||||
mulsd %xmm0, %xmm2
|
||||
addsd %xmm2, %xmm4
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movlpd 2 * SIZE(BB), %xmm2
|
||||
@@ -959,7 +959,7 @@
|
||||
movlpd 40 * SIZE(BB), %xmm3
|
||||
addsd %xmm0, %xmm7
|
||||
movlpd 8 * SIZE(AA), %xmm0
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
|
||||
#endif
|
||||
mulsd %xmm1, %xmm2
|
||||
@@ -1439,7 +1439,7 @@
|
||||
|
||||
.L42:
|
||||
mulpd %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
mulpd 2 * SIZE(BB), %xmm0
|
||||
@@ -1469,7 +1469,7 @@
|
||||
addpd %xmm0, %xmm7
|
||||
movapd 16 * SIZE(AA), %xmm0
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
|
||||
#endif
|
||||
mulpd %xmm1, %xmm2
|
||||
|
||||
@@ -64,7 +64,7 @@
|
||||
#define BORIG 60(%esp)
|
||||
#define BUFFER 128(%esp)
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHW prefetchw
|
||||
#define PREFETCHSIZE (16 * 10 + 8)
|
||||
@@ -872,7 +872,7 @@
|
||||
.L22:
|
||||
mulps %xmm0, %xmm2
|
||||
addps %xmm2, %xmm4
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movaps 4 * SIZE(BB), %xmm2
|
||||
@@ -1316,7 +1316,7 @@
|
||||
.L32:
|
||||
mulss %xmm0, %xmm2
|
||||
addss %xmm2, %xmm4
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movss 4 * SIZE(BB), %xmm2
|
||||
@@ -1855,7 +1855,7 @@
|
||||
|
||||
.L52:
|
||||
mulps %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
mulps 4 * SIZE(BB), %xmm0
|
||||
@@ -1885,7 +1885,7 @@
|
||||
addps %xmm0, %xmm5
|
||||
movaps 32 * SIZE(AA), %xmm0
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||
#endif
|
||||
mulps %xmm1, %xmm2
|
||||
@@ -2249,7 +2249,7 @@
|
||||
ALIGN_4
|
||||
|
||||
.L62:
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
|
||||
@@ -2562,7 +2562,7 @@
|
||||
|
||||
.L72:
|
||||
mulss %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
mulss 4 * SIZE(BB), %xmm0
|
||||
@@ -2957,7 +2957,7 @@
|
||||
|
||||
.L92:
|
||||
mulps %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movaps 4 * SIZE(AA), %xmm0
|
||||
@@ -2972,7 +2972,7 @@
|
||||
mulps 12 * SIZE(BB), %xmm0
|
||||
addps %xmm0, %xmm7
|
||||
movaps 32 * SIZE(AA), %xmm0
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||
#endif
|
||||
mulps %xmm1, %xmm3
|
||||
@@ -3280,7 +3280,7 @@
|
||||
|
||||
.L102:
|
||||
mulps %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movsd 2 * SIZE(AA), %xmm0
|
||||
@@ -3515,7 +3515,7 @@
|
||||
|
||||
.L112:
|
||||
mulss %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movss 1 * SIZE(AA), %xmm0
|
||||
|
||||
@@ -69,7 +69,7 @@
|
||||
#define STACK_ALIGN 4096
|
||||
#define STACK_OFFSET 1024
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHSIZE (8 * 10 + 4)
|
||||
#endif
|
||||
@@ -1036,7 +1036,7 @@
|
||||
|
||||
.L42:
|
||||
mulpd %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
mulpd 2 * SIZE(BB), %xmm0
|
||||
@@ -1066,7 +1066,7 @@
|
||||
addpd %xmm0, %xmm7
|
||||
movapd 16 * SIZE(AA), %xmm0
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
|
||||
#endif
|
||||
mulpd %xmm1, %xmm2
|
||||
@@ -2224,7 +2224,7 @@
|
||||
.L22:
|
||||
mulsd %xmm0, %xmm2
|
||||
addsd %xmm2, %xmm4
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movlpd 2 * SIZE(BB), %xmm2
|
||||
@@ -2273,7 +2273,7 @@
|
||||
movlpd 40 * SIZE(BB), %xmm3
|
||||
addsd %xmm0, %xmm7
|
||||
movlpd 8 * SIZE(AA), %xmm0
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
|
||||
#endif
|
||||
mulsd %xmm1, %xmm2
|
||||
|
||||
@@ -64,7 +64,7 @@
|
||||
#define BORIG 60(%esp)
|
||||
#define BUFFER 128(%esp)
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHW prefetchw
|
||||
#define PREFETCHSIZE (16 * 10 + 8)
|
||||
@@ -439,7 +439,7 @@
|
||||
|
||||
.L92:
|
||||
mulps %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movaps 4 * SIZE(AA), %xmm0
|
||||
@@ -454,7 +454,7 @@
|
||||
mulps 12 * SIZE(BB), %xmm0
|
||||
addps %xmm0, %xmm7
|
||||
movaps 32 * SIZE(AA), %xmm0
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||
#endif
|
||||
mulps %xmm1, %xmm3
|
||||
@@ -758,7 +758,7 @@
|
||||
|
||||
.L102:
|
||||
mulps %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movsd 2 * SIZE(AA), %xmm0
|
||||
@@ -993,7 +993,7 @@
|
||||
|
||||
.L112:
|
||||
mulss %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movss 1 * SIZE(AA), %xmm0
|
||||
@@ -1324,7 +1324,7 @@
|
||||
|
||||
.L52:
|
||||
mulps %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
mulps 4 * SIZE(BB), %xmm0
|
||||
@@ -1354,7 +1354,7 @@
|
||||
addps %xmm0, %xmm5
|
||||
movaps 32 * SIZE(AA), %xmm0
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||
#endif
|
||||
mulps %xmm1, %xmm2
|
||||
@@ -1718,7 +1718,7 @@
|
||||
ALIGN_4
|
||||
|
||||
.L62:
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
|
||||
@@ -2031,7 +2031,7 @@
|
||||
|
||||
.L72:
|
||||
mulss %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
mulss 4 * SIZE(BB), %xmm0
|
||||
@@ -2859,7 +2859,7 @@
|
||||
.L22:
|
||||
mulps %xmm0, %xmm2
|
||||
addps %xmm2, %xmm4
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movaps 4 * SIZE(BB), %xmm2
|
||||
@@ -3303,7 +3303,7 @@
|
||||
.L32:
|
||||
mulss %xmm0, %xmm2
|
||||
addss %xmm2, %xmm4
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movss 4 * SIZE(BB), %xmm2
|
||||
|
||||
@@ -89,18 +89,23 @@
|
||||
#endif
|
||||
|
||||
#define STACKSIZE 16
|
||||
#define ARGS 20
|
||||
|
||||
#define M 4 + STACKSIZE(%esp)
|
||||
#define N 8 + STACKSIZE(%esp)
|
||||
#define ALPHA_R 16 + STACKSIZE(%esp)
|
||||
#define ALPHA_I 20 + STACKSIZE(%esp)
|
||||
#define A 24 + STACKSIZE(%esp)
|
||||
#define STACK_LDA 28 + STACKSIZE(%esp)
|
||||
#define STACK_X 32 + STACKSIZE(%esp)
|
||||
#define STACK_INCX 36 + STACKSIZE(%esp)
|
||||
#define Y 40 + STACKSIZE(%esp)
|
||||
#define STACK_INCY 44 + STACKSIZE(%esp)
|
||||
#define BUFFER 48 + STACKSIZE(%esp)
|
||||
#define M 4 + STACKSIZE+ARGS(%esp)
|
||||
#define N 8 + STACKSIZE+ARGS(%esp)
|
||||
#define ALPHA_R 16 + STACKSIZE+ARGS(%esp)
|
||||
#define ALPHA_I 20 + STACKSIZE+ARGS(%esp)
|
||||
#define A 24 + STACKSIZE+ARGS(%esp)
|
||||
#define STACK_LDA 28 + STACKSIZE+ARGS(%esp)
|
||||
#define STACK_X 32 + STACKSIZE+ARGS(%esp)
|
||||
#define STACK_INCX 36 + STACKSIZE+ARGS(%esp)
|
||||
#define Y 40 + STACKSIZE+ARGS(%esp)
|
||||
#define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
|
||||
#define BUFFER 48 + STACKSIZE+ARGS(%esp)
|
||||
|
||||
#define MMM 0+ARGS(%esp)
|
||||
#define YY 4+ARGS(%esp)
|
||||
#define AA 8+ARGS(%esp)
|
||||
|
||||
#define I %eax
|
||||
#define J %ebx
|
||||
@@ -123,6 +128,7 @@
|
||||
|
||||
PROLOGUE
|
||||
|
||||
subl $ARGS,%esp
|
||||
pushl %ebp
|
||||
pushl %edi
|
||||
pushl %esi
|
||||
@@ -130,6 +136,33 @@
|
||||
|
||||
PROFCODE
|
||||
|
||||
movl Y,J # copy Y into the YY scratch slot (via J)
|
||||
movl J,YY
|
||||
movl A,J # copy A into the AA scratch slot (via J)
|
||||
movl J,AA
|
||||
movl M,J # MMM starts out as the full M
|
||||
movl J,MMM
|
||||
.L0t:
|
||||
xorl J,J # these three build J = 1 << 20, the cap on M per pass
|
||||
addl $1,J
|
||||
sall $20,J
|
||||
subl J,MMM # MMM -= J; the jge below tests this result
|
||||
movl J,M # M = J (movl leaves the flags untouched)
|
||||
jge .L00t # MMM >= 0 after the subtraction: run a pass with M = J
|
||||
ALIGN_3
|
||||
|
|
||||
movl MMM,%eax # eax = MMM + J, the amount left before the subtraction
|
||||
addl J,%eax
|
||||
jle .L999x # nothing left: leave through the exit sequence
|
||||
movl %eax,M # last pass runs with the leftover amount
|
||||
|
|
||||
.L00t:
|
||||
movl AA,%eax # reload A from AA for this pass
|
||||
movl %eax,A
|
||||
|
|
||||
movl YY,J # reload Y from YY for this pass
|
||||
movl J,Y
|
||||
|
||||
movl STACK_LDA, LDA
|
||||
movl STACK_X, X
|
||||
movl STACK_INCX, INCX
|
||||
@@ -595,10 +628,21 @@
|
||||
ALIGN_3
|
||||
|
||||
.L999:
|
||||
movl M,%eax # eax = M << ZBASE_SHIFT
|
||||
sall $ZBASE_SHIFT,%eax
|
||||
addl %eax,AA # AA += eax
|
||||
movl STACK_INCY,INCY # reload INCY, then eax *= INCY
|
||||
imull INCY,%eax
|
||||
addl %eax,YY # YY += eax
|
||||
jmp .L0t # back to .L0t for the next pass
|
||||
ALIGN_3
|
||||
|
|
||||
.L999x:
|
||||
popl %ebx
|
||||
popl %esi
|
||||
popl %edi
|
||||
popl %ebp
|
||||
addl $ARGS,%esp # undo the subl $ARGS,%esp done after PROLOGUE
|
||||
ret
|
||||
|
||||
EPILOGUE
|
||||
|
||||
@@ -76,18 +76,23 @@
|
||||
#endif
|
||||
|
||||
#define STACKSIZE 16
|
||||
#define ARGS 16
|
||||
|
||||
#define M 4 + STACKSIZE+ARGS(%esp)
|
||||
#define N 8 + STACKSIZE+ARGS(%esp)
|
||||
#define ALPHA_R 16 + STACKSIZE+ARGS(%esp)
|
||||
#define ALPHA_I 24 + STACKSIZE+ARGS(%esp)
|
||||
#define A 32 + STACKSIZE+ARGS(%esp)
|
||||
#define STACK_LDA 36 + STACKSIZE+ARGS(%esp)
|
||||
#define STACK_X 40 + STACKSIZE+ARGS(%esp)
|
||||
#define STACK_INCX 44 + STACKSIZE+ARGS(%esp)
|
||||
#define Y 48 + STACKSIZE+ARGS(%esp)
|
||||
#define STACK_INCY 52 + STACKSIZE+ARGS(%esp)
|
||||
#define BUFFER 56 + STACKSIZE+ARGS(%esp)
|
||||
#define MMM 0 + ARGS(%esp)
|
||||
#define YY 4 + ARGS(%esp)
|
||||
#define AA 8 + ARGS(%esp)
|
||||
|
||||
#define M 4 + STACKSIZE(%esp)
|
||||
#define N 8 + STACKSIZE(%esp)
|
||||
#define ALPHA_R 16 + STACKSIZE(%esp)
|
||||
#define ALPHA_I 24 + STACKSIZE(%esp)
|
||||
#define A 32 + STACKSIZE(%esp)
|
||||
#define STACK_LDA 36 + STACKSIZE(%esp)
|
||||
#define STACK_X 40 + STACKSIZE(%esp)
|
||||
#define STACK_INCX 44 + STACKSIZE(%esp)
|
||||
#define Y 48 + STACKSIZE(%esp)
|
||||
#define STACK_INCY 52 + STACKSIZE(%esp)
|
||||
#define BUFFER 56 + STACKSIZE(%esp)
|
||||
|
||||
#define I %eax
|
||||
#define J %ebx
|
||||
@@ -110,6 +115,7 @@
|
||||
|
||||
PROLOGUE
|
||||
|
||||
subl $ARGS,%esp
|
||||
pushl %ebp
|
||||
pushl %edi
|
||||
pushl %esi
|
||||
@@ -117,6 +123,33 @@
|
||||
|
||||
PROFCODE
|
||||
|
||||
movl Y,J # copy Y into the YY scratch slot (via J)
|
||||
movl J,YY
|
||||
movl A,J # copy A into the AA scratch slot (via J)
|
||||
movl J,AA
|
||||
movl M,J # MMM starts out as the full M
|
||||
movl J,MMM
|
||||
.L0t:
|
||||
xorl J,J # these three build J = 1 << 18, the cap on M per pass
|
||||
addl $1,J
|
||||
sall $18,J
|
||||
subl J,MMM # MMM -= J; the jge below tests this result
|
||||
movl J,M # M = J (movl leaves the flags untouched)
|
||||
jge .L00t # MMM >= 0 after the subtraction: run a pass with M = J
|
||||
ALIGN_3
|
||||
|
|
||||
movl MMM,%eax # eax = MMM + J, the amount left before the subtraction
|
||||
addl J,%eax
|
||||
jle .L999x # nothing left: leave through the exit sequence
|
||||
movl %eax,M # last pass runs with the leftover amount
|
||||
|
|
||||
.L00t:
|
||||
movl AA,%eax # reload A from AA for this pass
|
||||
movl %eax,A
|
||||
|
|
||||
movl YY,J # reload Y from YY for this pass
|
||||
movl J,Y
|
||||
|
||||
movl STACK_LDA, LDA
|
||||
movl STACK_X, X
|
||||
movl STACK_INCX, INCX
|
||||
@@ -458,10 +491,21 @@
|
||||
ALIGN_3
|
||||
|
||||
.L999:
|
||||
movl M,%eax # eax = M << ZBASE_SHIFT
|
||||
sall $ZBASE_SHIFT,%eax
|
||||
addl %eax,AA # AA += eax
|
||||
movl STACK_INCY,INCY # reload INCY, then eax *= INCY
|
||||
imull INCY,%eax
|
||||
addl %eax,YY # YY += eax
|
||||
jmp .L0t # back to .L0t for the next pass
|
||||
ALIGN_3
|
||||
|
|
||||
.L999x:
|
||||
popl %ebx
|
||||
popl %esi
|
||||
popl %edi
|
||||
popl %ebp
|
||||
addl $ARGS,%esp # undo the subl $ARGS,%esp done after PROLOGUE
|
||||
ret
|
||||
|
||||
EPILOGUE
|
||||
|
||||
@@ -89,18 +89,23 @@
|
||||
#endif
|
||||
|
||||
#define STACKSIZE 16
|
||||
#define ARGS 20
|
||||
|
||||
#define M 4 + STACKSIZE(%esp)
|
||||
#define N 8 + STACKSIZE(%esp)
|
||||
#define ALPHA_R 16 + STACKSIZE(%esp)
|
||||
#define ALPHA_I 20 + STACKSIZE(%esp)
|
||||
#define A 24 + STACKSIZE(%esp)
|
||||
#define STACK_LDA 28 + STACKSIZE(%esp)
|
||||
#define STACK_X 32 + STACKSIZE(%esp)
|
||||
#define STACK_INCX 36 + STACKSIZE(%esp)
|
||||
#define Y 40 + STACKSIZE(%esp)
|
||||
#define STACK_INCY 44 + STACKSIZE(%esp)
|
||||
#define BUFFER 48 + STACKSIZE(%esp)
|
||||
#define M 4 + STACKSIZE+ARGS(%esp)
|
||||
#define N 8 + STACKSIZE+ARGS(%esp)
|
||||
#define ALPHA_R 16 + STACKSIZE+ARGS(%esp)
|
||||
#define ALPHA_I 20 + STACKSIZE+ARGS(%esp)
|
||||
#define A 24 + STACKSIZE+ARGS(%esp)
|
||||
#define STACK_LDA 28 + STACKSIZE+ARGS(%esp)
|
||||
#define STACK_X 32 + STACKSIZE+ARGS(%esp)
|
||||
#define STACK_INCX 36 + STACKSIZE+ARGS(%esp)
|
||||
#define Y 40 + STACKSIZE+ARGS(%esp)
|
||||
#define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
|
||||
#define BUFFER 48 + STACKSIZE+ARGS(%esp)
|
||||
|
||||
#define MMM 0+ARGS(%esp)
|
||||
#define XX 4+ARGS(%esp)
|
||||
#define AA 8+ARGS(%esp)
|
||||
|
||||
#define I %eax
|
||||
#define J %ebx
|
||||
@@ -123,6 +128,7 @@
|
||||
|
||||
PROLOGUE
|
||||
|
||||
subl $ARGS,%esp
|
||||
pushl %ebp
|
||||
pushl %edi
|
||||
pushl %esi
|
||||
@@ -130,8 +136,35 @@
|
||||
|
||||
PROFCODE
|
||||
|
||||
movl STACK_LDA, LDA
|
||||
movl STACK_X, X
|
||||
movl X,XX
|
||||
movl A,J
|
||||
movl J,AA #backup A
|
||||
movl M,J
|
||||
movl J,MMM
|
||||
.L0t:
|
||||
xorl J,J
|
||||
addl $1,J
|
||||
sall $20,J
|
||||
subl $8,J
|
||||
subl J,MMM #MMM-=J
|
||||
movl J,M
|
||||
jge .L00t
|
||||
ALIGN_4
|
||||
|
||||
movl MMM,%eax
|
||||
addl J,%eax
|
||||
jle .L999x
|
||||
movl %eax,M
|
||||
|
||||
.L00t:
|
||||
movl AA,%eax
|
||||
movl %eax,A
|
||||
|
||||
movl XX,%eax
|
||||
movl %eax,X
|
||||
|
||||
movl STACK_LDA,LDA
|
||||
movl STACK_INCX, INCX
|
||||
movl STACK_INCY, INCY
|
||||
|
||||
@@ -513,10 +546,22 @@
|
||||
ALIGN_4
|
||||
|
||||
.L999:
|
||||
movl M,%eax
|
||||
sall $ZBASE_SHIFT, %eax
|
||||
addl %eax,AA
|
||||
movl STACK_INCX,INCX
|
||||
imull INCX,%eax
|
||||
addl %eax,XX
|
||||
jmp .L0t
|
||||
ALIGN_4
|
||||
|
||||
.L999x:
|
||||
popl %ebx
|
||||
popl %esi
|
||||
popl %edi
|
||||
popl %ebp
|
||||
|
||||
addl $ARGS,%esp
|
||||
ret
|
||||
|
||||
EPILOGUE
|
||||
|
||||
@@ -76,19 +76,24 @@
|
||||
#endif
|
||||
|
||||
#define STACKSIZE 16
|
||||
#define ARGS 20
|
||||
|
||||
#define M 4 + STACKSIZE+ARGS(%esp)
|
||||
#define N 8 + STACKSIZE+ARGS(%esp)
|
||||
#define ALPHA_R 16 + STACKSIZE+ARGS(%esp)
|
||||
#define ALPHA_I 24 + STACKSIZE+ARGS(%esp)
|
||||
#define A 32 + STACKSIZE+ARGS(%esp)
|
||||
#define STACK_LDA 36 + STACKSIZE+ARGS(%esp)
|
||||
#define STACK_X 40 + STACKSIZE+ARGS(%esp)
|
||||
#define STACK_INCX 44 + STACKSIZE+ARGS(%esp)
|
||||
#define Y 48 + STACKSIZE+ARGS(%esp)
|
||||
#define STACK_INCY 52 + STACKSIZE+ARGS(%esp)
|
||||
#define BUFFER 56 + STACKSIZE+ARGS(%esp)
|
||||
|
||||
#define MMM 0 + ARGS(%esp)
|
||||
#define AA 4 + ARGS(%esp)
|
||||
#define XX 8 + ARGS(%esp)
|
||||
|
||||
#define M 4 + STACKSIZE(%esp)
|
||||
#define N 8 + STACKSIZE(%esp)
|
||||
#define ALPHA_R 16 + STACKSIZE(%esp)
|
||||
#define ALPHA_I 24 + STACKSIZE(%esp)
|
||||
#define A 32 + STACKSIZE(%esp)
|
||||
#define STACK_LDA 36 + STACKSIZE(%esp)
|
||||
#define STACK_X 40 + STACKSIZE(%esp)
|
||||
#define STACK_INCX 44 + STACKSIZE(%esp)
|
||||
#define Y 48 + STACKSIZE(%esp)
|
||||
#define STACK_INCY 52 + STACKSIZE(%esp)
|
||||
#define BUFFER 56 + STACKSIZE(%esp)
|
||||
|
||||
#define I %eax
|
||||
#define J %ebx
|
||||
|
||||
@@ -110,6 +115,7 @@
|
||||
|
||||
PROLOGUE
|
||||
|
||||
subl $ARGS,%esp
|
||||
pushl %ebp
|
||||
pushl %edi
|
||||
pushl %esi
|
||||
@@ -117,8 +123,35 @@
|
||||
|
||||
PROFCODE
|
||||
|
||||
movl STACK_X, X
|
||||
movl X, XX
|
||||
movl A,J
|
||||
movl J,AA
|
||||
movl M,J
|
||||
movl J,MMM
|
||||
.L0t:
|
||||
xorl J,J
|
||||
addl $1,J
|
||||
sall $18,J
|
||||
subl $4,J
|
||||
subl J,MMM
|
||||
movl J,M
|
||||
jge .L00t
|
||||
ALIGN_4
|
||||
|
||||
movl MMM,%eax
|
||||
addl J,%eax
|
||||
jle .L999x
|
||||
movl %eax, M
|
||||
|
||||
.L00t:
|
||||
movl XX, %eax
|
||||
movl %eax, X
|
||||
|
||||
movl AA,%eax
|
||||
movl %eax,A
|
||||
|
||||
movl STACK_LDA, LDA
|
||||
movl STACK_X, X
|
||||
movl STACK_INCX, INCX
|
||||
movl STACK_INCY, INCY
|
||||
|
||||
@@ -188,7 +221,7 @@
|
||||
movl Y, Y1
|
||||
|
||||
movl N, J
|
||||
ALIGN_3
|
||||
ALIGN_4
|
||||
|
||||
.L11:
|
||||
movl BUFFER, X
|
||||
@@ -395,10 +428,21 @@
|
||||
ALIGN_4
|
||||
|
||||
.L999:
|
||||
movl M,%eax
|
||||
sall $ZBASE_SHIFT,%eax
|
||||
addl %eax,AA
|
||||
movl STACK_INCX,INCX
|
||||
imull INCX,%eax
|
||||
addl %eax,XX
|
||||
jmp .L0t
|
||||
ALIGN_4
|
||||
|
||||
.L999x:
|
||||
popl %ebx
|
||||
popl %esi
|
||||
popl %edi
|
||||
popl %ebp
|
||||
addl $ARGS,%esp
|
||||
ret
|
||||
|
||||
EPILOGUE
|
||||
|
||||
@@ -75,7 +75,7 @@
|
||||
#define STACK_ALIGN 4096
|
||||
#define STACK_OFFSET 1024
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
#define PREFETCHSIZE (16 * 10 + 8)
|
||||
#define WPREFETCHSIZE 112
|
||||
#define PREFETCH prefetch
|
||||
@@ -533,7 +533,7 @@
|
||||
addps %xmm0, %xmm7
|
||||
movsd 16 * SIZE(AA), %xmm0
|
||||
mulps %xmm1, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||
#endif
|
||||
addps %xmm2, %xmm4
|
||||
|
||||
@@ -75,7 +75,7 @@
|
||||
#define STACK_ALIGN 4096
|
||||
#define STACK_OFFSET 1024
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
#define PREFETCHSIZE (16 * 10 + 8)
|
||||
#define WPREFETCHSIZE 112
|
||||
#define PREFETCH prefetch
|
||||
@@ -994,7 +994,7 @@
|
||||
addps %xmm0, %xmm7
|
||||
movsd 16 * SIZE(AA), %xmm0
|
||||
mulps %xmm1, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||
#endif
|
||||
addps %xmm2, %xmm4
|
||||
|
||||
@@ -75,7 +75,7 @@
|
||||
#define STACK_ALIGN 4096
|
||||
#define STACK_OFFSET 1024
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
#define PREFETCHSIZE (16 * 10 + 8)
|
||||
#define WPREFETCHSIZE 112
|
||||
#define PREFETCH prefetch
|
||||
@@ -1820,7 +1820,7 @@
|
||||
addps %xmm0, %xmm7
|
||||
movsd 16 * SIZE(AA), %xmm0
|
||||
mulps %xmm1, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||
#endif
|
||||
addps %xmm2, %xmm4
|
||||
|
||||
@@ -1,62 +1,71 @@
|
||||
ZGEMVNKERNEL = zgemv_n_dup.S
|
||||
ZGEMVTKERNEL = zgemv_t_dup.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_8x4_bulldozer.S
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
|
||||
SGEMMONCOPY = gemm_ncopy_4_opteron.S
|
||||
SGEMMOTCOPY = gemm_tcopy_4_opteron.S
|
||||
DGEMVNKERNEL = dgemv_n_bulldozer.S
|
||||
DGEMVTKERNEL = dgemv_t_bulldozer.S
|
||||
DAXPYKERNEL = daxpy_bulldozer.S
|
||||
DDOTKERNEL = ddot_bulldozer.S
|
||||
DCOPYKERNEL = dcopy_bulldozer.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
|
||||
SGEMMONCOPY = gemm_ncopy_2_bulldozer.S
|
||||
SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
|
||||
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMKERNEL = dgemm_kernel_4x4_bulldozer.S
|
||||
DGEMMINCOPY =
|
||||
DGEMMITCOPY =
|
||||
DGEMMONCOPY = gemm_ncopy_4_opteron.S
|
||||
DGEMMOTCOPY = gemm_tcopy_4_opteron.S
|
||||
DGEMMINCOPYOBJ =
|
||||
DGEMMITCOPYOBJ =
|
||||
DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S
|
||||
DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S
|
||||
DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S
|
||||
DGEMMONCOPY = gemm_ncopy_2_bulldozer.S
|
||||
DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S
|
||||
CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
|
||||
CGEMMONCOPY = zgemm_ncopy_2.S
|
||||
CGEMMOTCOPY = zgemm_tcopy_2.S
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
|
||||
ZGEMMKERNEL = zgemm_kernel_2x2_bulldozer.S
|
||||
ZGEMMINCOPY =
|
||||
ZGEMMITCOPY =
|
||||
ZGEMMONCOPY = zgemm_ncopy_2.S
|
||||
ZGEMMOTCOPY = zgemm_tcopy_2.S
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMINCOPYOBJ =
|
||||
ZGEMMITCOPYOBJ =
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S
|
||||
STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S
|
||||
STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S
|
||||
STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S
|
||||
|
||||
DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S
|
||||
DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S
|
||||
DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S
|
||||
DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S
|
||||
|
||||
CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S
|
||||
CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S
|
||||
CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S
|
||||
CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S
|
||||
|
||||
ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S
|
||||
ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S
|
||||
ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S
|
||||
ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S
|
||||
|
||||
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
|
||||
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
|
||||
|
||||
70
kernel/x86_64/KERNEL.PILEDRIVER
Normal file
70
kernel/x86_64/KERNEL.PILEDRIVER
Normal file
@@ -0,0 +1,70 @@
|
||||
# Kernel source selection for the PILEDRIVER target.
# Every assignment is an independent recursive (=) definition, so $(TSUFFIX)
# and $(SUFFIX) are expanded only where the *OBJ names are used.
# NOTE(review): the list mirrors the new KERNEL.BULLDOZER settings; confirm
# the two are meant to stay in sync.

# ---- Level 1 / Level 2 kernels ---------------------------------------------
DAXPYKERNEL    = daxpy_bulldozer.S
DDOTKERNEL     = ddot_bulldozer.S
DCOPYKERNEL    = dcopy_bulldozer.S

DGEMVNKERNEL   = dgemv_n_bulldozer.S
DGEMVTKERNEL   = dgemv_t_bulldozer.S

ZGEMVNKERNEL   = zgemv_n_dup.S
ZGEMVTKERNEL   = zgemv_t_dup.S

# ---- SGEMM kernel and copy sources ------------------------------------------
SGEMMKERNEL    = sgemm_kernel_16x2_bulldozer.S
SGEMMINCOPY    = ../generic/gemm_ncopy_16.c
SGEMMITCOPY    = ../generic/gemm_tcopy_16.c
SGEMMONCOPY    = gemm_ncopy_2_bulldozer.S
SGEMMOTCOPY    = gemm_tcopy_2_bulldozer.S
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)

# ---- DGEMM kernel and copy sources ------------------------------------------
DGEMMKERNEL    = dgemm_kernel_8x2_bulldozer.S
DGEMMINCOPY    = dgemm_ncopy_8_bulldozer.S
DGEMMITCOPY    = dgemm_tcopy_8_bulldozer.S
DGEMMONCOPY    = gemm_ncopy_2_bulldozer.S
DGEMMOTCOPY    = gemm_tcopy_2_bulldozer.S
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)

# ---- CGEMM kernel and copy sources ------------------------------------------
CGEMMKERNEL    = cgemm_kernel_4x2_bulldozer.S
CGEMMINCOPY    = ../generic/zgemm_ncopy_4.c
CGEMMITCOPY    = ../generic/zgemm_tcopy_4.c
CGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)

# ---- ZGEMM kernel and copy sources ------------------------------------------
# The IN/IT copy sources and objects are deliberately left empty here.
ZGEMMKERNEL    = zgemm_kernel_2x2_bulldozer.S
ZGEMMINCOPY    =
ZGEMMITCOPY    =
ZGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPYOBJ =
ZGEMMITCOPYOBJ =
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

# ---- GEMM3M kernels (Barcelona sources) --------------------------------------
CGEMM3MKERNEL  = zgemm3m_kernel_8x4_barcelona.S
ZGEMM3MKERNEL  = zgemm3m_kernel_4x4_barcelona.S

# ---- TRSM kernels: generic C sources for every precision ---------------------
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
|
||||
@@ -69,7 +69,7 @@
|
||||
#endif
|
||||
movaps %xmm0, ALPHA
|
||||
#else
|
||||
movaps %xmm3, ALPHA
|
||||
|
||||
|
||||
movq 40(%rsp), X
|
||||
movq 48(%rsp), INCX
|
||||
@@ -79,6 +79,10 @@
|
||||
|
||||
SAVEREGISTERS
|
||||
|
||||
#ifdef WINDOWS_ABI
|
||||
movaps %xmm3, ALPHA
|
||||
#endif
|
||||
|
||||
shufps $0, ALPHA, ALPHA
|
||||
|
||||
leaq (, INCX, SIZE), INCX
|
||||
|
||||
@@ -69,7 +69,6 @@
|
||||
#endif
|
||||
movaps %xmm0, ALPHA
|
||||
#else
|
||||
movaps %xmm3, ALPHA
|
||||
|
||||
movq 40(%rsp), X
|
||||
movq 48(%rsp), INCX
|
||||
@@ -79,6 +78,10 @@
|
||||
|
||||
SAVEREGISTERS
|
||||
|
||||
#ifdef WINDOWS_ABI
|
||||
movaps %xmm3, ALPHA
|
||||
#endif
|
||||
|
||||
unpcklpd ALPHA, ALPHA
|
||||
|
||||
leaq (, INCX, SIZE), INCX
|
||||
|
||||
1900
kernel/x86_64/cgemm_kernel_4x2_bulldozer.S
Normal file
1900
kernel/x86_64/cgemm_kernel_4x2_bulldozer.S
Normal file
File diff suppressed because it is too large
Load Diff
@@ -47,14 +47,22 @@
|
||||
|
||||
#ifndef WINDOWS_ABI
|
||||
|
||||
#define STACKSIZE 64
|
||||
#define STACKSIZE 128
|
||||
|
||||
#define OLD_INCX 8 + STACKSIZE(%rsp)
|
||||
#define OLD_Y 16 + STACKSIZE(%rsp)
|
||||
#define OLD_INCY 24 + STACKSIZE(%rsp)
|
||||
#define OLD_BUFFER 32 + STACKSIZE(%rsp)
|
||||
#define ALPHA 48 (%rsp)
|
||||
|
||||
|
||||
#define MMM 64(%rsp)
|
||||
#define NN 72(%rsp)
|
||||
#define AA 80(%rsp)
|
||||
#define XX 88(%rsp)
|
||||
#define LDAX 96(%rsp)
|
||||
#define ALPHAR 104(%rsp)
|
||||
#define ALPHAI 112(%rsp)
|
||||
|
||||
#define M %rdi
|
||||
#define N %rsi
|
||||
#define A %rcx
|
||||
@@ -66,7 +74,7 @@
|
||||
|
||||
#else
|
||||
|
||||
#define STACKSIZE 256
|
||||
#define STACKSIZE 288
|
||||
|
||||
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
|
||||
#define OLD_A 48 + STACKSIZE(%rsp)
|
||||
@@ -78,6 +86,14 @@
|
||||
#define OLD_BUFFER 96 + STACKSIZE(%rsp)
|
||||
#define ALPHA 224 (%rsp)
|
||||
|
||||
#define MMM 232(%rsp)
|
||||
#define NN 240(%rsp)
|
||||
#define AA 248(%rsp)
|
||||
#define XX 256(%rsp)
|
||||
#define LDAX 264(%rsp)
|
||||
#define ALPHAR 272(%rsp)
|
||||
#define ALPHAI 280(%rsp)
|
||||
|
||||
#define M %rcx
|
||||
#define N %rdx
|
||||
#define A %r8
|
||||
@@ -142,9 +158,37 @@
|
||||
movaps %xmm3, %xmm0
|
||||
movss OLD_ALPHA_I, %xmm1
|
||||
#endif
|
||||
movq A, AA
|
||||
movq N, NN
|
||||
movq M, MMM
|
||||
movq LDA, LDAX
|
||||
movq X, XX
|
||||
movq OLD_Y, Y
|
||||
movss %xmm0,ALPHAR
|
||||
movss %xmm1,ALPHAI
|
||||
|
||||
.L0t:
|
||||
xorq I,I
|
||||
addq $1,I
|
||||
salq $20,I
|
||||
subq I,MMM
|
||||
movq I,M
|
||||
movss ALPHAR,%xmm0
|
||||
movss ALPHAI,%xmm1
|
||||
jge .L00t
|
||||
|
||||
movq MMM,M
|
||||
addq I,M
|
||||
jle .L999x
|
||||
|
||||
.L00t:
|
||||
movq AA, A
|
||||
movq NN, N
|
||||
movq LDAX, LDA
|
||||
movq XX, X
|
||||
|
||||
movq OLD_INCX, INCX
|
||||
movq OLD_Y, Y
|
||||
# movq OLD_Y, Y
|
||||
movq OLD_INCY, INCY
|
||||
movq OLD_BUFFER, BUFFER
|
||||
|
||||
@@ -4274,6 +4318,11 @@
|
||||
ALIGN_3
|
||||
|
||||
.L999:
|
||||
movq M, I
|
||||
salq $ZBASE_SHIFT,I
|
||||
addq I,AA
|
||||
jmp .L0t
|
||||
.L999x:
|
||||
movq 0(%rsp), %rbx
|
||||
movq 8(%rsp), %rbp
|
||||
movq 16(%rsp), %r12
|
||||
|
||||
@@ -47,13 +47,19 @@
|
||||
|
||||
#ifndef WINDOWS_ABI
|
||||
|
||||
#define STACKSIZE 64
|
||||
#define STACKSIZE 128
|
||||
|
||||
#define OLD_INCX 8 + STACKSIZE(%rsp)
|
||||
#define OLD_Y 16 + STACKSIZE(%rsp)
|
||||
#define OLD_INCY 24 + STACKSIZE(%rsp)
|
||||
#define OLD_BUFFER 32 + STACKSIZE(%rsp)
|
||||
#define ALPHA 48 (%rsp)
|
||||
#define MMM 64(%rsp)
|
||||
#define NN 72(%rsp)
|
||||
#define AA 80(%rsp)
|
||||
#define LDAX 88(%rsp)
|
||||
#define ALPHAR 96(%rsp)
|
||||
#define ALPHAI 104(%rsp)
|
||||
|
||||
#define M %rdi
|
||||
#define N %rsi
|
||||
@@ -66,7 +72,7 @@
|
||||
|
||||
#else
|
||||
|
||||
#define STACKSIZE 256
|
||||
#define STACKSIZE 288
|
||||
|
||||
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
|
||||
#define OLD_A 48 + STACKSIZE(%rsp)
|
||||
@@ -78,6 +84,13 @@
|
||||
#define OLD_BUFFER 96 + STACKSIZE(%rsp)
|
||||
#define ALPHA 224 (%rsp)
|
||||
|
||||
#define MMM 232(%rsp)
|
||||
#define NN 240(%rsp)
|
||||
#define AA 248(%rsp)
|
||||
#define LDAX 256(%rsp)
|
||||
#define ALPHAR 264(%rsp)
|
||||
#define ALPHAI 272(%rsp)
|
||||
|
||||
#define M %rcx
|
||||
#define N %rdx
|
||||
#define A %r8
|
||||
@@ -144,6 +157,32 @@
|
||||
movss OLD_ALPHA_I, %xmm1
|
||||
#endif
|
||||
|
||||
movq A, AA
|
||||
movq N, NN
|
||||
movq M, MMM
|
||||
movq LDA, LDAX
|
||||
movss %xmm0,ALPHAR
|
||||
movss %xmm1,ALPHAI
|
||||
|
||||
.L0t:
|
||||
xorq I,I
|
||||
addq $1,I
|
||||
salq $20,I
|
||||
subq I,MMM
|
||||
movq I,M
|
||||
movss ALPHAR,%xmm0
|
||||
movss ALPHAI,%xmm1
|
||||
jge .L00t
|
||||
|
||||
movq MMM,M
|
||||
addq I,M
|
||||
jle .L999x
|
||||
|
||||
.L00t:
|
||||
movq AA, A
|
||||
movq NN, N
|
||||
movq LDAX, LDA
|
||||
|
||||
movq OLD_INCX, INCX
|
||||
movq OLD_Y, Y
|
||||
movq OLD_INCY, INCY
|
||||
@@ -4350,6 +4389,11 @@
|
||||
ALIGN_3
|
||||
|
||||
.L999:
|
||||
movq M, I
|
||||
salq $ZBASE_SHIFT,I
|
||||
addq I,AA
|
||||
jmp .L0t
|
||||
.L999x:
|
||||
movq 0(%rsp), %rbx
|
||||
movq 8(%rsp), %rbp
|
||||
movq 16(%rsp), %r12
|
||||
|
||||
408
kernel/x86_64/daxpy_bulldozer.S
Normal file
408
kernel/x86_64/daxpy_bulldozer.S
Normal file
@@ -0,0 +1,408 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
#include "common.h"

/*
 * AXPY kernel: for each of M elements, y := alpha * x + y.
 *
 * Two code paths:
 *   - unit stride in both x and y (.L10 .. .L19): 16 elements per iteration,
 *     software pipelined, followed by 8/4/2/1-element tails;
 *   - general stride (.L40 .. .L47): 8 elements per iteration plus a scalar
 *     remainder loop; a zero INCX or INCY goes straight to the scalar loop.
 *
 * The multiply-add uses vfmaddpd/vfmaddsd in the four-operand form
 * "addend, ALPHA, x, dest", i.e. dest = x * ALPHA + addend.
 * The routine returns 0 in %rax.
 */

/* Argument registers; ARGn come from common.h. */
#ifndef WINDOWS_ABI
#define M	ARG1
#define X	ARG4
#define INCX	ARG5
#define Y	ARG6
#define INCY	ARG2
#else
#define M	ARG1
#define X	ARG2
#define INCX	ARG3
#define Y	ARG4
#define INCY	%r10
#endif

#define	YY	%r11		/* read cursor for y on the strided path */
#define ALPHA	%xmm15		/* alpha, duplicated into both lanes below */

#define A_PRE	640		/* prefetch displacement, in bytes */

#include "l1param.h"

	PROLOGUE
	PROFCODE

	/* Fetch stack-passed arguments while %rsp is still the entry value. */
#ifndef WINDOWS_ABI
#ifndef XDOUBLE
	movq	 8(%rsp), INCY
#else
	movq	24(%rsp), INCY
#endif
	vmovups	%xmm0,  ALPHA
#else
	movq	40(%rsp), X
	movq	48(%rsp), INCX
	movq	56(%rsp), Y
	movq	64(%rsp), INCY
#endif

	SAVEREGISTERS

#ifdef WINDOWS_ABI
	/*
	 * ALPHA lives in %xmm15, which is callee-saved on Win64.  It must be
	 * written only after SAVEREGISTERS, otherwise the caller's %xmm15 is
	 * overwritten before it can be preserved.  This is the same ordering
	 * the generic axpy SSE kernels use.
	 * NOTE(review): assumes SAVEREGISTERS (common.h) leaves %xmm3 intact
	 * -- confirm against its definition.
	 */
	vmovups	%xmm3,  ALPHA
#endif

	unpcklpd ALPHA, ALPHA		/* broadcast the low double to both lanes */

	/* Convert element strides to byte strides. */
	leaq	(, INCX, SIZE), INCX
	leaq	(, INCY, SIZE), INCY

	testq	M, M
	jle	.L47			/* nothing to do for M <= 0 */

	/* Anything other than unit stride in both vectors takes the strided path. */
	cmpq	$SIZE, INCX
	jne	.L40
	cmpq	$SIZE, INCY
	jne	.L40

	/*
	 * If the SIZE bit of the Y address is set, peel one element first
	 * (presumably to bring Y to a 2*SIZE boundary -- TODO confirm).
	 */
	testq	$SIZE, Y
	je	.L10

	movsd	(X), %xmm0
	mulsd	ALPHA, %xmm0
	addsd	(Y), %xmm0
	movsd	%xmm0, (Y)
	addq	$1 * SIZE, X
	addq	$1 * SIZE, Y
	decq	M
	jle	.L19
	ALIGN_4

.L10:
	/* Bias both pointers by +16 elements; the loop uses negative offsets. */
	subq	$-16 * SIZE, X
	subq	$-16 * SIZE, Y

	movq	M,  %rax
	sarq	$4, %rax		/* number of 16-element blocks */
	jle	.L13

	/* Preload the first 8 x values of the first block. */
	vmovups	-16 * SIZE(X), %xmm0
	vmovups	-14 * SIZE(X), %xmm1
	vmovups	-12 * SIZE(X), %xmm2
	vmovups	-10 * SIZE(X), %xmm3

	decq	%rax
	jle	.L12			/* single block: only the drain code runs */
	ALIGN_3

.L11:
	/* Steady state: finish elements 0..7, then 8..15, and preload the
	   first 8 x values of the next block into xmm0..xmm3. */

	prefetchnta	A_PRE(Y)

	vmovups	 -8 * SIZE(X), %xmm4
	vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0
	vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1
	vmovups	 -6 * SIZE(X), %xmm5
	vmovups	 -4 * SIZE(X), %xmm6
	vfmaddpd -12 * SIZE(Y), ALPHA, %xmm2 , %xmm2
	vfmaddpd -10 * SIZE(Y), ALPHA, %xmm3 , %xmm3
	vmovups	 -2 * SIZE(X), %xmm7


	vmovups	%xmm0, -16 * SIZE(Y)
	vmovups	%xmm1, -14 * SIZE(Y)
	prefetchnta	A_PRE(X)
	nop
	vmovups	%xmm2, -12 * SIZE(Y)
	vmovups	%xmm3, -10 * SIZE(Y)

	prefetchnta	A_PRE+64(Y)

	vmovups	  0 * SIZE(X), %xmm0
	vfmaddpd  -8 * SIZE(Y), ALPHA, %xmm4 , %xmm4
	vfmaddpd  -6 * SIZE(Y), ALPHA, %xmm5 , %xmm5
	vmovups	  2 * SIZE(X), %xmm1
	vmovups	  4 * SIZE(X), %xmm2
	vfmaddpd  -4 * SIZE(Y), ALPHA, %xmm6 , %xmm6
	vfmaddpd  -2 * SIZE(Y), ALPHA, %xmm7 , %xmm7
	vmovups	  6 * SIZE(X), %xmm3


	vmovups	%xmm4,  -8 * SIZE(Y)
	vmovups	%xmm5,  -6 * SIZE(Y)
	prefetchnta	A_PRE+64(X)
	nop
	vmovups	%xmm6,  -4 * SIZE(Y)
	vmovups	%xmm7,  -2 * SIZE(Y)

	subq	$-16 * SIZE, Y
	subq	$-16 * SIZE, X
	decq	%rax
	jg	.L11
	ALIGN_3

.L12:
	/* Drain: last 16-element block, with no preload of a following block. */

	vmovups	 -8 * SIZE(X), %xmm4
	vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0
	vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1
	vmovups	 -6 * SIZE(X), %xmm5
	vmovups	 -4 * SIZE(X), %xmm6
	vfmaddpd -12 * SIZE(Y), ALPHA, %xmm2 , %xmm2
	vfmaddpd -10 * SIZE(Y), ALPHA, %xmm3 , %xmm3
	vmovups	 -2 * SIZE(X), %xmm7


	vmovups	%xmm0, -16 * SIZE(Y)
	vmovups	%xmm1, -14 * SIZE(Y)
	vmovups	%xmm2, -12 * SIZE(Y)
	vmovups	%xmm3, -10 * SIZE(Y)

	vfmaddpd  -8 * SIZE(Y), ALPHA, %xmm4 , %xmm4
	vfmaddpd  -6 * SIZE(Y), ALPHA, %xmm5 , %xmm5
	vfmaddpd  -4 * SIZE(Y), ALPHA, %xmm6 , %xmm6
	vfmaddpd  -2 * SIZE(Y), ALPHA, %xmm7 , %xmm7

	vmovups	%xmm4,  -8 * SIZE(Y)
	vmovups	%xmm5,  -6 * SIZE(Y)
	vmovups	%xmm6,  -4 * SIZE(Y)
	vmovups	%xmm7,  -2 * SIZE(Y)

	subq	$-16 * SIZE, Y
	subq	$-16 * SIZE, X
	ALIGN_3

.L13:
	/* Tail: 8 elements if bit 3 of M is set. */


	movq	M,  %rax
	andq	$8, %rax
	jle	.L14
	ALIGN_3

	vmovups	-16 * SIZE(X), %xmm0
	vmovups	-14 * SIZE(X), %xmm1
	vmovups	-12 * SIZE(X), %xmm2
	vmovups	-10 * SIZE(X), %xmm3

	vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0
	vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1
	vfmaddpd -12 * SIZE(Y), ALPHA, %xmm2 , %xmm2
	vfmaddpd -10 * SIZE(Y), ALPHA, %xmm3 , %xmm3

	vmovups	%xmm0, -16 * SIZE(Y)
	vmovups	%xmm1, -14 * SIZE(Y)
	vmovups	%xmm2, -12 * SIZE(Y)
	vmovups	%xmm3, -10 * SIZE(Y)

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L14:
	/* Tail: 4 elements if bit 2 of M is set. */
	movq	M,  %rax
	andq	$4, %rax
	jle	.L15
	ALIGN_3

	vmovups	-16 * SIZE(X), %xmm0
	vmovups	-14 * SIZE(X), %xmm1

	vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0
	vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1

	vmovups	%xmm0, -16 * SIZE(Y)
	vmovups	%xmm1, -14 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L15:
	/* Tail: 2 elements if bit 1 of M is set. */
	movq	M,  %rax
	andq	$2, %rax
	jle	.L16
	ALIGN_3

	vmovups	-16 * SIZE(X), %xmm0
	vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0
	vmovups	%xmm0, -16 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L16:
	/* Tail: final single element if bit 0 of M is set. */
	movq	M,  %rax
	andq	$1, %rax
	jle	.L19
	ALIGN_3

	vmovsd	-16 * SIZE(X), %xmm0
	vfmaddsd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0

	vmovsd	%xmm0, 	-16 * SIZE(Y)
	ALIGN_3

.L19:
	/* Unit-stride exit: return 0. */
	xorq	%rax,%rax

	RESTOREREGISTERS

	ret
	ALIGN_3


.L40:
	/* Strided path.  YY walks y for loads, Y walks y for stores. */
	movq	Y, YY
	movq	M,  %rax
	/* If incx == 0 or incy == 0, skip the unrolled loop and process all
	   M elements one at a time in .L46 (%rax still holds M). */
	cmpq	$0, INCX
	je	.L46
	cmpq	$0, INCY
	je	.L46

	sarq	$3, %rax		/* number of 8-element blocks */
	jle	.L45

	prefetchnta	512(X)
	prefetchnta	512+64(X)
	prefetchnta	512+128(X)
	prefetchnta	512+192(X)

	prefetchnta	512(Y)
	prefetchnta	512+64(Y)
	prefetchnta	512+128(Y)
	prefetchnta	512+192(Y)
	ALIGN_3

.L41:
	/* Gather pairs of x into xmm0..xmm3 and pairs of y into xmm6..xmm9,
	   interleaving the multiply-adds and the scattered stores. */

	vmovsd	0 * SIZE(X), %xmm0
	addq	INCX, X
	vmovhpd	0 * SIZE(X), %xmm0 , %xmm0
	addq	INCX, X

	vmovsd	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	vmovhpd	0 * SIZE(YY), %xmm6 , %xmm6
	addq	INCY, YY


	vmovsd	0 * SIZE(X), %xmm1
	addq	INCX, X
	vmovhpd	0 * SIZE(X), %xmm1 , %xmm1
	addq	INCX, X

	vmovsd	0 * SIZE(YY), %xmm7
	addq	INCY, YY
	vmovhpd	0 * SIZE(YY), %xmm7 , %xmm7
	addq	INCY, YY

	vfmaddpd %xmm6 , ALPHA , %xmm0 , %xmm0

	vmovsd	0 * SIZE(X), %xmm2
	addq	INCX, X
	vmovhpd	0 * SIZE(X), %xmm2 , %xmm2
	addq	INCX, X

	vmovsd	0 * SIZE(YY), %xmm8
	addq	INCY, YY
	vmovhpd	0 * SIZE(YY), %xmm8 , %xmm8
	addq	INCY, YY

	vfmaddpd %xmm7 , ALPHA , %xmm1 , %xmm1

	vmovsd	0 * SIZE(X), %xmm3
	addq	INCX, X
	vmovhpd	0 * SIZE(X), %xmm3 , %xmm3
	addq	INCX, X

	vfmaddpd %xmm8 , ALPHA , %xmm2 , %xmm2

	vmovsd	0 * SIZE(YY), %xmm9
	addq	INCY, YY
	vmovhpd	0 * SIZE(YY), %xmm9 , %xmm9
	addq	INCY, YY


	vmovsd	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y
	vmovhpd	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y
	vmovsd	%xmm1, 0 * SIZE(Y)
	addq	INCY, Y
	vmovhpd	%xmm1, 0 * SIZE(Y)
	addq	INCY, Y
	vmovsd	%xmm2, 0 * SIZE(Y)
	addq	INCY, Y
	vmovhpd	%xmm2, 0 * SIZE(Y)
	addq	INCY, Y

	vfmaddpd %xmm9 , ALPHA , %xmm3 , %xmm3

	vmovsd	%xmm3, 0 * SIZE(Y)
	addq	INCY, Y
	vmovhpd	%xmm3, 0 * SIZE(Y)
	addq	INCY, Y

	decq	%rax
	jg	.L41
	ALIGN_3

.L45:
	/* Remaining M mod 8 elements. */
	movq	M,  %rax
	andq	$7, %rax
	jle	.L47
	ALIGN_3

.L46:
	/* Scalar loop: one element per iteration, %rax iterations. */
	vmovsd	(X), %xmm0
	addq	INCX, X

	vfmaddsd (Y) , ALPHA , %xmm0 , %xmm0

	vmovsd	%xmm0, (Y)
	addq	INCY, Y

	decq	%rax
	jg	.L46
	ALIGN_3

.L47:
	/* Strided / empty-input exit: return 0. */
	xorq	%rax, %rax

	RESTOREREGISTERS

	ret

	EPILOGUE
|
||||
291
kernel/x86_64/dcopy_bulldozer.S
Normal file
291
kernel/x86_64/dcopy_bulldozer.S
Normal file
@@ -0,0 +1,291 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
/* Argument names for the copy kernel. ARG1..ARG5 are not defined in this   */
/* file (presumably common.h provides them); the trailing comments give the */
/* register each one maps to in the non-Windows build.                      */
/* M counts elements, X/Y are the source/destination pointers and           */
/* INCX/INCY their strides.                                                  */
#define M ARG1 /* rdi */
|
||||
#define X ARG2 /* rsi */
|
||||
#define INCX ARG3 /* rdx */
|
||||
#define Y ARG4 /* rcx */
|
||||
/* INCY is a register argument off Windows; under WINDOWS_ABI it is kept in */
/* %r10 and loaded from the stack right after PROLOGUE.                      */
#ifndef WINDOWS_ABI
|
||||
#define INCY ARG5 /* r8 */
|
||||
#else
|
||||
#define INCY %r10
|
||||
#endif
|
||||
|
||||
#include "l1param.h"
|
||||
|
||||
/* VLOAD: vmovups load of OFFSET(ADDR) into REG (no alignment requirement). */
#define VLOAD(OFFSET, ADDR, REG) vmovups OFFSET(ADDR), REG
|
||||
/* VSHUFPD_1: vshufpd with immediate 0x01 on REG1/REG2, result left in REG2. */
/* NOTE(review): not referenced by the copy code that follows in this file.  */
#define VSHUFPD_1(REG1 , REG2) vshufpd $0x01, REG1, REG2, REG2
|
||||
/* Prefetch distances in bytes, applied to X (A_PRE) and Y (B_PRE) in .L11. */
#define A_PRE 640
|
||||
#define B_PRE 640
|
||||
|
||||
/* Kernel entry: copy M elements from X (stride INCX) to Y (stride INCY).   */
/* PROLOGUE, PROFCODE, SAVEREGISTERS, ALIGN_n and SIZE come from outside    */
/* this file.                                                               */
PROLOGUE
|
||||
PROFCODE
|
||||
|
||||
#ifdef WINDOWS_ABI
|
||||
/* Fifth argument is read from the stack into INCY (%r10). */
movq 40(%rsp), INCY
|
||||
#endif
|
||||
|
||||
SAVEREGISTERS
|
||||
|
||||
/* Scale both strides by SIZE (presumably element counts -> byte offsets). */
leaq (, INCX, SIZE), INCX
|
||||
leaq (, INCY, SIZE), INCY
|
||||
|
||||
/* Unless both scaled strides equal SIZE (unit stride), use the scalar path at .L40. */
cmpq $SIZE, INCX
|
||||
jne .L40
|
||||
cmpq $SIZE, INCY
|
||||
jne .L40
|
||||
|
||||
/* If the SIZE bit of the X address is clear, skip the one-element peel. */
testq $SIZE, X
|
||||
je .L10
|
||||
|
||||
/* Peel: copy a single element so that X advances past the odd SIZE offset. */
/* NOTE(review): M is not tested for <= 0 before this copy; presumably the  */
/* caller guarantees M > 0 -- confirm.                                      */
vmovsd (X), %xmm0
|
||||
vmovsd %xmm0, (Y)
|
||||
addq $1 * SIZE, X
|
||||
addq $1 * SIZE, Y
|
||||
/* Done if that was the only element. */
decq M
|
||||
jle .L19
|
||||
ALIGN_4
|
||||
|
||||
/* Unit-stride path: bias pointers, then preload the first 16 elements. */
.L10:
|
||||
/* Advance X and Y by 16 elements; all later accesses use offsets relative */
/* to this biased base (-16*SIZE is the current element).                  */
subq $-16 * SIZE, X
|
||||
subq $-16 * SIZE, Y
|
||||
|
||||
|
||||
/* %rax = M / 16 = number of full 16-element chunks; none -> tails at .L13. */
movq M, %rax
|
||||
sarq $4, %rax
|
||||
jle .L13
|
||||
|
||||
/* Preload the first chunk (8 registers x 2 elements) into xmm0..xmm7. */
vmovups -16 * SIZE(X), %xmm0
|
||||
vmovups -14 * SIZE(X), %xmm1
|
||||
vmovups -12 * SIZE(X), %xmm2
|
||||
vmovups -10 * SIZE(X), %xmm3
|
||||
vmovups -8 * SIZE(X), %xmm4
|
||||
vmovups -6 * SIZE(X), %xmm5
|
||||
vmovups -4 * SIZE(X), %xmm6
|
||||
vmovups -2 * SIZE(X), %xmm7
|
||||
|
||||
/* With exactly one chunk there is nothing to pipeline: just drain at .L12. */
decq %rax
|
||||
jle .L12
|
||||
ALIGN_4
|
||||
|
||||
/* Main loop: store the chunk already held in xmm0..xmm7 while loading the  */
/* next 16 elements (offsets 0..14*SIZE from the biased X) into the same    */
/* registers. One iteration per 16-element chunk.                           */
.L11:
|
||||
|
||||
prefetchnta A_PRE(X)
|
||||
nop
|
||||
vmovups %xmm0, -16 * SIZE(Y)
|
||||
vmovups %xmm1, -14 * SIZE(Y)
|
||||
prefetchnta B_PRE(Y)
|
||||
nop
|
||||
vmovups %xmm2, -12 * SIZE(Y)
|
||||
vmovups %xmm3, -10 * SIZE(Y)
|
||||
|
||||
/* xmm0..xmm3 are free again: refill them from the next chunk. */
VLOAD( 0 * SIZE, X, %xmm0)
|
||||
VLOAD( 2 * SIZE, X, %xmm1)
|
||||
VLOAD( 4 * SIZE, X, %xmm2)
|
||||
VLOAD( 6 * SIZE, X, %xmm3)
|
||||
|
||||
prefetchnta A_PRE+64(X)
|
||||
nop
|
||||
vmovups %xmm4, -8 * SIZE(Y)
|
||||
vmovups %xmm5, -6 * SIZE(Y)
|
||||
prefetchnta B_PRE+64(Y)
|
||||
nop
|
||||
vmovups %xmm6, -4 * SIZE(Y)
|
||||
vmovups %xmm7, -2 * SIZE(Y)
|
||||
|
||||
/* Refill xmm4..xmm7; Y is advanced in between, X only after the last load. */
VLOAD( 8 * SIZE, X, %xmm4)
|
||||
VLOAD(10 * SIZE, X, %xmm5)
|
||||
subq $-16 * SIZE, Y
|
||||
VLOAD(12 * SIZE, X, %xmm6)
|
||||
VLOAD(14 * SIZE, X, %xmm7)
|
||||
|
||||
subq $-16 * SIZE, X
|
||||
decq %rax
|
||||
jg .L11
|
||||
ALIGN_3
|
||||
|
||||
/* Drain: store the last preloaded chunk held in xmm0..xmm7, then step both */
/* pointers past it.                                                        */
.L12:
|
||||
vmovups %xmm0, -16 * SIZE(Y)
|
||||
vmovups %xmm1, -14 * SIZE(Y)
|
||||
vmovups %xmm2, -12 * SIZE(Y)
|
||||
vmovups %xmm3, -10 * SIZE(Y)
|
||||
vmovups %xmm4, -8 * SIZE(Y)
|
||||
vmovups %xmm5, -6 * SIZE(Y)
|
||||
vmovups %xmm6, -4 * SIZE(Y)
|
||||
vmovups %xmm7, -2 * SIZE(Y)
|
||||
|
||||
subq $-16 * SIZE, Y
|
||||
subq $-16 * SIZE, X
|
||||
ALIGN_3
|
||||
|
||||
/* Remainder handling for the unit-stride path: bits 3..0 of M select       */
/* optional copies of 8, 4, 2 and 1 elements. testq leaves OF clear and the */
/* masked result is never negative, so each jle below is taken exactly when */
/* the tested bit is clear.                                                 */
.L13:
|
||||
testq $8, M
|
||||
jle .L14
|
||||
ALIGN_3
|
||||
|
||||
/* 8 elements */
vmovups -16 * SIZE(X), %xmm0
|
||||
vmovups -14 * SIZE(X), %xmm1
|
||||
vmovups -12 * SIZE(X), %xmm2
|
||||
vmovups -10 * SIZE(X), %xmm3
|
||||
|
||||
vmovups %xmm0, -16 * SIZE(Y)
|
||||
vmovups %xmm1, -14 * SIZE(Y)
|
||||
vmovups %xmm2, -12 * SIZE(Y)
|
||||
vmovups %xmm3, -10 * SIZE(Y)
|
||||
|
||||
addq $8 * SIZE, X
|
||||
addq $8 * SIZE, Y
|
||||
ALIGN_3
|
||||
|
||||
.L14:
|
||||
testq $4, M
|
||||
jle .L15
|
||||
ALIGN_3
|
||||
|
||||
/* 4 elements */
vmovups -16 * SIZE(X), %xmm0
|
||||
vmovups -14 * SIZE(X), %xmm1
|
||||
|
||||
vmovups %xmm0, -16 * SIZE(Y)
|
||||
vmovups %xmm1, -14 * SIZE(Y)
|
||||
|
||||
addq $4 * SIZE, X
|
||||
addq $4 * SIZE, Y
|
||||
ALIGN_3
|
||||
|
||||
.L15:
|
||||
testq $2, M
|
||||
jle .L16
|
||||
ALIGN_3
|
||||
|
||||
/* 2 elements */
vmovups -16 * SIZE(X), %xmm0
|
||||
vmovups %xmm0, -16 * SIZE(Y)
|
||||
|
||||
addq $2 * SIZE, X
|
||||
addq $2 * SIZE, Y
|
||||
ALIGN_3
|
||||
|
||||
.L16:
|
||||
testq $1, M
|
||||
jle .L19
|
||||
ALIGN_3
|
||||
|
||||
/* Last single element; pointers are not advanced because nothing follows. */
vmovsd -16 * SIZE(X), %xmm0
|
||||
vmovsd %xmm0, -16 * SIZE(Y)
|
||||
ALIGN_3
|
||||
|
||||
/* Exit of the unit-stride path: zero %rax, undo SAVEREGISTERS and return. */
.L19:
|
||||
xorq %rax,%rax
|
||||
|
||||
RESTOREREGISTERS
|
||||
|
||||
ret
|
||||
ALIGN_3
|
||||
|
||||
|
||||
|
||||
/* General-stride path: scalar copies, unrolled eight elements per pass. */
.L40:
|
||||
/* %rax = M / 8 passes of the unrolled loop; none -> leftovers at .L45. */
movq M, %rax
|
||||
sarq $3, %rax
|
||||
jle .L45
|
||||
ALIGN_3
|
||||
|
||||
.L41:
|
||||
/* Gather eight elements, stepping X by INCX (already scaled by SIZE) after each load. */
vmovsd (X), %xmm0
|
||||
addq INCX, X
|
||||
vmovsd (X), %xmm4
|
||||
addq INCX, X
|
||||
vmovsd (X), %xmm1
|
||||
addq INCX, X
|
||||
vmovsd (X), %xmm5
|
||||
addq INCX, X
|
||||
vmovsd (X), %xmm2
|
||||
addq INCX, X
|
||||
vmovsd (X), %xmm6
|
||||
addq INCX, X
|
||||
vmovsd (X), %xmm3
|
||||
addq INCX, X
|
||||
vmovsd (X), %xmm7
|
||||
addq INCX, X
|
||||
|
||||
/* Scatter them in the same register order, stepping Y by INCY after each store. */
vmovsd %xmm0, (Y)
|
||||
addq INCY, Y
|
||||
vmovsd %xmm4, (Y)
|
||||
addq INCY, Y
|
||||
vmovsd %xmm1, (Y)
|
||||
addq INCY, Y
|
||||
vmovsd %xmm5, (Y)
|
||||
addq INCY, Y
|
||||
vmovsd %xmm2, (Y)
|
||||
addq INCY, Y
|
||||
vmovsd %xmm6, (Y)
|
||||
addq INCY, Y
|
||||
vmovsd %xmm3, (Y)
|
||||
addq INCY, Y
|
||||
vmovsd %xmm7, (Y)
|
||||
addq INCY, Y
|
||||
|
||||
decq %rax
|
||||
jg .L41
|
||||
ALIGN_3
|
||||
|
||||
/* General-stride leftovers: copy the remaining M & 7 elements one by one. */
/* NOTE(review): the mask is applied to M as passed in; a negative M would */
/* still yield a non-zero count here -- presumably callers never pass one. */
.L45:
|
||||
movq M, %rax
|
||||
andq $7, %rax
|
||||
jle .L47
|
||||
ALIGN_3
|
||||
|
||||
.L46:
|
||||
vmovsd (X), %xmm0
|
||||
addq INCX, X
|
||||
vmovsd %xmm0, (Y)
|
||||
addq INCY, Y
|
||||
decq %rax
|
||||
jg .L46
|
||||
ALIGN_3
|
||||
|
||||
/* Exit of the general-stride path: zero %rax, restore registers, return. */
.L47:
|
||||
xorq %rax, %rax
|
||||
|
||||
RESTOREREGISTERS
|
||||
|
||||
ret
|
||||
|
||||
EPILOGUE
|
||||
311
kernel/x86_64/ddot_bulldozer.S
Normal file
311
kernel/x86_64/ddot_bulldozer.S
Normal file
@@ -0,0 +1,311 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
/* Argument names for the dot-product kernel. ARG1..ARG5 are defined outside */
/* this file; the trailing comments give the non-Windows register mapping.   */
/* N counts elements, X/Y are the two input vectors, INCX/INCY their strides. */
#define N ARG1 /* rdi */
|
||||
#define X ARG2 /* rsi */
|
||||
#define INCX ARG3 /* rdx */
|
||||
#define Y ARG4 /* rcx */
|
||||
/* INCY: register argument off Windows, %r10 (loaded from the stack) under WINDOWS_ABI. */
#ifndef WINDOWS_ABI
|
||||
#define INCY ARG5 /* r8 */
|
||||
#else
|
||||
#define INCY %r10
|
||||
#endif
|
||||
|
||||
/* Prefetch distance in bytes, used for both X and Y in the main loop. */
#define A_PRE 512
|
||||
|
||||
#include "l1param.h"
|
||||
|
||||
/* Kernel entry: accumulates the products X[i]*Y[i] over N elements; the    */
/* result is left in the low lane of %xmm0 at .L999.                        */
/* PROLOGUE, PROFCODE, SAVEREGISTERS, ALIGN_n and SIZE come from outside    */
/* this file.                                                               */
PROLOGUE
|
||||
PROFCODE
|
||||
|
||||
#ifdef WINDOWS_ABI
|
||||
/* Fifth argument is read from the stack into INCY (%r10). */
movq 40(%rsp), INCY
|
||||
#endif
|
||||
|
||||
SAVEREGISTERS
|
||||
|
||||
/* Scale both strides by SIZE (presumably element counts -> byte offsets). */
leaq (, INCX, SIZE), INCX
|
||||
leaq (, INCY, SIZE), INCY
|
||||
|
||||
/* Clear the four accumulators xmm0..xmm3. */
vxorps %xmm0, %xmm0 , %xmm0
|
||||
vxorps %xmm1, %xmm1 , %xmm1
|
||||
vxorps %xmm2, %xmm2 , %xmm2
|
||||
vxorps %xmm3, %xmm3 , %xmm3
|
||||
|
||||
/* N <= 0: go straight to the reduction, which then yields zero. */
cmpq $0, N
|
||||
jle .L999
|
||||
|
||||
/* Unless both scaled strides equal SIZE (unit stride), use the scalar path at .L50. */
cmpq $SIZE, INCX
|
||||
jne .L50
|
||||
cmpq $SIZE, INCY
|
||||
jne .L50
|
||||
|
||||
/* Bias both pointers by 16 elements; -16*SIZE now addresses the current element. */
subq $-16 * SIZE, X
|
||||
subq $-16 * SIZE, Y
|
||||
|
||||
/* If the SIZE bit of the Y address is clear, skip the one-element peel. */
testq $SIZE, Y
|
||||
je .L10
|
||||
|
||||
/* Peel: xmm0 (low lane) = X[0] * Y[0]; the vmovsd load clears the high lane. */
vmovsd -16 * SIZE(X), %xmm0
|
||||
vmulsd -16 * SIZE(Y), %xmm0 , %xmm0
|
||||
addq $1 * SIZE, X
|
||||
addq $1 * SIZE, Y
|
||||
decq N
|
||||
ALIGN_2
|
||||
|
||||
/* Unit-stride path: count 16-element chunks and preload the first one from X. */
.L10:
|
||||
|
||||
/* %rax = N / 16; no full chunk -> tails at .L14. */
movq N, %rax
|
||||
sarq $4, %rax
|
||||
jle .L14
|
||||
|
||||
/* Preload 16 elements of X into xmm4..xmm11 (two per register). */
vmovups -16 * SIZE(X), %xmm4
|
||||
vmovups -14 * SIZE(X), %xmm5
|
||||
vmovups -12 * SIZE(X), %xmm6
|
||||
vmovups -10 * SIZE(X), %xmm7
|
||||
|
||||
vmovups -8 * SIZE(X), %xmm8
|
||||
vmovups -6 * SIZE(X), %xmm9
|
||||
vmovups -4 * SIZE(X), %xmm10
|
||||
vmovups -2 * SIZE(X), %xmm11
|
||||
|
||||
/* With exactly one chunk, skip the pipelined loop and finish at .L12. */
decq %rax
|
||||
jle .L12
|
||||
|
||||
ALIGN_3
|
||||
|
||||
/* Main loop, one 16-element chunk per iteration. Each four-operand         */
/* vfmaddpd computes acc = Xreg * mem(Y) + acc on both lanes, rotating over */
/* the accumulators xmm0..xmm3, while the X registers are reloaded from the */
/* next chunk (offsets 0..14*SIZE) as soon as they have been consumed.      */
.L11:
|
||||
prefetchnta A_PRE(Y)
|
||||
|
||||
vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0
|
||||
vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1
|
||||
prefetchnta A_PRE(X)
|
||||
vfmaddpd %xmm2 , -12 * SIZE(Y), %xmm6 , %xmm2
|
||||
vfmaddpd %xmm3 , -10 * SIZE(Y), %xmm7 , %xmm3
|
||||
|
||||
vmovups 0 * SIZE(X), %xmm4
|
||||
vfmaddpd %xmm0 , -8 * SIZE(Y), %xmm8 , %xmm0
|
||||
vfmaddpd %xmm1 , -6 * SIZE(Y), %xmm9 , %xmm1
|
||||
vmovups 2 * SIZE(X), %xmm5
|
||||
vmovups 4 * SIZE(X), %xmm6
|
||||
vfmaddpd %xmm2 , -4 * SIZE(Y), %xmm10, %xmm2
|
||||
vfmaddpd %xmm3 , -2 * SIZE(Y), %xmm11, %xmm3
|
||||
vmovups 6 * SIZE(X), %xmm7
|
||||
|
||||
prefetchnta A_PRE+64(Y)
|
||||
|
||||
vmovups 8 * SIZE(X), %xmm8
|
||||
vmovups 10 * SIZE(X), %xmm9
|
||||
prefetchnta A_PRE+64(X)
|
||||
vmovups 12 * SIZE(X), %xmm10
|
||||
vmovups 14 * SIZE(X), %xmm11
|
||||
|
||||
subq $-16 * SIZE, X
|
||||
subq $-16 * SIZE, Y
|
||||
|
||||
decq %rax
|
||||
jg .L11
|
||||
ALIGN_3
|
||||
|
||||
/* Drain: multiply-accumulate the last preloaded X chunk (xmm4..xmm11)      */
/* against Y, then step both pointers past it.                              */
.L12:
|
||||
|
||||
vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0
|
||||
vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1
|
||||
vfmaddpd %xmm2 , -12 * SIZE(Y), %xmm6 , %xmm2
|
||||
vfmaddpd %xmm3 , -10 * SIZE(Y), %xmm7 , %xmm3
|
||||
|
||||
vfmaddpd %xmm0 , -8 * SIZE(Y), %xmm8 , %xmm0
|
||||
vfmaddpd %xmm1 , -6 * SIZE(Y), %xmm9 , %xmm1
|
||||
vfmaddpd %xmm2 , -4 * SIZE(Y), %xmm10, %xmm2
|
||||
vfmaddpd %xmm3 , -2 * SIZE(Y), %xmm11, %xmm3
|
||||
|
||||
subq $-16 * SIZE, X
|
||||
subq $-16 * SIZE, Y
|
||||
ALIGN_3
|
||||
|
||||
/* Remainder handling for the unit-stride path: bits 3..0 of N select       */
/* optional blocks of 8, 4, 2 and 1 elements. testq leaves OF clear and the */
/* masked result is never negative, so each jle below is taken exactly when */
/* the tested bits are all clear.                                           */
.L14:
|
||||
/* Nothing left at all -> reduction. */
testq $15, N
|
||||
jle .L999
|
||||
|
||||
testq $8, N
|
||||
jle .L15
|
||||
|
||||
/* 8 elements */
vmovups -16 * SIZE(X), %xmm4
|
||||
vmovups -14 * SIZE(X), %xmm5
|
||||
vmovups -12 * SIZE(X), %xmm6
|
||||
vmovups -10 * SIZE(X), %xmm7
|
||||
|
||||
vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0
|
||||
vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1
|
||||
vfmaddpd %xmm2 , -12 * SIZE(Y), %xmm6 , %xmm2
|
||||
vfmaddpd %xmm3 , -10 * SIZE(Y), %xmm7 , %xmm3
|
||||
|
||||
addq $8 * SIZE, X
|
||||
addq $8 * SIZE, Y
|
||||
ALIGN_3
|
||||
|
||||
.L15:
|
||||
testq $4, N
|
||||
jle .L16
|
||||
|
||||
/* 4 elements */
vmovups -16 * SIZE(X), %xmm4
|
||||
vmovups -14 * SIZE(X), %xmm5
|
||||
|
||||
vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0
|
||||
vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1
|
||||
|
||||
addq $4 * SIZE, X
|
||||
addq $4 * SIZE, Y
|
||||
ALIGN_3
|
||||
|
||||
.L16:
|
||||
testq $2, N
|
||||
jle .L17
|
||||
|
||||
/* 2 elements */
vmovups -16 * SIZE(X), %xmm4
|
||||
vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0
|
||||
|
||||
|
||||
addq $2 * SIZE, X
|
||||
addq $2 * SIZE, Y
|
||||
ALIGN_3
|
||||
|
||||
.L17:
|
||||
testq $1, N
|
||||
jle .L999
|
||||
|
||||
/* Last element: vmovsd loads clear the high lanes, so the packed FMA adds */
/* the product to the low lane of xmm0 and leaves its high lane unchanged. */
vmovsd -16 * SIZE(X), %xmm4
|
||||
vmovsd -16 * SIZE(Y), %xmm5
|
||||
vfmaddpd %xmm0, %xmm4 , %xmm5 , %xmm0
|
||||
jmp .L999
|
||||
ALIGN_3
|
||||
|
||||
|
||||
/* General-stride path: scalar loads, eight element pairs per loop pass. */
.L50:
|
||||
/* %rax = N / 8 passes; none -> leftovers at .L55. */
movq N, %rax
|
||||
sarq $3, %rax
|
||||
jle .L55
|
||||
ALIGN_3
|
||||
|
||||
.L53:
|
||||
|
||||
|
||||
/* First four pairs: X values into xmm4..xmm7, Y values into xmm8..xmm11, */
/* stepping each pointer by its (scaled) stride after every load.         */
vmovsd 0 * SIZE(X), %xmm4
|
||||
addq INCX, X
|
||||
vmovsd 0 * SIZE(Y), %xmm8
|
||||
addq INCY, Y
|
||||
vmovsd 0 * SIZE(X), %xmm5
|
||||
addq INCX, X
|
||||
vmovsd 0 * SIZE(Y), %xmm9
|
||||
addq INCY, Y
|
||||
|
||||
vmovsd 0 * SIZE(X), %xmm6
|
||||
addq INCX, X
|
||||
vmovsd 0 * SIZE(Y), %xmm10
|
||||
addq INCY, Y
|
||||
vmovsd 0 * SIZE(X), %xmm7
|
||||
addq INCX, X
|
||||
vmovsd 0 * SIZE(Y), %xmm11
|
||||
addq INCY, Y
|
||||
|
||||
/* acc_k += x_k * y_k; the high lanes of the loaded registers are zero. */
vfmaddpd %xmm0 , %xmm4 , %xmm8 , %xmm0
|
||||
vfmaddpd %xmm1 , %xmm5 , %xmm9 , %xmm1
|
||||
vfmaddpd %xmm2 , %xmm6 , %xmm10, %xmm2
|
||||
vfmaddpd %xmm3 , %xmm7 , %xmm11, %xmm3
|
||||
|
||||
|
||||
/* Second four pairs, same register assignment. */
vmovsd 0 * SIZE(X), %xmm4
|
||||
addq INCX, X
|
||||
vmovsd 0 * SIZE(Y), %xmm8
|
||||
addq INCY, Y
|
||||
vmovsd 0 * SIZE(X), %xmm5
|
||||
addq INCX, X
|
||||
vmovsd 0 * SIZE(Y), %xmm9
|
||||
addq INCY, Y
|
||||
|
||||
vmovsd 0 * SIZE(X), %xmm6
|
||||
addq INCX, X
|
||||
vmovsd 0 * SIZE(Y), %xmm10
|
||||
addq INCY, Y
|
||||
vmovsd 0 * SIZE(X), %xmm7
|
||||
addq INCX, X
|
||||
vmovsd 0 * SIZE(Y), %xmm11
|
||||
addq INCY, Y
|
||||
|
||||
vfmaddpd %xmm0 , %xmm4 , %xmm8 , %xmm0
|
||||
vfmaddpd %xmm1 , %xmm5 , %xmm9 , %xmm1
|
||||
vfmaddpd %xmm2 , %xmm6 , %xmm10, %xmm2
|
||||
vfmaddpd %xmm3 , %xmm7 , %xmm11, %xmm3
|
||||
|
||||
decq %rax
|
||||
jg .L53
|
||||
ALIGN_3
|
||||
|
||||
/* General-stride leftovers: accumulate the remaining N & 7 pairs into xmm0. */
.L55:
|
||||
movq N, %rax
|
||||
andq $7, %rax
|
||||
jle .L999
|
||||
ALIGN_3
|
||||
|
||||
.L56:
|
||||
vmovsd 0 * SIZE(X), %xmm4
|
||||
addq INCX, X
|
||||
vmovsd 0 * SIZE(Y), %xmm8
|
||||
addq INCY, Y
|
||||
|
||||
vfmaddpd %xmm0 , %xmm4 , %xmm8 , %xmm0
|
||||
|
||||
decq %rax
|
||||
jg .L56
|
||||
ALIGN_3
|
||||
|
||||
/* Reduction and exit: fold the four accumulators into xmm0, then add its   */
/* two lanes so the low lane of %xmm0 holds the final sum.                  */
.L999:
|
||||
vaddpd %xmm1, %xmm0 , %xmm0
|
||||
vaddpd %xmm3, %xmm2 , %xmm2
|
||||
vaddpd %xmm2, %xmm0 , %xmm0
|
||||
|
||||
/* Horizontal add: both lanes of xmm0 become lane0 + lane1. */
vhaddpd %xmm0, %xmm0 , %xmm0
|
||||
|
||||
RESTOREREGISTERS
|
||||
|
||||
ret
|
||||
|
||||
EPILOGUE
|
||||
File diff suppressed because it is too large
Load Diff
3880
kernel/x86_64/dgemm_kernel_8x2_bulldozer.S
Normal file
3880
kernel/x86_64/dgemm_kernel_8x2_bulldozer.S
Normal file
File diff suppressed because it is too large
Load Diff
1821
kernel/x86_64/dgemm_ncopy_8_bulldozer.S
Normal file
1821
kernel/x86_64/dgemm_ncopy_8_bulldozer.S
Normal file
File diff suppressed because it is too large
Load Diff
667
kernel/x86_64/dgemm_tcopy_8_bulldozer.S
Normal file
667
kernel/x86_64/dgemm_tcopy_8_bulldozer.S
Normal file
@@ -0,0 +1,667 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
/* VMOVUPS_A1: vmovups load from OFF(ADDR) into REGS. */
#define VMOVUPS_A1(OFF, ADDR, REGS) vmovups OFF(ADDR), REGS
|
||||
/* VMOVUPS_A2: vmovups load from OFF(ADDR, BASE, SCALE), i.e. ADDR + BASE*SCALE + OFF. */
#define VMOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) vmovups OFF(ADDR, BASE, SCALE), REGS
|
||||
|
||||
/* Prefetch distance in bytes ahead of each source row (used in .L13). */
#define A_PRE 256
|
||||
|
||||
/* Register assignment for the copy routine. ARG1..ARG5 are defined outside */
/* this file; the trailing comments name the register each one maps to.     */
/* N is the count that is consumed in groups of 8/4/2/1 rows spaced LDA     */
/* apart; M is the count that is consumed in chunks of 8/4/2/1 consecutive  */
/* elements within a row.                                                   */
#ifndef WINDOWS_ABI
|
||||
|
||||
#define N ARG1 /* rdi */
|
||||
#define M ARG2 /* rsi */
|
||||
#define A ARG3 /* rdx */
|
||||
#define LDA ARG4 /* rcx */
|
||||
#define B ARG5 /* r8 */
|
||||
|
||||
/* Scratch: AO1/AO2 row pointers, LDA3 = 3*LDA, M8 = N*SIZE (set in the prologue). */
#define AO1 %r9
|
||||
#define AO2 %r10
|
||||
#define LDA3 %r11
|
||||
#define M8 %r12
|
||||
|
||||
#else
|
||||
|
||||
#define N ARG1 /* rcx */
|
||||
#define M ARG2 /* rdx */
|
||||
#define A ARG3 /* r8 */
|
||||
#define LDA ARG4 /* r9 */
|
||||
/* Fifth argument on the stack: 40 bytes above the entry %rsp, plus 56 for */
/* the seven 8-byte pushes done in the prologue under WINDOWS_ABI.         */
#define OLD_B 40 + 56(%rsp)
|
||||
|
||||
#define B %r12
|
||||
|
||||
/* %rsi and %rdi are used as scratch here, so the prologue saves them. */
#define AO1 %rsi
|
||||
#define AO2 %rdi
|
||||
#define LDA3 %r10
|
||||
#define M8 %r11
|
||||
#endif
|
||||
|
||||
/* Inner loop counter. */
#define I %rax
|
||||
|
||||
/* Destination cursors: B0 for 8-wide chunks, B1/B2/B3 for the 4-, 2- and */
/* 1-wide remainder regions of the output buffer.                         */
#define B0 %rbp
|
||||
#define B1 %r13
|
||||
#define B2 %r14
|
||||
#define B3 %r15
|
||||
|
||||
/* Entry: save the registers this routine uses as scratch, derive the       */
/* destination cursors and byte strides, then dispatch on N.                */
/* PROLOGUE, PROFCODE, ALIGN_n and SIZE come from outside this file.        */
PROLOGUE
|
||||
PROFCODE
|
||||
|
||||
#ifdef WINDOWS_ABI
|
||||
pushq %rdi
|
||||
pushq %rsi
|
||||
#endif
|
||||
|
||||
pushq %r15
|
||||
pushq %r14
|
||||
pushq %r13
|
||||
pushq %r12
|
||||
pushq %rbp
|
||||
|
||||
#ifdef WINDOWS_ABI
|
||||
/* Fetch the fifth argument from the stack (offset accounts for the pushes above). */
movq OLD_B, B
|
||||
#endif
|
||||
|
||||
/* Bias B by 16 elements; stores below use offsets starting at -16*SIZE. */
/* B1..B3 are derived from the biased B and so carry the same bias.      */
subq $-16 * SIZE, B
|
||||
|
||||
movq M, B1
|
||||
movq M, B2
|
||||
movq M, B3
|
||||
|
||||
/* Round M down to a multiple of 8, 4 and 2 respectively. */
andq $-8, B1
|
||||
andq $-4, B2
|
||||
andq $-2, B3
|
||||
|
||||
/* Times N: number of elements written before each remainder region begins. */
imulq N, B1
|
||||
imulq N, B2
|
||||
imulq N, B3
|
||||
|
||||
/* B1 = B + (M & ~7)*N elements, B2 = B + (M & ~3)*N, B3 = B + (M & ~1)*N. */
leaq (B, B1, SIZE), B1
|
||||
leaq (B, B2, SIZE), B2
|
||||
leaq (B, B3, SIZE), B3
|
||||
|
||||
/* Scale LDA by SIZE and precompute 3*LDA. */
leaq (,LDA, SIZE), LDA
|
||||
leaq (LDA, LDA, 2), LDA3
|
||||
|
||||
/* M8 = N * SIZE, captured before N is decremented by the panel loops. */
leaq (, N, SIZE), M8
|
||||
|
||||
/* Fewer than 8 rows: skip the 8-row panel loop. */
cmpq $8, N
|
||||
jl .L20
|
||||
ALIGN_4
|
||||
|
||||
/* Start of one 8-row panel: set up row pointers and the destination cursor. */
.L11:
|
||||
subq $8, N
|
||||
|
||||
/* AO1 -> rows 0..3, AO2 -> rows 4..7 of this panel; A moves to the next panel. */
movq A, AO1
|
||||
leaq (A, LDA, 4), AO2
|
||||
leaq (A, LDA, 8), A
|
||||
|
||||
/* B0 = output cursor for this panel; the next panel starts 64 elements later. */
movq B, B0
|
||||
addq $64 * SIZE, B
|
||||
|
||||
/* I = M / 8 eight-wide chunks; none -> remainders at .L14. */
movq M, I
|
||||
sarq $3, I
|
||||
jle .L14
|
||||
ALIGN_4
|
||||
|
||||
/* 8-row x 8-element tile: for each of the eight rows (AO1 + k*LDA for      */
/* k = 0..3, then AO2 + k*LDA) load 8 consecutive elements and store them   */
/* back to back, filling 64 contiguous elements at B0 (offsets -16..46).    */
.L13:
|
||||
|
||||
/* row 0 */
prefetchnta A_PRE(AO1)
|
||||
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
|
||||
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
|
||||
VMOVUPS_A1(4 * SIZE, AO1, %xmm2)
|
||||
VMOVUPS_A1(6 * SIZE, AO1, %xmm3)
|
||||
|
||||
vmovups %xmm0, -16 * SIZE(B0)
|
||||
vmovups %xmm1, -14 * SIZE(B0)
|
||||
vmovups %xmm2, -12 * SIZE(B0)
|
||||
vmovups %xmm3, -10 * SIZE(B0)
|
||||
|
||||
|
||||
/* row 1 */
prefetchnta A_PRE(AO1, LDA, 1)
|
||||
VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm0)
|
||||
VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm1)
|
||||
VMOVUPS_A2(4 * SIZE, AO1, LDA, 1, %xmm2)
|
||||
VMOVUPS_A2(6 * SIZE, AO1, LDA, 1, %xmm3)
|
||||
|
||||
vmovups %xmm0, -8 * SIZE(B0)
|
||||
vmovups %xmm1, -6 * SIZE(B0)
|
||||
vmovups %xmm2, -4 * SIZE(B0)
|
||||
vmovups %xmm3, -2 * SIZE(B0)
|
||||
|
||||
|
||||
/* row 2 */
prefetchnta A_PRE(AO1, LDA, 2)
|
||||
VMOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm0)
|
||||
VMOVUPS_A2(2 * SIZE, AO1, LDA, 2, %xmm1)
|
||||
VMOVUPS_A2(4 * SIZE, AO1, LDA, 2, %xmm2)
|
||||
VMOVUPS_A2(6 * SIZE, AO1, LDA, 2, %xmm3)
|
||||
|
||||
|
||||
vmovups %xmm0, 0 * SIZE(B0)
|
||||
vmovups %xmm1, 2 * SIZE(B0)
|
||||
vmovups %xmm2, 4 * SIZE(B0)
|
||||
vmovups %xmm3, 6 * SIZE(B0)
|
||||
|
||||
|
||||
/* row 3 */
prefetchnta A_PRE(AO1, LDA3, 1)
|
||||
VMOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm0)
|
||||
VMOVUPS_A2(2 * SIZE, AO1, LDA3, 1, %xmm1)
|
||||
VMOVUPS_A2(4 * SIZE, AO1, LDA3, 1, %xmm2)
|
||||
VMOVUPS_A2(6 * SIZE, AO1, LDA3, 1, %xmm3)
|
||||
|
||||
vmovups %xmm0, 8 * SIZE(B0)
|
||||
vmovups %xmm1, 10 * SIZE(B0)
|
||||
vmovups %xmm2, 12 * SIZE(B0)
|
||||
vmovups %xmm3, 14 * SIZE(B0)
|
||||
|
||||
/* row 4 */
prefetchnta A_PRE(AO2)
|
||||
VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
|
||||
VMOVUPS_A1(2 * SIZE, AO2, %xmm1)
|
||||
VMOVUPS_A1(4 * SIZE, AO2, %xmm2)
|
||||
VMOVUPS_A1(6 * SIZE, AO2, %xmm3)
|
||||
|
||||
vmovups %xmm0, 16 * SIZE(B0)
|
||||
vmovups %xmm1, 18 * SIZE(B0)
|
||||
vmovups %xmm2, 20 * SIZE(B0)
|
||||
vmovups %xmm3, 22 * SIZE(B0)
|
||||
|
||||
/* row 5 */
prefetchnta A_PRE(AO2, LDA, 1)
|
||||
VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm0)
|
||||
VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm1)
|
||||
VMOVUPS_A2(4 * SIZE, AO2, LDA, 1, %xmm2)
|
||||
VMOVUPS_A2(6 * SIZE, AO2, LDA, 1, %xmm3)
|
||||
|
||||
vmovups %xmm0, 24 * SIZE(B0)
|
||||
vmovups %xmm1, 26 * SIZE(B0)
|
||||
vmovups %xmm2, 28 * SIZE(B0)
|
||||
vmovups %xmm3, 30 * SIZE(B0)
|
||||
|
||||
/* row 6 */
prefetchnta A_PRE(AO2, LDA, 2)
|
||||
VMOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm0)
|
||||
VMOVUPS_A2(2 * SIZE, AO2, LDA, 2, %xmm1)
|
||||
VMOVUPS_A2(4 * SIZE, AO2, LDA, 2, %xmm2)
|
||||
VMOVUPS_A2(6 * SIZE, AO2, LDA, 2, %xmm3)
|
||||
|
||||
vmovups %xmm0, 32 * SIZE(B0)
|
||||
vmovups %xmm1, 34 * SIZE(B0)
|
||||
vmovups %xmm2, 36 * SIZE(B0)
|
||||
vmovups %xmm3, 38 * SIZE(B0)
|
||||
|
||||
/* row 7 */
prefetchnta A_PRE(AO2, LDA3, 1)
|
||||
VMOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm0)
|
||||
VMOVUPS_A2(2 * SIZE, AO2, LDA3, 1, %xmm1)
|
||||
VMOVUPS_A2(4 * SIZE, AO2, LDA3, 1, %xmm2)
|
||||
VMOVUPS_A2(6 * SIZE, AO2, LDA3, 1, %xmm3)
|
||||
|
||||
vmovups %xmm0, 40 * SIZE(B0)
|
||||
vmovups %xmm1, 42 * SIZE(B0)
|
||||
vmovups %xmm2, 44 * SIZE(B0)
|
||||
vmovups %xmm3, 46 * SIZE(B0)
|
||||
|
||||
/* Next 8 elements of every row; B0 advances by 8 * M8 bytes. */
addq $8 * SIZE, AO1
|
||||
addq $8 * SIZE, AO2
|
||||
leaq (B0, M8, 8), B0
|
||||
|
||||
decq I
|
||||
jg .L13
|
||||
ALIGN_4
|
||||
|
||||
/* 8-row panel, 4-wide remainder (taken when bit 2 of M is set): copy 4     */
/* elements from each of the eight rows into 32 contiguous elements at B1.  */
.L14:
|
||||
testq $4, M
|
||||
jle .L16
|
||||
|
||||
/* rows 0 and 1 */
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
|
||||
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
|
||||
VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm2)
|
||||
VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm3)
|
||||
|
||||
vmovups %xmm0, -16 * SIZE(B1)
|
||||
vmovups %xmm1, -14 * SIZE(B1)
|
||||
vmovups %xmm2, -12 * SIZE(B1)
|
||||
vmovups %xmm3, -10 * SIZE(B1)
|
||||
|
||||
/* rows 2 and 3 */
VMOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm0)
|
||||
VMOVUPS_A2(2 * SIZE, AO1, LDA, 2, %xmm1)
|
||||
VMOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm2)
|
||||
VMOVUPS_A2(2 * SIZE, AO1, LDA3, 1, %xmm3)
|
||||
|
||||
vmovups %xmm0, -8 * SIZE(B1)
|
||||
vmovups %xmm1, -6 * SIZE(B1)
|
||||
vmovups %xmm2, -4 * SIZE(B1)
|
||||
vmovups %xmm3, -2 * SIZE(B1)
|
||||
|
||||
/* rows 4 and 5 */
VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
|
||||
VMOVUPS_A1(2 * SIZE, AO2, %xmm1)
|
||||
VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm2)
|
||||
VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm3)
|
||||
|
||||
vmovups %xmm0, 0 * SIZE(B1)
|
||||
vmovups %xmm1, 2 * SIZE(B1)
|
||||
vmovups %xmm2, 4 * SIZE(B1)
|
||||
vmovups %xmm3, 6 * SIZE(B1)
|
||||
|
||||
/* rows 6 and 7 */
VMOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm0)
|
||||
VMOVUPS_A2(2 * SIZE, AO2, LDA, 2, %xmm1)
|
||||
VMOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm2)
|
||||
VMOVUPS_A2(2 * SIZE, AO2, LDA3, 1, %xmm3)
|
||||
|
||||
vmovups %xmm0, 8 * SIZE(B1)
|
||||
vmovups %xmm1, 10 * SIZE(B1)
|
||||
vmovups %xmm2, 12 * SIZE(B1)
|
||||
vmovups %xmm3, 14 * SIZE(B1)
|
||||
|
||||
/* Step past the 4 consumed elements per row; B1 advances by 32 elements. */
addq $4 * SIZE, AO1
|
||||
addq $4 * SIZE, AO2
|
||||
subq $-32 * SIZE, B1
|
||||
ALIGN_4
|
||||
|
||||
/* 8-row panel, 2-wide remainder (bit 1 of M set): copy 2 elements from     */
/* each of the eight rows into 16 contiguous elements at B2.                */
.L16:
|
||||
testq $2, M
|
||||
jle .L18
|
||||
|
||||
/* rows 0..3 */
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
|
||||
VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm1)
|
||||
VMOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm2)
|
||||
VMOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm3)
|
||||
|
||||
vmovups %xmm0, -16 * SIZE(B2)
|
||||
vmovups %xmm1, -14 * SIZE(B2)
|
||||
vmovups %xmm2, -12 * SIZE(B2)
|
||||
vmovups %xmm3, -10 * SIZE(B2)
|
||||
|
||||
/* rows 4..7 */
VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
|
||||
VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm1)
|
||||
VMOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm2)
|
||||
VMOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm3)
|
||||
|
||||
vmovups %xmm0, -8 * SIZE(B2)
|
||||
vmovups %xmm1, -6 * SIZE(B2)
|
||||
vmovups %xmm2, -4 * SIZE(B2)
|
||||
vmovups %xmm3, -2 * SIZE(B2)
|
||||
|
||||
addq $2 * SIZE, AO1
|
||||
addq $2 * SIZE, AO2
|
||||
subq $-16 * SIZE, B2
|
||||
ALIGN_4
|
||||
|
||||
/* 8-row panel, final single element per row (bit 0 of M set): gather one   */
/* element from each of the eight rows, pair them up with vunpcklpd and     */
/* write 8 contiguous elements at B3.                                       */
.L18:
|
||||
testq $1, M
|
||||
jle .L19
|
||||
|
||||
/* rows 0..3 */
vmovsd 0 * SIZE(AO1), %xmm0
|
||||
vmovsd 0 * SIZE(AO1, LDA), %xmm1
|
||||
vmovsd 0 * SIZE(AO1, LDA, 2), %xmm2
|
||||
vmovsd 0 * SIZE(AO1, LDA3), %xmm3
|
||||
|
||||
/* xmm0 = {row0, row1}, xmm2 = {row2, row3} */
vunpcklpd %xmm1, %xmm0 , %xmm0
|
||||
vunpcklpd %xmm3, %xmm2 , %xmm2
|
||||
|
||||
vmovups %xmm0, -16 * SIZE(B3)
|
||||
vmovups %xmm2, -14 * SIZE(B3)
|
||||
|
||||
/* rows 4..7 */
vmovsd 0 * SIZE(AO2), %xmm0
|
||||
vmovsd 0 * SIZE(AO2, LDA), %xmm1
|
||||
vmovsd 0 * SIZE(AO2, LDA, 2), %xmm2
|
||||
vmovsd 0 * SIZE(AO2, LDA3), %xmm3
|
||||
|
||||
vunpcklpd %xmm1, %xmm0 , %xmm0
|
||||
vunpcklpd %xmm3, %xmm2 , %xmm2
|
||||
|
||||
vmovups %xmm0, -12 * SIZE(B3)
|
||||
vmovups %xmm2, -10 * SIZE(B3)
|
||||
|
||||
subq $-8 * SIZE, B3
|
||||
ALIGN_4
|
||||
|
||||
/* Loop back while at least 8 rows remain. */
.L19:
|
||||
cmpq $8, N
|
||||
jge .L11
|
||||
ALIGN_4
|
||||
|
||||
/* 4-row panel (executed at most once, when at least 4 rows remain). */
.L20:
|
||||
cmpq $4, N
|
||||
jl .L30
|
||||
|
||||
subq $4, N
|
||||
|
||||
/* AO1 -> rows 0..1, AO2 -> rows 2..3; A moves past the four rows. */
movq A, AO1
|
||||
leaq (A, LDA, 2), AO2
|
||||
leaq (A, LDA, 4), A
|
||||
|
||||
/* B0 = output cursor; B advances by 4 rows x 8 elements. */
movq B, B0
|
||||
addq $32 * SIZE, B
|
||||
|
||||
/* I = M / 8 eight-wide chunks; none -> remainders at .L24. */
movq M, I
|
||||
sarq $3, I
|
||||
jle .L24
|
||||
ALIGN_4
|
||||
|
||||
/* 4-row x 8-element tile: 8 elements from each of the four rows are stored */
/* back to back, filling 32 contiguous elements at B0 (offsets -16..14).    */
.L23:
|
||||
|
||||
/* row 0 */
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
|
||||
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
|
||||
VMOVUPS_A1(4 * SIZE, AO1, %xmm2)
|
||||
VMOVUPS_A1(6 * SIZE, AO1, %xmm3)
|
||||
|
||||
vmovups %xmm0, -16 * SIZE(B0)
|
||||
vmovups %xmm1, -14 * SIZE(B0)
|
||||
vmovups %xmm2, -12 * SIZE(B0)
|
||||
vmovups %xmm3, -10 * SIZE(B0)
|
||||
|
||||
|
||||
/* row 1 */
VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm0)
|
||||
VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm1)
|
||||
VMOVUPS_A2(4 * SIZE, AO1, LDA, 1, %xmm2)
|
||||
VMOVUPS_A2(6 * SIZE, AO1, LDA, 1, %xmm3)
|
||||
|
||||
vmovups %xmm0, -8 * SIZE(B0)
|
||||
vmovups %xmm1, -6 * SIZE(B0)
|
||||
vmovups %xmm2, -4 * SIZE(B0)
|
||||
vmovups %xmm3, -2 * SIZE(B0)
|
||||
|
||||
/* row 2 */
VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
|
||||
VMOVUPS_A1(2 * SIZE, AO2, %xmm1)
|
||||
VMOVUPS_A1(4 * SIZE, AO2, %xmm2)
|
||||
VMOVUPS_A1(6 * SIZE, AO2, %xmm3)
|
||||
|
||||
vmovups %xmm0, 0 * SIZE(B0)
|
||||
vmovups %xmm1, 2 * SIZE(B0)
|
||||
vmovups %xmm2, 4 * SIZE(B0)
|
||||
vmovups %xmm3, 6 * SIZE(B0)
|
||||
|
||||
/* row 3 */
VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm0)
|
||||
VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm1)
|
||||
VMOVUPS_A2(4 * SIZE, AO2, LDA, 1, %xmm2)
|
||||
VMOVUPS_A2(6 * SIZE, AO2, LDA, 1, %xmm3)
|
||||
|
||||
vmovups %xmm0, 8 * SIZE(B0)
|
||||
vmovups %xmm1, 10 * SIZE(B0)
|
||||
vmovups %xmm2, 12 * SIZE(B0)
|
||||
vmovups %xmm3, 14 * SIZE(B0)
|
||||
|
||||
/* Next 8 elements of every row; B0 advances by 8 * M8 bytes. */
addq $8 * SIZE, AO1
|
||||
addq $8 * SIZE, AO2
|
||||
leaq (B0, M8, 8), B0
|
||||
|
||||
decq I
|
||||
jg .L23
|
||||
ALIGN_4
|
||||
|
||||
/* 4-row panel remainders: bits 2, 1 and 0 of M select 4-, 2- and 1-wide    */
/* copies that go to the B1, B2 and B3 regions respectively.                */
.L24:
|
||||
testq $4, M
|
||||
jle .L26
|
||||
|
||||
/* 4 elements from rows 0 and 1 */
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
|
||||
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
|
||||
VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm2)
|
||||
VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm3)
|
||||
|
||||
vmovups %xmm0, -16 * SIZE(B1)
|
||||
vmovups %xmm1, -14 * SIZE(B1)
|
||||
vmovups %xmm2, -12 * SIZE(B1)
|
||||
vmovups %xmm3, -10 * SIZE(B1)
|
||||
|
||||
/* 4 elements from rows 2 and 3 */
VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
|
||||
VMOVUPS_A1(2 * SIZE, AO2, %xmm1)
|
||||
VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm2)
|
||||
VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm3)
|
||||
|
||||
vmovups %xmm0, -8 * SIZE(B1)
|
||||
vmovups %xmm1, -6 * SIZE(B1)
|
||||
vmovups %xmm2, -4 * SIZE(B1)
|
||||
vmovups %xmm3, -2 * SIZE(B1)
|
||||
|
||||
addq $4 * SIZE, AO1
|
||||
addq $4 * SIZE, AO2
|
||||
subq $-16 * SIZE, B1
|
||||
ALIGN_4
|
||||
|
||||
.L26:
|
||||
testq $2, M
|
||||
jle .L28
|
||||
|
||||
/* 2 elements from each of the four rows */
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
|
||||
VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm1)
|
||||
VMOVUPS_A1(0 * SIZE, AO2, %xmm2)
|
||||
VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm3)
|
||||
|
||||
vmovups %xmm0, -16 * SIZE(B2)
|
||||
vmovups %xmm1, -14 * SIZE(B2)
|
||||
vmovups %xmm2, -12 * SIZE(B2)
|
||||
vmovups %xmm3, -10 * SIZE(B2)
|
||||
|
||||
addq $2 * SIZE, AO1
|
||||
addq $2 * SIZE, AO2
|
||||
subq $-8 * SIZE, B2
|
||||
ALIGN_4
|
||||
|
||||
.L28:
|
||||
testq $1, M
|
||||
jle .L30
|
||||
|
||||
/* 1 element from each of the four rows, paired as {row0,row1} and {row2,row3} */
vmovsd 0 * SIZE(AO1), %xmm0
|
||||
vmovsd 0 * SIZE(AO1, LDA), %xmm1
|
||||
vmovsd 0 * SIZE(AO2), %xmm2
|
||||
vmovsd 0 * SIZE(AO2, LDA), %xmm3
|
||||
|
||||
vunpcklpd %xmm1, %xmm0, %xmm0
|
||||
vunpcklpd %xmm3, %xmm2, %xmm2
|
||||
|
||||
vmovups %xmm0, -16 * SIZE(B3)
|
||||
vmovups %xmm2, -14 * SIZE(B3)
|
||||
subq $-4 * SIZE, B3
|
||||
ALIGN_4
|
||||
|
||||
/* 2-row panel (executed at most once, when at least 2 rows remain). */
.L30:
|
||||
cmpq $2, N
|
||||
jl .L40
|
||||
|
||||
subq $2, N
|
||||
|
||||
/* AO1 -> row 0, AO2 -> row 1; A moves past the two rows. */
movq A, AO1
|
||||
leaq (A, LDA), AO2
|
||||
leaq (A, LDA, 2), A
|
||||
|
||||
/* B0 = output cursor; B advances by 2 rows x 8 elements. */
movq B, B0
|
||||
addq $16 * SIZE, B
|
||||
|
||||
/* I = M / 8 eight-wide chunks; none -> remainders at .L34. */
movq M, I
|
||||
sarq $3, I
|
||||
jle .L34
|
||||
ALIGN_4
|
||||
|
||||
/* 2-row x 8-element tile: 16 contiguous elements at B0. */
.L33:
|
||||
|
||||
/* row 0 */
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
|
||||
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
|
||||
VMOVUPS_A1(4 * SIZE, AO1, %xmm2)
|
||||
VMOVUPS_A1(6 * SIZE, AO1, %xmm3)
|
||||
|
||||
vmovups %xmm0, -16 * SIZE(B0)
|
||||
vmovups %xmm1, -14 * SIZE(B0)
|
||||
vmovups %xmm2, -12 * SIZE(B0)
|
||||
vmovups %xmm3, -10 * SIZE(B0)
|
||||
|
||||
/* row 1 */
VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
|
||||
VMOVUPS_A1(2 * SIZE, AO2, %xmm1)
|
||||
VMOVUPS_A1(4 * SIZE, AO2, %xmm2)
|
||||
VMOVUPS_A1(6 * SIZE, AO2, %xmm3)
|
||||
|
||||
vmovups %xmm0, -8 * SIZE(B0)
|
||||
vmovups %xmm1, -6 * SIZE(B0)
|
||||
vmovups %xmm2, -4 * SIZE(B0)
|
||||
vmovups %xmm3, -2 * SIZE(B0)
|
||||
|
||||
/* Next 8 elements of both rows; B0 advances by 8 * M8 bytes. */
addq $8 * SIZE, AO1
|
||||
addq $8 * SIZE, AO2
|
||||
leaq (B0, M8, 8), B0
|
||||
|
||||
decq I
|
||||
jg .L33
|
||||
ALIGN_4
|
||||
|
||||
/* 2-row panel remainders: bits 2, 1 and 0 of M select 4-, 2- and 1-wide    */
/* copies that go to the B1, B2 and B3 regions respectively.                */
.L34:
|
||||
testq $4, M
|
||||
jle .L36
|
||||
|
||||
/* 4 elements from each of the two rows */
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
|
||||
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
|
||||
VMOVUPS_A1(0 * SIZE, AO2, %xmm2)
|
||||
VMOVUPS_A1(2 * SIZE, AO2, %xmm3)
|
||||
|
||||
vmovups %xmm0, -16 * SIZE(B1)
|
||||
vmovups %xmm1, -14 * SIZE(B1)
|
||||
vmovups %xmm2, -12 * SIZE(B1)
|
||||
vmovups %xmm3, -10 * SIZE(B1)
|
||||
|
||||
addq $4 * SIZE, AO1
|
||||
addq $4 * SIZE, AO2
|
||||
subq $-8 * SIZE, B1
|
||||
ALIGN_4
|
||||
|
||||
.L36:
|
||||
testq $2, M
|
||||
jle .L38
|
||||
|
||||
/* 2 elements from each of the two rows */
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
|
||||
VMOVUPS_A1(0 * SIZE, AO2, %xmm1)
|
||||
|
||||
vmovups %xmm0, -16 * SIZE(B2)
|
||||
vmovups %xmm1, -14 * SIZE(B2)
|
||||
|
||||
addq $2 * SIZE, AO1
|
||||
addq $2 * SIZE, AO2
|
||||
subq $-4 * SIZE, B2
|
||||
ALIGN_4
|
||||
|
||||
.L38:
|
||||
testq $1, M
|
||||
jle .L40
|
||||
|
||||
/* 1 element from each row, packed as {row0, row1} */
vmovsd 0 * SIZE(AO1), %xmm0
|
||||
vmovsd 0 * SIZE(AO2), %xmm1
|
||||
|
||||
vunpcklpd %xmm1, %xmm0, %xmm0
|
||||
|
||||
vmovups %xmm0, -16 * SIZE(B3)
|
||||
subq $-2 * SIZE, B3
|
||||
ALIGN_4
|
||||
|
||||
/* Final single row (when one row remains): straight copy of the row into   */
/* B0 in 8-wide chunks, then 4/2/1-wide remainders into B1/B2/B3.           */
.L40:
|
||||
cmpq $1, N
|
||||
jl .L999
|
||||
|
||||
movq A, AO1
|
||||
|
||||
movq B, B0
|
||||
|
||||
/* I = M / 8 eight-wide chunks; none -> remainders at .L44. */
movq M, I
|
||||
sarq $3, I
|
||||
jle .L44
|
||||
ALIGN_4
|
||||
|
||||
.L43:
|
||||
|
||||
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
|
||||
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
|
||||
VMOVUPS_A1(4 * SIZE, AO1, %xmm2)
|
||||
VMOVUPS_A1(6 * SIZE, AO1, %xmm3)
|
||||
|
||||
vmovups %xmm0, -16 * SIZE(B0)
|
||||
vmovups %xmm1, -14 * SIZE(B0)
|
||||
vmovups %xmm2, -12 * SIZE(B0)
|
||||
vmovups %xmm3, -10 * SIZE(B0)
|
||||
|
||||
/* B0 advances by 8 * M8 bytes per chunk, as in the wider panels. */
addq $8 * SIZE, AO1
|
||||
leaq (B0, M8, 8), B0
|
||||
|
||||
decq I
|
||||
jg .L43
|
||||
ALIGN_4
|
||||
|
||||
.L44:
|
||||
testq $4, M
|
||||
jle .L45
|
||||
|
||||
/* 4 elements */
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
|
||||
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
|
||||
|
||||
vmovups %xmm0, -16 * SIZE(B1)
|
||||
vmovups %xmm1, -14 * SIZE(B1)
|
||||
|
||||
addq $4 * SIZE, AO1
|
||||
subq $-4 * SIZE, B1
|
||||
ALIGN_4
|
||||
|
||||
.L45:
|
||||
testq $2, M
|
||||
jle .L46
|
||||
|
||||
/* 2 elements */
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
|
||||
|
||||
vmovups %xmm0, -16 * SIZE(B2)
|
||||
|
||||
addq $2 * SIZE, AO1
|
||||
subq $-2 * SIZE, B2
|
||||
ALIGN_4
|
||||
|
||||
.L46:
|
||||
testq $1, M
|
||||
jle .L999
|
||||
|
||||
/* last element */
vmovsd 0 * SIZE(AO1), %xmm0
|
||||
|
||||
vmovsd %xmm0, -16 * SIZE(B3)
|
||||
jmp .L999
|
||||
ALIGN_4
|
||||
|
||||
/* Common exit: restore the saved registers in reverse push order and return. */
.L999:
|
||||
popq %rbp
|
||||
popq %r12
|
||||
popq %r13
|
||||
popq %r14
|
||||
popq %r15
|
||||
|
||||
#ifdef WINDOWS_ABI
|
||||
popq %rsi
|
||||
popq %rdi
|
||||
#endif
|
||||
ret
|
||||
|
||||
EPILOGUE
|
||||
@@ -47,7 +47,7 @@
|
||||
|
||||
#ifndef WINDOWS_ABI
|
||||
|
||||
#define STACKSIZE 64
|
||||
#define STACKSIZE 128
|
||||
|
||||
#define OLD_M %rdi
|
||||
#define OLD_N %rsi
|
||||
@@ -59,9 +59,14 @@
|
||||
#define STACK_BUFFER 32 + STACKSIZE(%rsp)
|
||||
#define ALPHA 48 (%rsp)
|
||||
|
||||
#define MMM 56(%rsp)
|
||||
#define NN 64(%rsp)
|
||||
#define AA 72(%rsp)
|
||||
#define LDAX 80(%rsp)
|
||||
#define XX 88(%rsp)
|
||||
#else
|
||||
|
||||
#define STACKSIZE 256
|
||||
#define STACKSIZE 288
|
||||
|
||||
#define OLD_M %rcx
|
||||
#define OLD_N %rdx
|
||||
@@ -74,6 +79,12 @@
|
||||
#define STACK_BUFFER 88 + STACKSIZE(%rsp)
|
||||
#define ALPHA 224 (%rsp)
|
||||
|
||||
#define MMM 232(%rsp)
|
||||
#define NN 240(%rsp)
|
||||
#define AA 248(%rsp)
|
||||
#define LDAX 256(%rsp)
|
||||
#define XX 264(%rsp)
|
||||
|
||||
#endif
|
||||
|
||||
#define LDA %r8
|
||||
@@ -137,17 +148,42 @@
|
||||
movq OLD_LDA, LDA
|
||||
#endif
|
||||
|
||||
movq STACK_INCX, INCX
|
||||
movq STACK_Y, Y
|
||||
movq STACK_INCY, INCY
|
||||
movq STACK_BUFFER, BUFFER
|
||||
|
||||
#ifndef WINDOWS_ABI
|
||||
movsd %xmm0, ALPHA
|
||||
#else
|
||||
movsd %xmm3, ALPHA
|
||||
#endif
|
||||
|
||||
movq STACK_Y, Y
|
||||
movq A,AA
|
||||
movq N,NN
|
||||
movq M,MMM
|
||||
movq LDA,LDAX
|
||||
movq X,XX
|
||||
|
||||
.L0t:
|
||||
xorq I,I
|
||||
addq $1,I
|
||||
salq $21,I
|
||||
subq I,MMM
|
||||
movq I,M
|
||||
jge .L00t
|
||||
|
||||
movq MMM,M
|
||||
addq I,M
|
||||
jle .L999x
|
||||
|
||||
.L00t:
|
||||
movq XX,X
|
||||
movq AA,A
|
||||
movq NN,N
|
||||
movq LDAX,LDA
|
||||
|
||||
movq STACK_INCX, INCX
|
||||
movq STACK_INCY, INCY
|
||||
movq STACK_BUFFER, BUFFER
|
||||
|
||||
|
||||
leaq -1(INCY), %rax
|
||||
|
||||
leaq (,INCX, SIZE), INCX
|
||||
@@ -2815,6 +2851,12 @@
|
||||
ALIGN_3
|
||||
|
||||
.L999:
|
||||
leaq (, M, SIZE), %rax
|
||||
addq %rax,AA
|
||||
jmp .L0t
|
||||
ALIGN_4
|
||||
|
||||
.L999x:
|
||||
movq 0(%rsp), %rbx
|
||||
movq 8(%rsp), %rbp
|
||||
movq 16(%rsp), %r12
|
||||
|
||||
2325
kernel/x86_64/dgemv_n_bulldozer.S
Normal file
2325
kernel/x86_64/dgemv_n_bulldozer.S
Normal file
File diff suppressed because it is too large
Load Diff
1938
kernel/x86_64/dgemv_t_bulldozer.S
Normal file
1938
kernel/x86_64/dgemv_t_bulldozer.S
Normal file
File diff suppressed because it is too large
Load Diff
360
kernel/x86_64/gemm_ncopy_2_bulldozer.S
Normal file
360
kernel/x86_64/gemm_ncopy_2_bulldozer.S
Normal file
@@ -0,0 +1,360 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
/* Symbolic names for the copy routine that follows. */
/* M, N, A, LDA and B are the five arguments; I and J are loop counters; */
/* AO1 and AO2 are the two source pointers. AO3 and AO4 are defined here */
/* but are not referenced anywhere in this routine. */
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#ifndef WINDOWS_ABI
|
||||
|
||||
#define M ARG1 /* rdi */
|
||||
#define N ARG2 /* rsi */
|
||||
#define A ARG3 /* rdx */
|
||||
#define LDA ARG4 /* rcx */
|
||||
#define B ARG5 /* r8 */
|
||||
|
||||
#define I %r9
|
||||
|
||||
#else
|
||||
|
||||
/* Bytes reserved below the pushed registers under WINDOWS_ABI; the prologue */
/* stores xmm6-xmm15 at offsets 0..144 of this area. */
#define STACKSIZE 256
|
||||
|
||||
#define M ARG1 /* rcx */
|
||||
#define N ARG2 /* rdx */
|
||||
#define A ARG3 /* r8 */
|
||||
#define LDA ARG4 /* r9 */
/* Fifth argument, read from the stack after the frame is set up. */
/* 32 matches the four pushq done under WINDOWS_ABI and STACKSIZE the frame; */
/* 40 is presumably return address plus 32 bytes of home space - confirm. */
#define OLD_B 40 + 32 + STACKSIZE(%rsp)
|
||||
|
||||
#define B %r14
|
||||
#define I %r15
|
||||
|
||||
#endif
|
||||
|
||||
#define J %r10
|
||||
#define AO1 %r11
|
||||
#define AO2 %r12
|
||||
#define AO3 %r13
|
||||
#define AO4 %rax
|
||||
|
||||
/* Entry: save the registers this routine uses, scale LDA, and set up the */
/* outer loop counter J = N >> 1. */
PROLOGUE
|
||||
PROFCODE
|
||||
|
||||
#ifdef WINDOWS_ABI
|
||||
pushq %r15
|
||||
pushq %r14
|
||||
#endif
|
||||
pushq %r13
|
||||
pushq %r12
|
||||
|
||||
#ifdef WINDOWS_ABI
|
||||
subq $STACKSIZE, %rsp
|
||||
|
||||
/* Save xmm6-xmm15 in the frame; they are reloaded at .L999. */
vmovups %xmm6, 0(%rsp)
|
||||
vmovups %xmm7, 16(%rsp)
|
||||
vmovups %xmm8, 32(%rsp)
|
||||
vmovups %xmm9, 48(%rsp)
|
||||
vmovups %xmm10, 64(%rsp)
|
||||
vmovups %xmm11, 80(%rsp)
|
||||
vmovups %xmm12, 96(%rsp)
|
||||
vmovups %xmm13, 112(%rsp)
|
||||
vmovups %xmm14, 128(%rsp)
|
||||
vmovups %xmm15, 144(%rsp)
|
||||
|
||||
/* Fetch the fifth argument (B) from the stack. */
movq OLD_B, B
|
||||
#endif
|
||||
|
||||
/* LDA = LDA * SIZE. SIZE comes from common.h and is presumably the element */
/* size in bytes, which would make LDA a byte stride from here on. */
leaq (,LDA, SIZE), LDA # Scaling
|
||||
|
||||
/* J = N >> 1; go straight to .L20 when that is not positive. */
movq N, J
|
||||
sarq $1, J
|
||||
jle .L20
|
||||
ALIGN_4
|
||||
|
||||
/* .L01: one outer pass. AO1 = A and AO2 = A + LDA are the two sources; */
/* A is advanced by 2 * LDA for the next pass. */
.L01:
|
||||
movq A, AO1
|
||||
leaq (A, LDA), AO2
|
||||
leaq (A, LDA, 2), A
|
||||
|
||||
/* I = M >> 3: number of 8-element groups; skip to .L08 if none. */
movq M, I
|
||||
sarq $3, I
|
||||
jle .L08
|
||||
ALIGN_4
|
||||
|
||||
/* .L03: per iteration, read 8 elements from each of AO1 and AO2 and write */
/* them to B interleaved as AO1[k], AO2[k] for k = 0..7 (16 slots of B). */
.L03:
|
||||
|
||||
#ifndef DOUBLE
|
||||
/* Non-DOUBLE build: sixteen scalar 32-bit loads, then sixteen scalar stores. */
vmovss 0 * SIZE(AO1), %xmm0
|
||||
vmovss 0 * SIZE(AO2), %xmm1
|
||||
vmovss 1 * SIZE(AO1), %xmm2
|
||||
vmovss 1 * SIZE(AO2), %xmm3
|
||||
vmovss 2 * SIZE(AO1), %xmm4
|
||||
vmovss 2 * SIZE(AO2), %xmm5
|
||||
vmovss 3 * SIZE(AO1), %xmm6
|
||||
vmovss 3 * SIZE(AO2), %xmm7
|
||||
|
||||
vmovss 4 * SIZE(AO1), %xmm8
|
||||
vmovss 4 * SIZE(AO2), %xmm9
|
||||
vmovss 5 * SIZE(AO1), %xmm10
|
||||
vmovss 5 * SIZE(AO2), %xmm11
|
||||
vmovss 6 * SIZE(AO1), %xmm12
|
||||
vmovss 6 * SIZE(AO2), %xmm13
|
||||
vmovss 7 * SIZE(AO1), %xmm14
|
||||
vmovss 7 * SIZE(AO2), %xmm15
|
||||
|
||||
vmovss %xmm0, 0 * SIZE(B)
|
||||
vmovss %xmm1, 1 * SIZE(B)
|
||||
vmovss %xmm2, 2 * SIZE(B)
|
||||
vmovss %xmm3, 3 * SIZE(B)
|
||||
vmovss %xmm4, 4 * SIZE(B)
|
||||
vmovss %xmm5, 5 * SIZE(B)
|
||||
vmovss %xmm6, 6 * SIZE(B)
|
||||
vmovss %xmm7, 7 * SIZE(B)
|
||||
|
||||
vmovss %xmm8, 8 * SIZE(B)
|
||||
vmovss %xmm9, 9 * SIZE(B)
|
||||
vmovss %xmm10, 10 * SIZE(B)
|
||||
vmovss %xmm11, 11 * SIZE(B)
|
||||
vmovss %xmm12, 12 * SIZE(B)
|
||||
vmovss %xmm13, 13 * SIZE(B)
|
||||
vmovss %xmm14, 14 * SIZE(B)
|
||||
vmovss %xmm15, 15 * SIZE(B)
|
||||
|
||||
#else
|
||||
/* DOUBLE build: prefetch the destination for writing and the sources ahead */
/* of use (fixed 256-byte lookahead). */
prefetchw 256(B)
|
||||
|
||||
prefetchnta 256(AO1)
|
||||
/* Low 64 bits of xmm0..xmm7 <- AO1[0..7]. */
vmovsd 0 * SIZE(AO1), %xmm0
|
||||
vmovsd 1 * SIZE(AO1), %xmm1
|
||||
vmovsd 2 * SIZE(AO1), %xmm2
|
||||
vmovsd 3 * SIZE(AO1), %xmm3
|
||||
vmovsd 4 * SIZE(AO1), %xmm4
|
||||
vmovsd 5 * SIZE(AO1), %xmm5
|
||||
vmovsd 6 * SIZE(AO1), %xmm6
|
||||
vmovsd 7 * SIZE(AO1), %xmm7
|
||||
|
||||
prefetchnta 256(AO2)
|
||||
/* High 64 bits of xmm0..xmm7 <- AO2[0..7], so xmmK holds (AO1[K], AO2[K]). */
vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0
|
||||
vmovhpd 1 * SIZE(AO2), %xmm1 , %xmm1
|
||||
vmovhpd 2 * SIZE(AO2), %xmm2 , %xmm2
|
||||
vmovhpd 3 * SIZE(AO2), %xmm3 , %xmm3
|
||||
vmovhpd 4 * SIZE(AO2), %xmm4 , %xmm4
|
||||
vmovhpd 5 * SIZE(AO2), %xmm5 , %xmm5
|
||||
vmovhpd 6 * SIZE(AO2), %xmm6 , %xmm6
|
||||
vmovhpd 7 * SIZE(AO2), %xmm7 , %xmm7
|
||||
|
||||
|
||||
prefetchw 256+64(B)
|
||||
/* Eight 16-byte stores, one interleaved pair each (offsets step by 2 * SIZE, */
/* which lines up with 16 bytes assuming SIZE is 8 in this build). */
vmovups %xmm0, 0 * SIZE(B)
|
||||
vmovups %xmm1, 2 * SIZE(B)
|
||||
vmovups %xmm2, 4 * SIZE(B)
|
||||
vmovups %xmm3, 6 * SIZE(B)
|
||||
vmovups %xmm4, 8 * SIZE(B)
|
||||
vmovups %xmm5, 10 * SIZE(B)
|
||||
vmovups %xmm6, 12 * SIZE(B)
|
||||
vmovups %xmm7, 14 * SIZE(B)
|
||||
|
||||
#endif
|
||||
|
||||
/* Advance both sources by 8 elements and B by 16 (the subq of a negative */
/* constant is an add), then loop while I > 0. */
addq $8 * SIZE, AO1
|
||||
addq $8 * SIZE, AO2
|
||||
subq $-16 * SIZE, B
|
||||
decq I
|
||||
jg .L03
|
||||
ALIGN_4
|
||||
|
||||
|
||||
/* .L08: if bit 2 of M is set, copy one more group of 4 elements from each */
/* source, interleaved as in .L03 (8 slots of B); otherwise skip to .L14. */
.L08:
|
||||
testq $4 , M
|
||||
je .L14
|
||||
|
||||
ALIGN_4
|
||||
|
||||
|
||||
/* .L13 is reached by fall-through only; nothing in this routine jumps to it. */
.L13:
|
||||
#ifndef DOUBLE
|
||||
vmovss 0 * SIZE(AO1), %xmm0
|
||||
vmovss 0 * SIZE(AO2), %xmm1
|
||||
vmovss 1 * SIZE(AO1), %xmm2
|
||||
vmovss 1 * SIZE(AO2), %xmm3
|
||||
vmovss 2 * SIZE(AO1), %xmm4
|
||||
vmovss 2 * SIZE(AO2), %xmm5
|
||||
vmovss 3 * SIZE(AO1), %xmm6
|
||||
vmovss 3 * SIZE(AO2), %xmm7
|
||||
|
||||
vmovss %xmm0, 0 * SIZE(B)
|
||||
vmovss %xmm1, 1 * SIZE(B)
|
||||
vmovss %xmm2, 2 * SIZE(B)
|
||||
vmovss %xmm3, 3 * SIZE(B)
|
||||
vmovss %xmm4, 4 * SIZE(B)
|
||||
vmovss %xmm5, 5 * SIZE(B)
|
||||
vmovss %xmm6, 6 * SIZE(B)
|
||||
vmovss %xmm7, 7 * SIZE(B)
|
||||
#else
|
||||
|
||||
/* Low halves from AO1, high halves from AO2, then four paired stores. */
vmovsd 0 * SIZE(AO1), %xmm0
|
||||
vmovsd 1 * SIZE(AO1), %xmm1
|
||||
vmovsd 2 * SIZE(AO1), %xmm2
|
||||
vmovsd 3 * SIZE(AO1), %xmm3
|
||||
|
||||
vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0
|
||||
vmovhpd 1 * SIZE(AO2), %xmm1 , %xmm1
|
||||
vmovhpd 2 * SIZE(AO2), %xmm2 , %xmm2
|
||||
vmovhpd 3 * SIZE(AO2), %xmm3 , %xmm3
|
||||
|
||||
|
||||
vmovups %xmm0, 0 * SIZE(B)
|
||||
vmovups %xmm1, 2 * SIZE(B)
|
||||
vmovups %xmm2, 4 * SIZE(B)
|
||||
vmovups %xmm3, 6 * SIZE(B)
|
||||
#endif
|
||||
|
||||
/* Advance sources by 4 elements and B by 8. */
addq $4 * SIZE, AO1
|
||||
addq $4 * SIZE, AO2
|
||||
subq $-8 * SIZE, B
|
||||
ALIGN_4
|
||||
|
||||
/* .L14: I = M & 3, the elements left over after the groups of 8 and 4. */
.L14:
|
||||
movq M, I
|
||||
andq $3, I
|
||||
jle .L16
|
||||
ALIGN_4
|
||||
|
||||
/* .L15: per iteration, copy one element from AO1 and one from AO2 into two */
/* adjacent slots of B. */
.L15:
|
||||
#ifndef DOUBLE
|
||||
vmovss 0 * SIZE(AO1), %xmm0
|
||||
vmovss 0 * SIZE(AO2), %xmm1
|
||||
|
||||
vmovss %xmm0, 0 * SIZE(B)
|
||||
vmovss %xmm1, 1 * SIZE(B)
|
||||
#else
|
||||
vmovsd 0 * SIZE(AO1), %xmm0
|
||||
vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0
|
||||
|
||||
vmovups %xmm0, 0 * SIZE(B)
|
||||
#endif
|
||||
|
||||
addq $SIZE, AO1
|
||||
addq $SIZE, AO2
|
||||
addq $2 * SIZE, B
|
||||
decq I
|
||||
jg .L15
|
||||
ALIGN_4
|
||||
|
||||
/* .L16: end of one outer pass; repeat from .L01 while J > 0. */
.L16:
|
||||
decq J
|
||||
jg .L01
|
||||
ALIGN_4
|
||||
|
||||
/* .L20: if N is odd, one source line is left. It is copied to B straight */
/* through (no interleaving) from AO1 = A; otherwise exit via .L999. */
.L20:
|
||||
testq $1, N
|
||||
jle .L999
|
||||
|
||||
movq A, AO1
|
||||
|
||||
/* I = M >> 2: number of 4-element groups. */
movq M, I
|
||||
sarq $2, I
|
||||
jle .L34
|
||||
ALIGN_4
|
||||
|
||||
/* .L33: copy 4 elements per iteration: one 16-byte move in the non-DOUBLE */
/* build, two 16-byte moves in the DOUBLE build. */
.L33:
|
||||
#ifndef DOUBLE
|
||||
vmovups 0 * SIZE(AO1), %xmm0
|
||||
|
||||
vmovups %xmm0, 0 * SIZE(B)
|
||||
#else
|
||||
vmovups 0 * SIZE(AO1), %xmm0
|
||||
vmovups 2 * SIZE(AO1), %xmm1
|
||||
|
||||
vmovups %xmm0, 0 * SIZE(B)
|
||||
vmovups %xmm1, 2 * SIZE(B)
|
||||
#endif
|
||||
|
||||
addq $4 * SIZE, AO1
|
||||
subq $-4 * SIZE, B
|
||||
decq I
|
||||
jg .L33
|
||||
ALIGN_4
|
||||
|
||||
/* .L34: I = M & 3, the elements of the last line not covered by .L33. */
.L34:
|
||||
movq M, I
|
||||
andq $3, I
|
||||
jle .L999
|
||||
ALIGN_4
|
||||
|
||||
/* .L35: copy one element per iteration from AO1 to B. */
.L35:
|
||||
#ifndef DOUBLE
|
||||
vmovss 0 * SIZE(AO1), %xmm0
|
||||
vmovss %xmm0, 0 * SIZE(B)
|
||||
#else
|
||||
vmovsd 0 * SIZE(AO1), %xmm0
|
||||
vmovsd %xmm0, 0 * SIZE(B)
|
||||
#endif
|
||||
|
||||
addq $SIZE, AO1
|
||||
addq $1 * SIZE, B
|
||||
decq I
|
||||
jg .L35
|
||||
ALIGN_4
|
||||
|
||||
|
||||
/* .L999: common exit. Undo the prologue in reverse order. */
.L999:
|
||||
#ifdef WINDOWS_ABI
|
||||
/* Reload xmm6-xmm15 from the slots written in the prologue. */
vmovups 0(%rsp), %xmm6
|
||||
vmovups 16(%rsp), %xmm7
|
||||
vmovups 32(%rsp), %xmm8
|
||||
vmovups 48(%rsp), %xmm9
|
||||
vmovups 64(%rsp), %xmm10
|
||||
vmovups 80(%rsp), %xmm11
|
||||
vmovups 96(%rsp), %xmm12
|
||||
vmovups 112(%rsp), %xmm13
|
||||
vmovups 128(%rsp), %xmm14
|
||||
vmovups 144(%rsp), %xmm15
|
||||
|
||||
/* Release the STACKSIZE frame. */
addq $STACKSIZE, %rsp
|
||||
#endif
|
||||
|
||||
/* Pops mirror the pushes: r12, r13, then r14, r15 under WINDOWS_ABI. */
popq %r12
|
||||
popq %r13
|
||||
|
||||
#ifdef WINDOWS_ABI
|
||||
popq %r14
|
||||
popq %r15
|
||||
#endif
|
||||
ret
|
||||
|
||||
EPILOGUE
|
||||
374
kernel/x86_64/gemm_tcopy_2_bulldozer.S
Normal file
374
kernel/x86_64/gemm_tcopy_2_bulldozer.S
Normal file
@@ -0,0 +1,374 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
/* Symbolic names for the copy routine that follows. */
/* M, N, A, LDA and B are the five arguments; I and J are loop counters; */
/* AO1 and AO2 are source pointers; BO and BO1 are destination pointers; */
/* M8 holds M * SIZE. AO3 and AO4 are defined but not referenced in this */
/* routine. */
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#ifndef WINDOWS_ABI
|
||||
|
||||
#define M ARG1 /* rdi */
|
||||
#define N ARG2 /* rsi */
|
||||
#define A ARG3 /* rdx */
|
||||
#define LDA ARG4 /* rcx */
|
||||
#define B ARG5 /* r8 */
|
||||
|
||||
#define I %r10
|
||||
#define J %rbp
|
||||
|
||||
#define AO1 %r9
|
||||
#define AO2 %r15
|
||||
#define AO3 %r11
|
||||
#define AO4 %r14
|
||||
#define BO1 %r13
|
||||
#define M8 %rbx
|
||||
#define BO %rax
|
||||
|
||||
#else
|
||||
|
||||
/* Bytes reserved below the pushed registers under WINDOWS_ABI; the prologue */
/* stores xmm6-xmm15 at offsets 0..144 of this area. */
#define STACKSIZE 256
|
||||
|
||||
#define M ARG1 /* rcx */
|
||||
#define N ARG2 /* rdx */
|
||||
#define A ARG3 /* r8 */
|
||||
#define LDA ARG4 /* r9 */
/* Fifth argument, read from the stack after the frame is set up. */
/* 64 matches the eight pushq done under WINDOWS_ABI and STACKSIZE the frame; */
/* 40 is presumably return address plus 32 bytes of home space - confirm. */
#define OLD_B 40 + 64 + STACKSIZE(%rsp)
|
||||
|
||||
#define B %rdi
|
||||
|
||||
#define I %r10
|
||||
#define J %r11
|
||||
|
||||
#define AO1 %r12
|
||||
#define AO2 %r13
|
||||
#define AO3 %r14
|
||||
#define AO4 %r15
|
||||
|
||||
#define BO1 %rsi
|
||||
#define M8 %rbp
|
||||
#define BO %rax
|
||||
|
||||
#endif
|
||||
|
||||
/* Entry: save registers, compute the tail destination BO1, scale LDA, */
/* compute M8, and set up the outer loop counter J = M >> 1. */
PROLOGUE
|
||||
PROFCODE
|
||||
|
||||
#ifdef WINDOWS_ABI
|
||||
pushq %rdi
|
||||
pushq %rsi
|
||||
#endif
|
||||
pushq %r15
|
||||
pushq %r14
|
||||
pushq %r13
|
||||
pushq %r12
|
||||
pushq %rbp
|
||||
pushq %rbx
|
||||
|
||||
#ifdef WINDOWS_ABI
|
||||
subq $STACKSIZE, %rsp
|
||||
|
||||
/* Save xmm6-xmm15 in the frame; they are reloaded at .L999. */
vmovups %xmm6, 0(%rsp)
|
||||
vmovups %xmm7, 16(%rsp)
|
||||
vmovups %xmm8, 32(%rsp)
|
||||
vmovups %xmm9, 48(%rsp)
|
||||
vmovups %xmm10, 64(%rsp)
|
||||
vmovups %xmm11, 80(%rsp)
|
||||
vmovups %xmm12, 96(%rsp)
|
||||
vmovups %xmm13, 112(%rsp)
|
||||
vmovups %xmm14, 128(%rsp)
|
||||
vmovups %xmm15, 144(%rsp)
|
||||
|
||||
/* Fetch the fifth argument (B) from the stack. */
movq OLD_B, B
|
||||
#endif
|
||||
|
||||
/* rax = (N with bit 0 cleared) * M. */
movq N, %rax
|
||||
andq $-2, %rax
|
||||
imulq M, %rax
|
||||
|
||||
/* BO1 = B + rax * SIZE. BO1 is only written through in the odd-N paths */
/* (.L14 and .L33). */
leaq (B, %rax, SIZE), BO1
|
||||
|
||||
/* LDA = LDA * SIZE and M8 = M * SIZE (SIZE comes from common.h, presumably */
/* the element size in bytes). */
leaq (, LDA, SIZE), LDA
|
||||
leaq (, M, SIZE), M8
|
||||
|
||||
/* J = M >> 1; go straight to .L20 when that is not positive. */
movq M, J
|
||||
sarq $1, J
|
||||
jle .L20
|
||||
ALIGN_4
|
||||
|
||||
/* .L01: one outer pass. AO1 = A and AO2 = A + LDA are the two sources; */
/* A is advanced by 2 * LDA for the next pass. */
.L01:
|
||||
movq A, AO1
|
||||
leaq (A, LDA ), AO2
|
||||
leaq (A, LDA, 2), A
|
||||
|
||||
/* BO is the write cursor for this pass; B moves on by 4 * SIZE so the next */
/* pass starts right after the first 4-slot group written here. */
movq B, BO
|
||||
addq $4 * SIZE, B
|
||||
|
||||
/* I = N >> 3: number of 8-element groups; skip to .L10 if none. */
movq N, I
|
||||
sarq $3, I
|
||||
jle .L10
|
||||
ALIGN_4
|
||||
|
||||
|
||||
/* .L08: per iteration, read 8 elements from each source and write four */
/* groups. Each group is an 8/16-byte chunk from AO1 at BO followed by the */
/* matching chunk from AO2 at BO + 2 * SIZE; BO then advances by 2 * M8. */
.L08:
|
||||
#ifndef DOUBLE
|
||||
|
||||
/* Non-DOUBLE build: vmovsd moves 8 bytes at a time (two elements, assuming */
/* SIZE is 4 here). */
vmovsd 0 * SIZE(AO1), %xmm0
|
||||
vmovsd 2 * SIZE(AO1), %xmm2
|
||||
vmovsd 4 * SIZE(AO1), %xmm4
|
||||
vmovsd 6 * SIZE(AO1), %xmm6
|
||||
vmovsd 0 * SIZE(AO2), %xmm1
|
||||
vmovsd 2 * SIZE(AO2), %xmm3
|
||||
vmovsd 4 * SIZE(AO2), %xmm5
|
||||
vmovsd 6 * SIZE(AO2), %xmm7
|
||||
|
||||
vmovsd %xmm0, 0 * SIZE(BO)
|
||||
vmovsd %xmm1, 2 * SIZE(BO)
|
||||
leaq (BO, M8, 2), BO
|
||||
|
||||
vmovsd %xmm2, 0 * SIZE(BO)
|
||||
vmovsd %xmm3, 2 * SIZE(BO)
|
||||
leaq (BO, M8, 2), BO
|
||||
|
||||
vmovsd %xmm4, 0 * SIZE(BO)
|
||||
vmovsd %xmm5, 2 * SIZE(BO)
|
||||
leaq (BO, M8, 2), BO
|
||||
|
||||
vmovsd %xmm6, 0 * SIZE(BO)
|
||||
vmovsd %xmm7, 2 * SIZE(BO)
|
||||
leaq (BO, M8, 2), BO
|
||||
|
||||
|
||||
#else
|
||||
|
||||
/* DOUBLE build: prefetch both sources, then use 16-byte moves (two */
/* elements, assuming SIZE is 8 here). */
prefetchnta 256(AO1)
|
||||
prefetchnta 256(AO2)
|
||||
vmovups 0 * SIZE(AO1), %xmm0
|
||||
vmovups 2 * SIZE(AO1), %xmm2
|
||||
vmovups 4 * SIZE(AO1), %xmm4
|
||||
vmovups 6 * SIZE(AO1), %xmm6
|
||||
vmovups 0 * SIZE(AO2), %xmm1
|
||||
vmovups 2 * SIZE(AO2), %xmm3
|
||||
vmovups 4 * SIZE(AO2), %xmm5
|
||||
vmovups 6 * SIZE(AO2), %xmm7
|
||||
|
||||
vmovups %xmm0, 0 * SIZE(BO)
|
||||
vmovups %xmm1, 2 * SIZE(BO)
|
||||
leaq (BO, M8, 2), BO
|
||||
|
||||
vmovups %xmm2, 0 * SIZE(BO)
|
||||
vmovups %xmm3, 2 * SIZE(BO)
|
||||
leaq (BO, M8, 2), BO
|
||||
|
||||
vmovups %xmm4, 0 * SIZE(BO)
|
||||
vmovups %xmm5, 2 * SIZE(BO)
|
||||
leaq (BO, M8, 2), BO
|
||||
|
||||
vmovups %xmm6, 0 * SIZE(BO)
|
||||
vmovups %xmm7, 2 * SIZE(BO)
|
||||
leaq (BO, M8, 2), BO
|
||||
|
||||
#endif
|
||||
|
||||
/* Advance both sources by 8 elements and loop while I > 0. */
addq $8 * SIZE, AO1
|
||||
addq $8 * SIZE, AO2
|
||||
decq I
|
||||
jg .L08
|
||||
ALIGN_4
|
||||
|
||||
|
||||
|
||||
/* .L10: if bit 2 of N is set, handle 4 more elements from each source as */
/* two groups, laid out exactly as in .L08; otherwise skip to .L12. */
.L10:
|
||||
testq $4, N
|
||||
jle .L12
|
||||
#ifndef DOUBLE
|
||||
|
||||
vmovsd 0 * SIZE(AO1), %xmm0
|
||||
vmovsd 2 * SIZE(AO1), %xmm2
|
||||
vmovsd 0 * SIZE(AO2), %xmm1
|
||||
vmovsd 2 * SIZE(AO2), %xmm3
|
||||
|
||||
vmovsd %xmm0, 0 * SIZE(BO)
|
||||
vmovsd %xmm1, 2 * SIZE(BO)
|
||||
leaq (BO, M8, 2), BO
|
||||
|
||||
vmovsd %xmm2, 0 * SIZE(BO)
|
||||
vmovsd %xmm3, 2 * SIZE(BO)
|
||||
leaq (BO, M8, 2), BO
|
||||
|
||||
|
||||
#else
|
||||
|
||||
vmovups 0 * SIZE(AO1), %xmm0
|
||||
vmovups 2 * SIZE(AO1), %xmm2
|
||||
vmovups 0 * SIZE(AO2), %xmm1
|
||||
vmovups 2 * SIZE(AO2), %xmm3
|
||||
|
||||
vmovups %xmm0, 0 * SIZE(BO)
|
||||
vmovups %xmm1, 2 * SIZE(BO)
|
||||
leaq (BO, M8, 2), BO
|
||||
|
||||
vmovups %xmm2, 0 * SIZE(BO)
|
||||
vmovups %xmm3, 2 * SIZE(BO)
|
||||
leaq (BO, M8, 2), BO
|
||||
|
||||
#endif
|
||||
|
||||
/* Advance both sources by 4 elements. */
addq $4 * SIZE, AO1
|
||||
addq $4 * SIZE, AO2
|
||||
ALIGN_4
|
||||
|
||||
|
||||
/* .L12: if bit 1 of N is set, handle 2 more elements from each source as a */
/* single group (AO1 chunk at BO, AO2 chunk at BO + 2 * SIZE). */
.L12:
|
||||
testq $2, N
|
||||
jle .L14
|
||||
#ifndef DOUBLE
|
||||
vmovsd 0 * SIZE(AO1), %xmm0
|
||||
vmovsd 0 * SIZE(AO2), %xmm1
|
||||
|
||||
vmovsd %xmm0, 0 * SIZE(BO)
|
||||
vmovsd %xmm1, 2 * SIZE(BO)
|
||||
#else
|
||||
vmovups 0 * SIZE(AO1), %xmm0
|
||||
vmovups 0 * SIZE(AO2), %xmm1
|
||||
|
||||
vmovups %xmm0, 0 * SIZE(BO)
|
||||
vmovups %xmm1, 2 * SIZE(BO)
|
||||
#endif
|
||||
|
||||
/* Step BO by 2 * M8 and both sources by 2 elements. */
leaq (BO, M8, 2), BO
|
||||
addq $2 * SIZE, AO1
|
||||
addq $2 * SIZE, AO2
|
||||
ALIGN_4
|
||||
|
||||
/* .L14: if N is odd, one element remains in each source. The pair */
/* (AO1[0], AO2[0]) is written to BO1, which then advances by 2 * SIZE. */
.L14:
|
||||
testq $1, N
|
||||
jle .L19
|
||||
|
||||
#ifndef DOUBLE
|
||||
vmovss 0 * SIZE(AO1), %xmm0
|
||||
vmovss 0 * SIZE(AO2), %xmm1
|
||||
|
||||
vmovss %xmm0, 0 * SIZE(BO1)
|
||||
vmovss %xmm1, 1 * SIZE(BO1)
|
||||
#else
|
||||
/* Low half from AO1, high half from AO2, one 16-byte store. */
vmovsd 0 * SIZE(AO1), %xmm0
|
||||
vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0
|
||||
|
||||
vmovups %xmm0, 0 * SIZE(BO1)
|
||||
#endif
|
||||
|
||||
addq $2 * SIZE, BO1
|
||||
ALIGN_4
|
||||
|
||||
/* .L19: end of one outer pass; repeat from .L01 while J > 0. */
.L19:
|
||||
decq J
|
||||
jg .L01
|
||||
ALIGN_4
|
||||
|
||||
/* .L20: if M is odd, one source line is left; otherwise exit via .L999. */
.L20:
|
||||
testq $1, M
|
||||
jle .L999
|
||||
ALIGN_4
|
||||
|
||||
/* .L31: AO1 = A is the single source, BO = B the write cursor. */
.L31:
|
||||
movq A, AO1
|
||||
movq B, BO
|
||||
|
||||
/* I = N >> 1: number of 2-element chunks. */
movq N, I
|
||||
sarq $1, I
|
||||
jle .L33
|
||||
ALIGN_4
|
||||
|
||||
/* .L32: per iteration, copy one chunk (8 bytes non-DOUBLE, 16 bytes DOUBLE) */
/* from AO1 to BO, then step AO1 by 2 elements and BO by 2 * M8. */
.L32:
|
||||
#ifndef DOUBLE
|
||||
vmovsd 0 * SIZE(AO1), %xmm0
|
||||
vmovsd %xmm0, 0 * SIZE(BO)
|
||||
#else
|
||||
vmovups 0 * SIZE(AO1), %xmm0
|
||||
vmovups %xmm0, 0 * SIZE(BO)
|
||||
#endif
|
||||
|
||||
addq $2 * SIZE, AO1
|
||||
leaq (BO, M8, 2), BO
|
||||
decq I
|
||||
jg .L32
|
||||
ALIGN_4
|
||||
|
||||
/* .L33: if N is odd, copy the one remaining element of the last line from */
/* AO1 to BO1 and advance BO1 by one element. */
.L33:
|
||||
testq $1, N
|
||||
jle .L999
|
||||
|
||||
#ifndef DOUBLE
|
||||
vmovss 0 * SIZE(AO1), %xmm0
|
||||
vmovss %xmm0, 0 * SIZE(BO1)
|
||||
#else
|
||||
vmovsd 0 * SIZE(AO1), %xmm0
|
||||
vmovsd %xmm0, 0 * SIZE(BO1)
|
||||
#endif
|
||||
addq $1 * SIZE, BO1
|
||||
ALIGN_4
|
||||
|
||||
/* .L999: common exit. Undo the prologue in reverse order. */
.L999:
|
||||
#ifdef WINDOWS_ABI
|
||||
/* Reload xmm6-xmm15 from the slots written in the prologue. */
vmovups 0(%rsp), %xmm6
|
||||
vmovups 16(%rsp), %xmm7
|
||||
vmovups 32(%rsp), %xmm8
|
||||
vmovups 48(%rsp), %xmm9
|
||||
vmovups 64(%rsp), %xmm10
|
||||
vmovups 80(%rsp), %xmm11
|
||||
vmovups 96(%rsp), %xmm12
|
||||
vmovups 112(%rsp), %xmm13
|
||||
vmovups 128(%rsp), %xmm14
|
||||
vmovups 144(%rsp), %xmm15
|
||||
|
||||
/* Release the STACKSIZE frame. */
addq $STACKSIZE, %rsp
|
||||
#endif
|
||||
|
||||
/* Pops mirror the pushes: rbx, rbp, r12-r15, then rsi, rdi under WINDOWS_ABI. */
popq %rbx
|
||||
popq %rbp
|
||||
popq %r12
|
||||
popq %r13
|
||||
popq %r14
|
||||
popq %r15
|
||||
#ifdef WINDOWS_ABI
|
||||
popq %rsi
|
||||
popq %rdi
|
||||
#endif
|
||||
|
||||
ret
|
||||
|
||||
EPILOGUE
|
||||
4657
kernel/x86_64/sgemm_kernel_16x2_bulldozer.S
Normal file
4657
kernel/x86_64/sgemm_kernel_16x2_bulldozer.S
Normal file
File diff suppressed because it is too large
Load Diff
@@ -47,7 +47,7 @@
|
||||
|
||||
#ifndef WINDOWS_ABI
|
||||
|
||||
#define STACKSIZE 64
|
||||
#define STACKSIZE 128
|
||||
|
||||
#define OLD_M %rdi
|
||||
#define OLD_N %rsi
|
||||
@@ -58,10 +58,14 @@
|
||||
#define STACK_INCY 24 + STACKSIZE(%rsp)
|
||||
#define STACK_BUFFER 32 + STACKSIZE(%rsp)
|
||||
#define ALPHA 48 (%rsp)
|
||||
|
||||
#define MMM 56(%rsp)
|
||||
#define NN 64(%rsp)
|
||||
#define AA 72(%rsp)
|
||||
#define LDAX 80(%rsp)
|
||||
#define XX 96(%rsp)
|
||||
#else
|
||||
|
||||
#define STACKSIZE 256
|
||||
#define STACKSIZE 288
|
||||
|
||||
#define OLD_M %rcx
|
||||
#define OLD_N %rdx
|
||||
@@ -74,6 +78,12 @@
|
||||
#define STACK_BUFFER 88 + STACKSIZE(%rsp)
|
||||
#define ALPHA 224 (%rsp)
|
||||
|
||||
#define MMM 232(%rsp)
|
||||
#define NN 240(%rsp)
|
||||
#define AA 248(%rsp)
|
||||
#define LDAX 256(%rsp)
|
||||
#define XX 264(%rsp)
|
||||
|
||||
#endif
|
||||
|
||||
#define LDA %r8
|
||||
@@ -137,17 +147,41 @@
|
||||
movq OLD_LDA, LDA
|
||||
#endif
|
||||
|
||||
movq STACK_INCX, INCX
|
||||
movq STACK_Y, Y
|
||||
movq STACK_INCY, INCY
|
||||
movq STACK_BUFFER, BUFFER
|
||||
|
||||
#ifndef WINDOWS_ABI
|
||||
movss %xmm0, ALPHA
|
||||
#else
|
||||
movss %xmm3, ALPHA
|
||||
#endif
|
||||
|
||||
|
||||
movq M,MMM
|
||||
movq A,AA
|
||||
movq N,NN
|
||||
movq LDA,LDAX
|
||||
movq X,XX
|
||||
movq STACK_Y, Y
|
||||
.L0t:
|
||||
xorq I,I
|
||||
addq $1,I
|
||||
salq $22,I
|
||||
subq I,MMM
|
||||
movq I,M
|
||||
jge .L00t
|
||||
|
||||
movq MMM,M
|
||||
addq I,M
|
||||
jle .L999x
|
||||
|
||||
.L00t:
|
||||
movq AA,A
|
||||
movq NN,N
|
||||
movq LDAX,LDA
|
||||
movq XX,X
|
||||
|
||||
movq STACK_INCX, INCX
|
||||
movq STACK_INCY, INCY
|
||||
movq STACK_BUFFER, BUFFER
|
||||
|
||||
leaq (,INCX, SIZE), INCX
|
||||
leaq (,INCY, SIZE), INCY
|
||||
leaq (,LDA, SIZE), LDA
|
||||
@@ -5990,6 +6024,12 @@
|
||||
ALIGN_3
|
||||
|
||||
.L999:
|
||||
leaq (,M,SIZE),%rax
|
||||
addq %rax,AA
|
||||
jmp .L0t
|
||||
ALIGN_4
|
||||
|
||||
.L999x:
|
||||
movq 0(%rsp), %rbx
|
||||
movq 8(%rsp), %rbp
|
||||
movq 16(%rsp), %r12
|
||||
|
||||
@@ -63,7 +63,7 @@
|
||||
|
||||
#else
|
||||
|
||||
#define STACKSIZE 256
|
||||
#define STACKSIZE 288
|
||||
|
||||
#define OLD_M %rcx
|
||||
#define OLD_N %rdx
|
||||
@@ -74,10 +74,10 @@
|
||||
#define STACK_Y 72 + STACKSIZE(%rsp)
|
||||
#define STACK_INCY 80 + STACKSIZE(%rsp)
|
||||
#define STACK_BUFFER 88 + STACKSIZE(%rsp)
|
||||
#define MMM 216(%rsp)
|
||||
#define NN 224(%rsp)
|
||||
#define AA 232(%rsp)
|
||||
#define LDAX 240(%rsp)
|
||||
#define MMM 232(%rsp)
|
||||
#define NN 240(%rsp)
|
||||
#define AA 248(%rsp)
|
||||
#define LDAX 256(%rsp)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@@ -76,7 +76,7 @@
|
||||
#define movsd movlps
|
||||
#endif
|
||||
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHW prefetchw
|
||||
#define PREFETCHSIZE (16 * 16)
|
||||
|
||||
@@ -76,7 +76,7 @@
|
||||
#define movsd movlpd
|
||||
#endif
|
||||
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHW prefetchw
|
||||
#define PREFETCHSIZE (16 * 16)
|
||||
|
||||
@@ -76,7 +76,7 @@
|
||||
#define movsd movlps
|
||||
#endif
|
||||
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHW prefetchw
|
||||
#define PREFETCHSIZE (16 * 16)
|
||||
|
||||
@@ -76,7 +76,7 @@
|
||||
#define movsd movlpd
|
||||
#endif
|
||||
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHW prefetchw
|
||||
#define PREFETCHSIZE (16 * 16)
|
||||
|
||||
1407
kernel/x86_64/zgemm_kernel_2x2_bulldozer.S
Normal file
1407
kernel/x86_64/zgemm_kernel_2x2_bulldozer.S
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1385,7 +1385,7 @@ ALIGN_5
|
||||
EXTRA_DY $1, yvec15, xvec7;
|
||||
EXTRA_DY $1, yvec14, xvec6;
|
||||
EXTRA_DY $1, yvec13, xvec5;
|
||||
EXTRA_DY $2, yvec12, xvec4;
|
||||
EXTRA_DY $1, yvec12, xvec4;
|
||||
#ifndef TRMMKERNEL
|
||||
LDL_DX 0*SIZE(C0), xvec0, xvec0;
|
||||
LDH_DX 1*SIZE(C0), xvec0, xvec0;
|
||||
@@ -1406,8 +1406,8 @@ STL_DX xvec7, 2*SIZE(C0, ldc, 1);
|
||||
STH_DX xvec7, 3*SIZE(C0, ldc, 1);
|
||||
STL_DX xvec13, 0*SIZE(C0, ldc, 1);
|
||||
STH_DX xvec13, 1*SIZE(C0, ldc, 1);
|
||||
STL_DX xvec6, 2*SIZE(C0);
|
||||
STH_DX xvec6, 3*SIZE(C0);
|
||||
STL_DX xvec5, 2*SIZE(C0);
|
||||
STH_DX xvec5, 3*SIZE(C0);
|
||||
#ifndef TRMMKERNEL
|
||||
LDL_DX 0*SIZE(C1), xvec0, xvec0;
|
||||
LDH_DX 1*SIZE(C1), xvec0, xvec0;
|
||||
|
||||
@@ -42,7 +42,7 @@
|
||||
|
||||
#ifndef WINDOWS_ABI
|
||||
|
||||
#define STACKSIZE 64
|
||||
#define STACKSIZE 128
|
||||
|
||||
#define OLD_INCX 8 + STACKSIZE(%rsp)
|
||||
#define OLD_Y 16 + STACKSIZE(%rsp)
|
||||
@@ -50,7 +50,15 @@
|
||||
#define OLD_BUFFER 32 + STACKSIZE(%rsp)
|
||||
#define ALPHA_R 48 (%rsp)
|
||||
#define ALPHA_I 56 (%rsp)
|
||||
|
||||
|
||||
#define MMM 64(%rsp)
|
||||
#define NN 72(%rsp)
|
||||
#define AA 80(%rsp)
|
||||
#define XX 88(%rsp)
|
||||
#define LDAX 96(%rsp)
|
||||
#define ALPHAR 104(%rsp)
|
||||
#define ALPHAI 112(%rsp)
|
||||
|
||||
#define M %rdi
|
||||
#define N %rsi
|
||||
#define A %rcx
|
||||
@@ -62,7 +70,7 @@
|
||||
|
||||
#else
|
||||
|
||||
#define STACKSIZE 256
|
||||
#define STACKSIZE 288
|
||||
|
||||
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
|
||||
#define OLD_A 48 + STACKSIZE(%rsp)
|
||||
@@ -75,6 +83,14 @@
|
||||
/* Stack slots (offsets from %rsp) used under WINDOWS_ABI. */
#define ALPHA_R 224 (%rsp)
|
||||
#define ALPHA_I 232 (%rsp)
|
||||
|
||||
/* NOTE(review): MMM is given 232(%rsp), the same slot as ALPHA_I just above. */
/* If both names are stored to while the other is still needed, one value */
/* overwrites the other - confirm against the code that uses them. */
#define MMM 232(%rsp)
|
||||
#define NN 240(%rsp)
|
||||
#define AA 248(%rsp)
|
||||
#define XX 256(%rsp)
|
||||
#define LDAX 264(%rsp)
|
||||
#define ALPHAR 272(%rsp)
|
||||
#define ALPHAI 280(%rsp)
|
||||
|
||||
#define M %rcx
|
||||
#define N %rdx
|
||||
#define A %r8
|
||||
@@ -136,8 +152,37 @@
|
||||
movsd OLD_ALPHA_I, %xmm1
|
||||
#endif
|
||||
|
||||
movq OLD_INCX, INCX
|
||||
movq A, AA
|
||||
movq N, NN
|
||||
movq M, MMM
|
||||
movq LDA, LDAX
|
||||
movq X, XX
|
||||
movq OLD_Y, Y
|
||||
movsd %xmm0,ALPHAR
|
||||
movsd %xmm1,ALPHAI
|
||||
|
||||
.L0t:
|
||||
xorq I,I
|
||||
addq $1,I
|
||||
salq $18,I
|
||||
subq I,MMM
|
||||
movq I,M
|
||||
movsd ALPHAR,%xmm0
|
||||
movsd ALPHAI,%xmm1
|
||||
jge .L00t
|
||||
|
||||
movq MMM,M
|
||||
addq I,M
|
||||
jle .L999x
|
||||
|
||||
.L00t:
|
||||
movq AA, A
|
||||
movq NN, N
|
||||
movq LDAX, LDA
|
||||
movq XX, X
|
||||
|
||||
movq OLD_INCX, INCX
|
||||
# movq OLD_Y, Y
|
||||
movq OLD_INCY, INCY
|
||||
movq OLD_BUFFER, BUFFER
|
||||
|
||||
@@ -2673,6 +2718,12 @@
|
||||
ALIGN_3
|
||||
|
||||
.L999:
|
||||
movq M, I
|
||||
salq $ZBASE_SHIFT,I
|
||||
addq I,AA
|
||||
jmp .L0t
|
||||
.L999x:
|
||||
|
||||
movq 0(%rsp), %rbx
|
||||
movq 8(%rsp), %rbp
|
||||
movq 16(%rsp), %r12
|
||||
|
||||
@@ -42,13 +42,20 @@
|
||||
|
||||
#ifndef WINDOWS_ABI
|
||||
|
||||
#define STACKSIZE 64
|
||||
#define STACKSIZE 128
|
||||
|
||||
#define OLD_INCX 8 + STACKSIZE(%rsp)
|
||||
#define OLD_Y 16 + STACKSIZE(%rsp)
|
||||
#define OLD_INCY 24 + STACKSIZE(%rsp)
|
||||
#define OLD_BUFFER 32 + STACKSIZE(%rsp)
|
||||
|
||||
#define MMM 64(%rsp)
|
||||
#define NN 72(%rsp)
|
||||
#define AA 80(%rsp)
|
||||
#define LDAX 88(%rsp)
|
||||
#define ALPHAR 96(%rsp)
|
||||
#define ALPHAI 104(%rsp)
|
||||
|
||||
#define M %rdi
|
||||
#define N %rsi
|
||||
#define A %rcx
|
||||
@@ -60,7 +67,7 @@
|
||||
|
||||
#else
|
||||
|
||||
#define STACKSIZE 256
|
||||
#define STACKSIZE 288
|
||||
|
||||
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
|
||||
#define OLD_A 48 + STACKSIZE(%rsp)
|
||||
@@ -71,6 +78,13 @@
|
||||
#define OLD_INCY 88 + STACKSIZE(%rsp)
|
||||
#define OLD_BUFFER 96 + STACKSIZE(%rsp)
|
||||
|
||||
#define MMM 232(%rsp)
|
||||
#define NN 240(%rsp)
|
||||
#define AA 248(%rsp)
|
||||
#define LDAX 256(%rsp)
|
||||
#define ALPHAR 264(%rsp)
|
||||
#define ALPHAI 272(%rsp)
|
||||
|
||||
#define M %rcx
|
||||
#define N %rdx
|
||||
#define A %r8
|
||||
@@ -135,6 +149,32 @@
|
||||
movsd OLD_ALPHA_I, %xmm1
|
||||
#endif
|
||||
|
||||
movq A, AA
|
||||
movq N, NN
|
||||
movq M, MMM
|
||||
movq LDA, LDAX
|
||||
movsd %xmm0,ALPHAR
|
||||
movsd %xmm1,ALPHAI
|
||||
|
||||
.L0t:
|
||||
xorq I,I
|
||||
addq $1,I
|
||||
salq $19,I
|
||||
subq I,MMM
|
||||
movq I,M
|
||||
movsd ALPHAR,%xmm0
|
||||
movsd ALPHAI,%xmm1
|
||||
jge .L00t
|
||||
|
||||
movq MMM,M
|
||||
addq I,M
|
||||
jle .L999x
|
||||
|
||||
.L00t:
|
||||
movq AA, A
|
||||
movq NN, N
|
||||
movq LDAX, LDA
|
||||
|
||||
movq OLD_INCX, INCX
|
||||
movq OLD_Y, Y
|
||||
movq OLD_INCY, INCY
|
||||
@@ -2405,6 +2445,12 @@
|
||||
ALIGN_3
|
||||
|
||||
.L999:
|
||||
movq M, I
|
||||
salq $ZBASE_SHIFT,I
|
||||
addq I,AA
|
||||
jmp .L0t
|
||||
.L999x:
|
||||
|
||||
movq 0(%rsp), %rbx
|
||||
movq 8(%rsp), %rbp
|
||||
movq 16(%rsp), %r12
|
||||
|
||||
@@ -160,7 +160,7 @@
|
||||
#define a3 %xmm14
|
||||
#define xt1 %xmm15
|
||||
|
||||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
#define MOVDDUP(a, b, c) movddup a(b), c
|
||||
#define MOVDDUP2(a, b, c) movddup a##b, c
|
||||
#else
|
||||
|
||||
@@ -76,7 +76,7 @@
|
||||
#define movsd movlpd
|
||||
#endif
|
||||
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHW prefetchw
|
||||
#define PREFETCHSIZE (16 * 16)
|
||||
@@ -167,7 +167,7 @@
|
||||
#define a3 %xmm14
|
||||
#define xt1 %xmm15
|
||||
|
||||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BARCELONA_OPTIMIZATION)
|
||||
#define MOVDDUP(a, b, c) movddup a(b), c
|
||||
#define MOVDDUP2(a, b, c) movddup a##b, c
|
||||
#else
|
||||
|
||||
@@ -76,7 +76,7 @@
|
||||
#define movsd movlpd
|
||||
#endif
|
||||
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHW prefetchw
|
||||
#define PREFETCHSIZE (16 * 16)
|
||||
@@ -166,7 +166,7 @@
|
||||
#define xt1 %xmm14
|
||||
#define xt2 %xmm15
|
||||
|
||||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BARCELONA_OPTIMIZATION)
|
||||
#define MOVDDUP(a, b, c) movddup a(b), c
|
||||
#define MOVDDUP2(a, b, c) movddup a##b, c
|
||||
#else
|
||||
|
||||
@@ -76,7 +76,7 @@
|
||||
#define movsd movlpd
|
||||
#endif
|
||||
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHW prefetchw
|
||||
#define PREFETCHSIZE (16 * 16)
|
||||
@@ -166,7 +166,7 @@
|
||||
#define a3 %xmm14
|
||||
#define xt1 %xmm15
|
||||
|
||||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BARCELONA_OPTIMIZATION)
|
||||
#define MOVDDUP(a, b, c) movddup a(b), c
|
||||
#define MOVDDUP2(a, b, c) movddup a##b, c
|
||||
#else
|
||||
|
||||
Reference in New Issue
Block a user