Merge branch 'develop' into bulldozer

Conflicts:
	kernel/x86_64/KERNEL.BULLDOZER
This commit is contained in:
Zhang Xianyi
2013-07-28 06:38:25 +02:00
5433 changed files with 1471224 additions and 2545 deletions

View File

@@ -388,7 +388,7 @@ $(KDIR)xgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerv_k$(TSUFFIX).$(PSUFFIX) : $(KER
$(CC) -c $(CFLAGS) -DXDOUBLE -UCONJ -DXCONJ $< -o $@
# xgerd_k: the $(SUFFIX) and $(PSUFFIX) objects are both listed as targets and
# are compiled from $(KERNELDIR)/$(XGERCKERNEL) with -DXDOUBLE; $(XGERPARAM) is
# an additional prerequisite.
# NOTE(review): two compile lines follow. The first passes the fused token
# "-DCONJ-DXCONJ" (no space between the two options); the second passes
# "-DCONJ -DXCONJ". This looks like the before/after pair of a one-character
# fix -- confirm that only the second form is meant to remain.
$(KDIR)xgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERCKERNEL) $(XGERPARAM)
$(CC) -c $(CFLAGS) -DXDOUBLE -DCONJ-DXCONJ $< -o $@
$(CC) -c $(CFLAGS) -DXDOUBLE -DCONJ -DXCONJ $< -o $@
# chemv_U: the $(SUFFIX) and $(PSUFFIX) objects are compiled from
# $(KERNELDIR)/$(CHEMV_U_KERNEL) with -DCOMPLEX -UDOUBLE -ULOWER -DHEMV;
# $(CHEMV_U_PARAM) is an additional prerequisite.
$(KDIR)chemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_U_KERNEL) $(CHEMV_U_PARAM)
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV $< -o $@

View File

@@ -1206,328 +1206,328 @@ $(KDIR)xhemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_M
# xhemm_iltcopy ($(SUFFIX) object): compiled from the generic
# zhemm_ltcopy_<XGEMM_UNROLL_M>.c source with -DXDOUBLE -DCOMPLEX -UOUTER and
# -DLOWER (the latter is placed after the source file on the command line).
$(KDIR)xhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@
# cgemm3m copy-routine objects ($(SUFFIX) build, compiled with $(CFLAGS)).
#   - Sources are generic/zgemm3m_ncopy_<N>.c ("n" targets) or
#     generic/zgemm3m_tcopy_<N>.c ("t" targets), all built with
#     -UDOUBLE -DCOMPLEX.
#   - "o" targets pass -DUSE_ALPHA; "i" targets pass -DICOPY -UUSE_ALPHA.
#   - Target suffix b / r / i: no extra define / -DREAL_ONLY / -DIMAGE_ONLY.
# NOTE(review): every target is listed with two headers. The first picks the
# source file via $(SGEMM_UNROLL_N) or $(SGEMM_UNROLL_M), the second via
# $(CGEMM3M_UNROLL_N) or $(CGEMM3M_UNROLL_M). This looks like the before/after
# pair of a change -- confirm which header is meant to remain.
$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
# zgemm3m copy-routine objects ($(SUFFIX) build, compiled with $(CFLAGS)).
#   - Sources are generic/zgemm3m_ncopy_<N>.c ("n" targets) or
#     generic/zgemm3m_tcopy_<N>.c ("t" targets), all built with
#     -DDOUBLE -DCOMPLEX.
#   - "o" targets pass -DUSE_ALPHA; "i" targets pass -DICOPY -UUSE_ALPHA.
#   - Target suffix b / r / i: no extra define / -DREAL_ONLY / -DIMAGE_ONLY.
# NOTE(review): every target is listed with two headers. The first picks the
# source file via $(DGEMM_UNROLL_N) or $(DGEMM_UNROLL_M), the second via
# $(ZGEMM3M_UNROLL_N) or $(ZGEMM3M_UNROLL_M). This looks like the before/after
# pair of a change -- confirm which header is meant to remain.
$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
# xgemm3m copy-routine objects ($(SUFFIX) build, compiled with $(CFLAGS)).
#   - Sources are generic/zgemm3m_ncopy_<N>.c ("n" targets) or
#     generic/zgemm3m_tcopy_<N>.c ("t" targets), all built with
#     -DXDOUBLE -DCOMPLEX.
#   - "o" targets pass -DUSE_ALPHA; "i" targets pass -DICOPY -UUSE_ALPHA.
#   - Target suffix b / r / i: no extra define / -DREAL_ONLY / -DIMAGE_ONLY.
# NOTE(review): every target is listed with two headers. The first picks the
# source file via $(QGEMM_UNROLL_N) or $(QGEMM_UNROLL_M), the second via
# $(XGEMM3M_UNROLL_N) or $(XGEMM3M_UNROLL_M). This looks like the before/after
# pair of a change -- confirm which header is meant to remain.
$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
# csymm3m copy-routine objects ($(SUFFIX) build, compiled with $(CFLAGS) and
# $(NO_UNINITIALIZED_WARN)).
#   - Sources are generic/zsymm3m_ucopy_<N>.c ("u" targets) or
#     generic/zsymm3m_lcopy_<N>.c ("l" targets), all built with
#     -UDOUBLE -DCOMPLEX.
#   - "o" targets pass -DUSE_ALPHA; "i" targets pass -UUSE_ALPHA.
#   - Target suffix b / r / i: no extra define / -DREAL_ONLY / -DIMAGE_ONLY.
# NOTE(review): every target is listed with two headers. The first picks the
# source file via $(SGEMM_UNROLL_N) or $(SGEMM_UNROLL_M), the second via
# $(CGEMM3M_UNROLL_N) or $(CGEMM3M_UNROLL_M). This looks like the before/after
# pair of a change -- confirm which header is meant to remain.
$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
# zsymm3m copy-routine objects ($(SUFFIX) build, compiled with $(CFLAGS) and
# $(NO_UNINITIALIZED_WARN)).
#   - Sources are generic/zsymm3m_ucopy_<N>.c ("u" targets) or
#     generic/zsymm3m_lcopy_<N>.c ("l" targets), all built with
#     -DDOUBLE -DCOMPLEX.
#   - "o" targets pass -DUSE_ALPHA; "i" targets pass -UUSE_ALPHA.
#   - Target suffix b / r / i: no extra define / -DREAL_ONLY / -DIMAGE_ONLY.
# NOTE(review): every target is listed with two headers. The first picks the
# source file via $(DGEMM_UNROLL_N) or $(DGEMM_UNROLL_M), the second via
# $(ZGEMM3M_UNROLL_N) or $(ZGEMM3M_UNROLL_M). This looks like the before/after
# pair of a change -- confirm which header is meant to remain.
$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
# xsymm3m copy-routine objects ($(SUFFIX) build, compiled with $(CFLAGS) and
# $(NO_UNINITIALIZED_WARN)).
#   - Sources are generic/zsymm3m_ucopy_<N>.c ("u" targets) or
#     generic/zsymm3m_lcopy_<N>.c ("l" targets), all built with
#     -DXDOUBLE -DCOMPLEX.
#   - "o" targets pass -DUSE_ALPHA; "i" targets pass -UUSE_ALPHA.
#   - Target suffix b / r / i: no extra define / -DREAL_ONLY / -DIMAGE_ONLY.
# NOTE(review): every target is listed with two headers. The first picks the
# source file via $(QGEMM_UNROLL_N) or $(QGEMM_UNROLL_M), the second via
# $(XGEMM3M_UNROLL_N) or $(XGEMM3M_UNROLL_M). This looks like the before/after
# pair of a change -- confirm which header is meant to remain.
$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
# chemm3m copy-routine objects ($(SUFFIX) build, compiled with $(CFLAGS) and
# $(NO_UNINITIALIZED_WARN)).
#   - Sources are generic/zhemm3m_ucopy_<N>.c ("u" targets) or
#     generic/zhemm3m_lcopy_<N>.c ("l" targets), all built with
#     -UDOUBLE -DCOMPLEX.
#   - "o" targets pass -DUSE_ALPHA; "i" targets pass -UUSE_ALPHA.
#   - Target suffix b / r / i: no extra define / -DREAL_ONLY / -DIMAGE_ONLY.
# NOTE(review): every target is listed with two headers. The first picks the
# source file via $(SGEMM_UNROLL_N) or $(SGEMM_UNROLL_M), the second via
# $(CGEMM3M_UNROLL_N) or $(CGEMM3M_UNROLL_M). This looks like the before/after
# pair of a change -- confirm which header is meant to remain.
$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
# zhemm3m copy-routine objects ($(SUFFIX) build, compiled with $(CFLAGS) and
# $(NO_UNINITIALIZED_WARN)).
#   - Sources are generic/zhemm3m_ucopy_<N>.c ("u" targets) or
#     generic/zhemm3m_lcopy_<N>.c ("l" targets), all built with
#     -DDOUBLE -DCOMPLEX.
#   - "o" targets pass -DUSE_ALPHA; "i" targets pass -UUSE_ALPHA.
#   - Target suffix b / r / i: no extra define / -DREAL_ONLY / -DIMAGE_ONLY.
# NOTE(review): every target is listed with two headers. The first picks the
# source file via $(DGEMM_UNROLL_N) or $(DGEMM_UNROLL_M), the second via
# $(ZGEMM3M_UNROLL_N) or $(ZGEMM3M_UNROLL_M). This looks like the before/after
# pair of a change -- confirm which header is meant to remain.
$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
# xhemm3m copy-routine objects ($(SUFFIX) build, compiled with $(CFLAGS) and
# $(NO_UNINITIALIZED_WARN)).
#   - Sources are generic/zhemm3m_ucopy_<N>.c ("u" targets) or
#     generic/zhemm3m_lcopy_<N>.c ("l" targets), all built with
#     -DXDOUBLE -DCOMPLEX.
#   - "o" targets pass -DUSE_ALPHA; "i" targets pass -UUSE_ALPHA.
#   - Target suffix b / r / i: no extra define / -DREAL_ONLY / -DIMAGE_ONLY.
# NOTE(review): every target is listed with two headers. The first picks the
# source file via $(QGEMM_UNROLL_N) or $(QGEMM_UNROLL_M), the second via
# $(XGEMM3M_UNROLL_N) or $(XGEMM3M_UNROLL_M). This looks like the before/after
# pair of a change -- confirm which header is meant to remain.
$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c
@@ -2608,328 +2608,328 @@ $(KDIR)xhemm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_
$(KDIR)xhemm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c
$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@
$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)strsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c

View File

@@ -826,6 +826,22 @@ static void init_parameter(void) {
#endif
#endif
#ifdef PILEDRIVER
#ifdef DEBUG
fprintf(stderr, "Piledriver\n");
#endif
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
#ifdef EXPRECISION
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
#endif
#endif
#ifdef NANO
#ifdef DEBUG

View File

@@ -0,0 +1,59 @@
# GEMM kernel and packing (copy) routine selection, one group per precision:
# S = single real, D = double real, C = single complex, Z = double complex.
# *INCOPY/*ITCOPY name the "inner" copy sources, *ONCOPY/*OTCOPY the "outer"
# ones; the matching *OBJ variables name the objects built from them.
# NOTE(review): the S and C groups leave the inner copy variables empty --
# presumably because the kernel's M and N unroll are equal (4x4, 2x2) and the
# outer copies are reused; confirm against the consuming Makefile.

# Single precision real: 4x4 Barcelona assembly kernel, generic 4-wide copies.
SGEMMKERNEL = gemm_kernel_4x4_barcelona.S
SGEMMINCOPY =
SGEMMITCOPY =
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ =
SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
# Double precision real: 2x4 Barcelona assembly kernel; inner copies are
# 2-wide, outer copies are 4-wide.
DGEMMKERNEL = gemm_kernel_2x4_barcelona.S
DGEMMINCOPY = ../generic/gemm_ncopy_2.c
DGEMMITCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
# Single precision complex: 2x2 kernel (source file carries the zgemm_ prefix),
# generic 2-wide complex copies for the outer side only.
CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
CGEMMINCOPY =
CGEMMITCOPY =
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ =
CGEMMITCOPYOBJ =
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
# Double precision complex: 1x2 kernel; inner copies are 1-wide, outer copies
# are 2-wide.
ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S
ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c
ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
# TRSM kernel sources per precision (S/D/C/Z) and variant (LN, LT, RN, RT).
# The file-name suffix (4x4, 2x4, 2x2, 1x2) matches the unroll in the
# corresponding GEMM kernel name for that precision.
# NOTE(review): every RN entry, and ZTRSMKERNEL_LN, points at the *_LT_* source
# file. This looks deliberate (presumably one source serves several variants
# via preprocessor defines) -- confirm before "correcting" the names.
STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S
STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S
STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S
STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S
DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S
DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S
DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S
DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S
CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S
CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S
CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S
CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S
ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S
# GEMM3M kernel sources for single (C) and double (Z) complex precision.
# Their unroll suffixes (4x4, 2x4) match the real SGEMM/DGEMM kernel names
# rather than the complex CGEMM/ZGEMM ones (2x2, 1x2).
CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S

View File

@@ -101,10 +101,10 @@
#define Y 36 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 40 + STACKSIZE+ARGS(%esp)
#define BUFFER 44 + STACKSIZE+ARGS(%esp)
#define MMM 0+ARGS(%esp)
#define YY 4+ARGS(%esp)
#define AA 8+ARGS(%esp)
#define LDAX 12+ARGS(%esp)
#define I %eax
#define J %ebx
@@ -153,8 +153,8 @@
movl YY,J
movl J,Y
movl STACK_LDA, LDA
movl STACK_LDA, LDA
movl STACK_X, X
movl STACK_INCX, INCX
@@ -688,9 +688,9 @@
movl M,J
leal (,J,SIZE),%eax
addl %eax,AA
movl YY,J
addl %eax,J
movl J,YY
movl STACK_INCY,INCY
imull INCY,%eax
addl %eax,YY
jmp .L0t
ALIGN_4

View File

@@ -714,9 +714,9 @@
movl M,J
leal (,J,SIZE),%eax
addl %eax,AA
movl YY,J
addl %eax,J
movl J,YY
movl STACK_INCY,INCY
imull INCY,%eax
addl %eax,YY
jmp .L0t
ALIGN_4

View File

@@ -102,11 +102,9 @@
#define STACK_INCY 40 + STACKSIZE+ARGS(%esp)
#define BUFFER 44 + STACKSIZE+ARGS(%esp)
#define MMM 0+STACKSIZE(%esp)
#define NN 4+STACKSIZE(%esp)
#define AA 8+STACKSIZE(%esp)
#define LDAX 12+STACKSIZE(%esp)
#define XX 16+STACKSIZE(%esp)
#define MMM 0+ARGS(%esp)
#define AA 4+ARGS(%esp)
#define XX 8+ARGS(%esp)
#define I %eax
#define J %ebx
@@ -129,12 +127,8 @@
PROFCODE
movl STACK_LDA, LDA
movl LDA,LDAX # backup LDA
movl STACK_X, X
movl X,XX
movl N,J
movl J,NN # backup N
movl A,J
movl J,AA # backup A
movl M,J
@@ -144,7 +138,6 @@
addl $1,J
sall $22,J # J=2^22 floats (assuming J was cleared just above); 2^22*sizeof(float)=buffer size(16MB)
subl $8, J # Don't use last 8 float in the buffer.
# Now, split M by block J
subl J,MMM # MMM=MMM-J
movl J,M
jge .L00t
@@ -159,13 +152,10 @@
movl AA,%eax
movl %eax,A # mov AA to A
movl NN,%eax
movl %eax,N # reset N
movl LDAX, LDA # reset LDA
movl XX,X
movl XX,%eax
movl %eax,X
movl STACK_LDA, LDA
movl STACK_INCX, INCX
movl STACK_INCY, INCY
@@ -688,9 +678,9 @@
movl M,J
leal (,J,SIZE),%eax
addl %eax,AA
movl XX,J
addl %eax,J
movl J,XX
movl STACK_INCX,INCX
imull INCX,%eax
addl %eax,XX
jmp .L0t
ALIGN_4

View File

@@ -76,7 +76,7 @@
#endif
#define STACKSIZE 16
#define ARGS 16
#define ARGS 20
#define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE+ARGS(%esp)
@@ -89,10 +89,9 @@
#define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
#define BUFFER 48 + STACKSIZE+ARGS(%esp)
#define MMM 0+STACKSIZE(%esp)
#define AA 4+STACKSIZE(%esp)
#define LDAX 8+STACKSIZE(%esp)
#define NN 12+STACKSIZE(%esp)
#define MMM 0+ARGS(%esp)
#define AA 4+ARGS(%esp)
#define XX 8+ARGS(%esp)
#define I %eax
#define J %ebx
@@ -117,10 +116,8 @@
PROFCODE
movl STACK_LDA, LDA
movl LDA,LDAX # backup LDA
movl N,J
movl J,NN # backup N
movl STACK_X, X
movl X,XX
movl A,J
movl J,AA # backup A
movl M,J
@@ -130,7 +127,6 @@
addl $1,J
sall $21,J # J=2^21*sizeof(double)=buffer size(16MB)
subl $4, J # Don't use last 4 double in the buffer.
# Now, split M by block J
subl J,MMM # MMM=MMM-J
movl J,M
jge .L00t
@@ -142,15 +138,13 @@
movl %eax,M
.L00t:
movl XX,%eax
movl %eax, X
movl AA,%eax
movl %eax,A # mov AA to A
movl NN,%eax
movl %eax,N # reset N
movl LDAX, LDA # reset LDA
movl STACK_X, X
movl STACK_LDA, LDA
movl STACK_INCX, INCX
movl STACK_INCY, INCY
@@ -605,6 +599,9 @@
movl M,J
leal (,J,SIZE),%eax
addl %eax,AA
movl STACK_INCX,INCX
imull INCX,%eax
addl %eax,XX
jmp .L0t
ALIGN_4

View File

@@ -74,11 +74,11 @@
#else
movl %eax, %ecx
subl $32, %ecx
cmovg %ecx, %eax
cmovge %ecx, %eax
movl %edx, %ecx
subl $32, %ecx
cmovg %ecx, %edx
cmovge %ecx, %edx
subl %eax, %edx
movl $0, %eax

View File

@@ -69,7 +69,7 @@
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4)
#endif
@@ -439,7 +439,7 @@
.L22:
mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movlpd 2 * SIZE(BB), %xmm2
@@ -488,7 +488,7 @@
movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulsd %xmm1, %xmm2
@@ -1697,7 +1697,7 @@
.L42:
mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulpd 2 * SIZE(BB), %xmm0
@@ -1727,7 +1727,7 @@
addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulpd %xmm1, %xmm2

View File

@@ -64,7 +64,7 @@
#define BORIG 60(%esp)
#define BUFFER 128(%esp)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8)
@@ -437,7 +437,7 @@
.L32:
mulss %xmm0, %xmm2
addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 4 * SIZE(BB), %xmm2
@@ -833,7 +833,7 @@
.L22:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(BB), %xmm2
@@ -1848,7 +1848,7 @@
.L72:
mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulss 4 * SIZE(BB), %xmm0
@@ -2109,7 +2109,7 @@
ALIGN_4
.L62:
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
@@ -2429,7 +2429,7 @@
.L52:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulps 4 * SIZE(BB), %xmm0
@@ -2459,7 +2459,7 @@
addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm2
@@ -2952,7 +2952,7 @@
.L112:
mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 1 * SIZE(AA), %xmm0
@@ -3148,7 +3148,7 @@
.L102:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movsd 2 * SIZE(AA), %xmm0
@@ -3389,7 +3389,7 @@
.L92:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(AA), %xmm0
@@ -3404,7 +3404,7 @@
mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm3

View File

@@ -69,7 +69,7 @@
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4)
#endif
@@ -910,7 +910,7 @@
.L22:
mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movlpd 2 * SIZE(BB), %xmm2
@@ -959,7 +959,7 @@
movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulsd %xmm1, %xmm2
@@ -1439,7 +1439,7 @@
.L42:
mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulpd 2 * SIZE(BB), %xmm0
@@ -1469,7 +1469,7 @@
addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulpd %xmm1, %xmm2

View File

@@ -64,7 +64,7 @@
#define BORIG 60(%esp)
#define BUFFER 128(%esp)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8)
@@ -872,7 +872,7 @@
.L22:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(BB), %xmm2
@@ -1316,7 +1316,7 @@
.L32:
mulss %xmm0, %xmm2
addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 4 * SIZE(BB), %xmm2
@@ -1855,7 +1855,7 @@
.L52:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulps 4 * SIZE(BB), %xmm0
@@ -1885,7 +1885,7 @@
addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm2
@@ -2249,7 +2249,7 @@
ALIGN_4
.L62:
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
@@ -2562,7 +2562,7 @@
.L72:
mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulss 4 * SIZE(BB), %xmm0
@@ -2957,7 +2957,7 @@
.L92:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(AA), %xmm0
@@ -2972,7 +2972,7 @@
mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm3
@@ -3280,7 +3280,7 @@
.L102:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movsd 2 * SIZE(AA), %xmm0
@@ -3515,7 +3515,7 @@
.L112:
mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 1 * SIZE(AA), %xmm0

View File

@@ -69,7 +69,7 @@
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4)
#endif
@@ -1036,7 +1036,7 @@
.L42:
mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulpd 2 * SIZE(BB), %xmm0
@@ -1066,7 +1066,7 @@
addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulpd %xmm1, %xmm2
@@ -2224,7 +2224,7 @@
.L22:
mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movlpd 2 * SIZE(BB), %xmm2
@@ -2273,7 +2273,7 @@
movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulsd %xmm1, %xmm2

View File

@@ -64,7 +64,7 @@
#define BORIG 60(%esp)
#define BUFFER 128(%esp)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8)
@@ -439,7 +439,7 @@
.L92:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(AA), %xmm0
@@ -454,7 +454,7 @@
mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm3
@@ -758,7 +758,7 @@
.L102:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movsd 2 * SIZE(AA), %xmm0
@@ -993,7 +993,7 @@
.L112:
mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 1 * SIZE(AA), %xmm0
@@ -1324,7 +1324,7 @@
.L52:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulps 4 * SIZE(BB), %xmm0
@@ -1354,7 +1354,7 @@
addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm2
@@ -1718,7 +1718,7 @@
ALIGN_4
.L62:
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
@@ -2031,7 +2031,7 @@
.L72:
mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulss 4 * SIZE(BB), %xmm0
@@ -2859,7 +2859,7 @@
.L22:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(BB), %xmm2
@@ -3303,7 +3303,7 @@
.L32:
mulss %xmm0, %xmm2
addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 4 * SIZE(BB), %xmm2

View File

@@ -89,18 +89,23 @@
#endif
#define STACKSIZE 16
#define ARGS 20
#define M 4 + STACKSIZE(%esp)
#define N 8 + STACKSIZE(%esp)
#define ALPHA_R 16 + STACKSIZE(%esp)
#define ALPHA_I 20 + STACKSIZE(%esp)
#define A 24 + STACKSIZE(%esp)
#define STACK_LDA 28 + STACKSIZE(%esp)
#define STACK_X 32 + STACKSIZE(%esp)
#define STACK_INCX 36 + STACKSIZE(%esp)
#define Y 40 + STACKSIZE(%esp)
#define STACK_INCY 44 + STACKSIZE(%esp)
#define BUFFER 48 + STACKSIZE(%esp)
#define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE+ARGS(%esp)
#define ALPHA_R 16 + STACKSIZE+ARGS(%esp)
#define ALPHA_I 20 + STACKSIZE+ARGS(%esp)
#define A 24 + STACKSIZE+ARGS(%esp)
#define STACK_LDA 28 + STACKSIZE+ARGS(%esp)
#define STACK_X 32 + STACKSIZE+ARGS(%esp)
#define STACK_INCX 36 + STACKSIZE+ARGS(%esp)
#define Y 40 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
#define BUFFER 48 + STACKSIZE+ARGS(%esp)
#define MMM 0+ARGS(%esp)
#define YY 4+ARGS(%esp)
#define AA 8+ARGS(%esp)
#define I %eax
#define J %ebx
@@ -123,6 +128,7 @@
PROLOGUE
subl $ARGS,%esp
pushl %ebp
pushl %edi
pushl %esi
@@ -130,6 +136,33 @@
PROFCODE
movl Y,J # copy the incoming Y and A arguments into the YY/AA scratch slots
movl J,YY
movl A,J
movl J,AA
movl M,J # MMM tracks how much of M is still left to process
movl J,MMM
.L0t: # top of the outer loop: one pass per block of M
xorl J,J
addl $1,J
sall $20,J # J = 1 << 20, the block size used for each pass
subl J,MMM # MMM -= J; these flags are the ones jge tests (movl does not change flags)
movl J,M # assume a full block of J for this pass
jge .L00t # MMM did not go negative: a full block is available
ALIGN_3
movl MMM,%eax
addl J,%eax # eax = MMM + J, i.e. what was left before this pass
jle .L999x # nothing left: go to the final exit
movl %eax,M # last, partial block
.L00t:
movl AA,%eax # reload A and Y from the per-block copies kept in AA/YY
movl %eax,A
movl YY,J
movl J,Y
movl STACK_LDA, LDA
movl STACK_X, X
movl STACK_INCX, INCX
@@ -595,10 +628,21 @@
ALIGN_3
.L999: # end of one block: advance the saved pointers and loop again
movl M,%eax
sall $ZBASE_SHIFT,%eax # eax = M << ZBASE_SHIFT (presumably the block length in bytes - confirm ZBASE_SHIFT)
addl %eax,AA # move the saved A pointer past the block just processed
movl STACK_INCY,INCY
imull INCY,%eax # scale that offset by the caller's INCY
addl %eax,YY # and advance the saved Y pointer by the result
jmp .L0t
ALIGN_3
.L999x: # final exit: restore the pushed registers and return
popl %ebx
popl %esi
popl %edi
popl %ebp
addl $ARGS,%esp # release the ARGS bytes reserved by subl $ARGS,%esp in the prologue
ret
EPILOGUE

View File

@@ -76,18 +76,23 @@
#endif
#define STACKSIZE 16
#define ARGS 16
#define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE+ARGS(%esp)
#define ALPHA_R 16 + STACKSIZE+ARGS(%esp)
#define ALPHA_I 24 + STACKSIZE+ARGS(%esp)
#define A 32 + STACKSIZE+ARGS(%esp)
#define STACK_LDA 36 + STACKSIZE+ARGS(%esp)
#define STACK_X 40 + STACKSIZE+ARGS(%esp)
#define STACK_INCX 44 + STACKSIZE+ARGS(%esp)
#define Y 48 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 52 + STACKSIZE+ARGS(%esp)
#define BUFFER 56 + STACKSIZE+ARGS(%esp)
#define MMM 0 + ARGS(%esp)
#define YY 4 + ARGS(%esp)
#define AA 8 + ARGS(%esp)
#define M 4 + STACKSIZE(%esp)
#define N 8 + STACKSIZE(%esp)
#define ALPHA_R 16 + STACKSIZE(%esp)
#define ALPHA_I 24 + STACKSIZE(%esp)
#define A 32 + STACKSIZE(%esp)
#define STACK_LDA 36 + STACKSIZE(%esp)
#define STACK_X 40 + STACKSIZE(%esp)
#define STACK_INCX 44 + STACKSIZE(%esp)
#define Y 48 + STACKSIZE(%esp)
#define STACK_INCY 52 + STACKSIZE(%esp)
#define BUFFER 56 + STACKSIZE(%esp)
#define I %eax
#define J %ebx
@@ -110,6 +115,7 @@
PROLOGUE
subl $ARGS,%esp
pushl %ebp
pushl %edi
pushl %esi
@@ -117,6 +123,33 @@
PROFCODE
movl Y,J
movl J,YY
movl A,J
movl J,AA
movl M,J
movl J,MMM
.L0t:
xorl J,J
addl $1,J
sall $18,J
subl J,MMM
movl J,M
jge .L00t
ALIGN_3
movl MMM,%eax
addl J,%eax
jle .L999x
movl %eax,M
.L00t:
movl AA,%eax
movl %eax,A
movl YY,J
movl J,Y
movl STACK_LDA, LDA
movl STACK_X, X
movl STACK_INCX, INCX
@@ -458,10 +491,21 @@
ALIGN_3
.L999:
movl M,%eax
sall $ZBASE_SHIFT,%eax
addl %eax,AA
movl STACK_INCY,INCY
imull INCY,%eax
addl %eax,YY
jmp .L0t
ALIGN_3
.L999x:
popl %ebx
popl %esi
popl %edi
popl %ebp
addl $ARGS,%esp
ret
EPILOGUE

View File

@@ -89,18 +89,23 @@
#endif
#define STACKSIZE 16
#define ARGS 20
#define M 4 + STACKSIZE(%esp)
#define N 8 + STACKSIZE(%esp)
#define ALPHA_R 16 + STACKSIZE(%esp)
#define ALPHA_I 20 + STACKSIZE(%esp)
#define A 24 + STACKSIZE(%esp)
#define STACK_LDA 28 + STACKSIZE(%esp)
#define STACK_X 32 + STACKSIZE(%esp)
#define STACK_INCX 36 + STACKSIZE(%esp)
#define Y 40 + STACKSIZE(%esp)
#define STACK_INCY 44 + STACKSIZE(%esp)
#define BUFFER 48 + STACKSIZE(%esp)
#define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE+ARGS(%esp)
#define ALPHA_R 16 + STACKSIZE+ARGS(%esp)
#define ALPHA_I 20 + STACKSIZE+ARGS(%esp)
#define A 24 + STACKSIZE+ARGS(%esp)
#define STACK_LDA 28 + STACKSIZE+ARGS(%esp)
#define STACK_X 32 + STACKSIZE+ARGS(%esp)
#define STACK_INCX 36 + STACKSIZE+ARGS(%esp)
#define Y 40 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
#define BUFFER 48 + STACKSIZE+ARGS(%esp)
#define MMM 0+ARGS(%esp)
#define XX 4+ARGS(%esp)
#define AA 8+ARGS(%esp)
#define I %eax
#define J %ebx
@@ -123,6 +128,7 @@
PROLOGUE
subl $ARGS,%esp
pushl %ebp
pushl %edi
pushl %esi
@@ -130,8 +136,35 @@
PROFCODE
movl STACK_LDA, LDA
movl STACK_X, X
movl X,XX
movl A,J
movl J,AA #backup A
movl M,J
movl J,MMM
.L0t:
xorl J,J
addl $1,J
sall $20,J
subl $8,J
subl J,MMM #MMM-=J
movl J,M
jge .L00t
ALIGN_4
movl MMM,%eax
addl J,%eax
jle .L999x
movl %eax,M
.L00t:
movl AA,%eax
movl %eax,A
movl XX,%eax
movl %eax,X
movl STACK_LDA,LDA
movl STACK_INCX, INCX
movl STACK_INCY, INCY
@@ -513,10 +546,22 @@
ALIGN_4
.L999:
movl M,%eax
sall $ZBASE_SHIFT, %eax
addl %eax,AA
movl STACK_INCX,INCX
imull INCX,%eax
addl %eax,XX
jmp .L0t
ALIGN_4
.L999x:
popl %ebx
popl %esi
popl %edi
popl %ebp
addl $ARGS,%esp
ret
EPILOGUE

View File

@@ -76,19 +76,24 @@
#endif
#define STACKSIZE 16
#define ARGS 20
#define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE+ARGS(%esp)
#define ALPHA_R 16 + STACKSIZE+ARGS(%esp)
#define ALPHA_I 24 + STACKSIZE+ARGS(%esp)
#define A 32 + STACKSIZE+ARGS(%esp)
#define STACK_LDA 36 + STACKSIZE+ARGS(%esp)
#define STACK_X 40 + STACKSIZE+ARGS(%esp)
#define STACK_INCX 44 + STACKSIZE+ARGS(%esp)
#define Y 48 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 52 + STACKSIZE+ARGS(%esp)
#define BUFFER 56 + STACKSIZE+ARGS(%esp)
#define MMM 0 + ARGS(%esp)
#define AA 4 + ARGS(%esp)
#define XX 8 + ARGS(%esp)
#define M 4 + STACKSIZE(%esp)
#define N 8 + STACKSIZE(%esp)
#define ALPHA_R 16 + STACKSIZE(%esp)
#define ALPHA_I 24 + STACKSIZE(%esp)
#define A 32 + STACKSIZE(%esp)
#define STACK_LDA 36 + STACKSIZE(%esp)
#define STACK_X 40 + STACKSIZE(%esp)
#define STACK_INCX 44 + STACKSIZE(%esp)
#define Y 48 + STACKSIZE(%esp)
#define STACK_INCY 52 + STACKSIZE(%esp)
#define BUFFER 56 + STACKSIZE(%esp)
#define I %eax
#define J %ebx
@@ -110,6 +115,7 @@
PROLOGUE
subl $ARGS,%esp
pushl %ebp
pushl %edi
pushl %esi
@@ -117,8 +123,35 @@
PROFCODE
movl STACK_X, X
movl X, XX
movl A,J
movl J,AA
movl M,J
movl J,MMM
.L0t:
xorl J,J
addl $1,J
sall $18,J
subl $4,J
subl J,MMM
movl J,M
jge .L00t
ALIGN_4
movl MMM,%eax
addl J,%eax
jle .L999x
movl %eax, M
.L00t:
movl XX, %eax
movl %eax, X
movl AA,%eax
movl %eax,A
movl STACK_LDA, LDA
movl STACK_X, X
movl STACK_INCX, INCX
movl STACK_INCY, INCY
@@ -188,7 +221,7 @@
movl Y, Y1
movl N, J
ALIGN_3
ALIGN_4
.L11:
movl BUFFER, X
@@ -395,10 +428,21 @@
ALIGN_4
.L999:
movl M,%eax
sall $ZBASE_SHIFT,%eax
addl %eax,AA
movl STACK_INCX,INCX
imull INCX,%eax
addl %eax,XX
jmp .L0t
ALIGN_4
.L999x:
popl %ebx
popl %esi
popl %edi
popl %ebp
addl $ARGS,%esp
ret
EPILOGUE

View File

@@ -75,7 +75,7 @@
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112
#define PREFETCH prefetch
@@ -533,7 +533,7 @@
addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
addps %xmm2, %xmm4

View File

@@ -75,7 +75,7 @@
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112
#define PREFETCH prefetch
@@ -994,7 +994,7 @@
addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
addps %xmm2, %xmm4

View File

@@ -75,7 +75,7 @@
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112
#define PREFETCH prefetch
@@ -1820,7 +1820,7 @@
addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
addps %xmm2, %xmm4

View File

@@ -1,62 +1,71 @@
ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_dup.S
SGEMMKERNEL = sgemm_kernel_8x4_bulldozer.S
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = gemm_ncopy_4_opteron.S
SGEMMOTCOPY = gemm_tcopy_4_opteron.S
DGEMVNKERNEL = dgemv_n_bulldozer.S
DGEMVTKERNEL = dgemv_t_bulldozer.S
DAXPYKERNEL = daxpy_bulldozer.S
DDOTKERNEL = ddot_bulldozer.S
DCOPYKERNEL = dcopy_bulldozer.S
SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
SGEMMONCOPY = gemm_ncopy_2_bulldozer.S
SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_4x4_bulldozer.S
DGEMMINCOPY =
DGEMMITCOPY =
DGEMMONCOPY = gemm_ncopy_4_opteron.S
DGEMMOTCOPY = gemm_tcopy_4_opteron.S
DGEMMINCOPYOBJ =
DGEMMITCOPYOBJ =
DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S
DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S
DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S
DGEMMONCOPY = gemm_ncopy_2_bulldozer.S
DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S
CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPY = zgemm_ncopy_2.S
CGEMMOTCOPY = zgemm_tcopy_2.S
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
ZGEMMKERNEL = zgemm_kernel_2x2_bulldozer.S
ZGEMMINCOPY =
ZGEMMITCOPY =
ZGEMMONCOPY = zgemm_ncopy_2.S
ZGEMMOTCOPY = zgemm_tcopy_2.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPYOBJ =
ZGEMMITCOPYOBJ =
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S
STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S
STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S
STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S
DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S
DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S
DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S
DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S
CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S
CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S
CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S
CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S
ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S
ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S
ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S
ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

View File

@@ -0,0 +1,70 @@
# Kernel source selection. Each variable names the source file used for one
# routine; values starting with ../generic/ are C sources from the sibling
# generic directory, the others are assembly (.S) files.
# NOTE(review): how these variables are consumed is defined by the including
# makefile, which is not part of this file.
# Level 1/2 kernels: ZGEMV uses the *_dup sources, the D* entries use the
# *_bulldozer sources.
ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_dup.S
DGEMVNKERNEL = dgemv_n_bulldozer.S
DGEMVTKERNEL = dgemv_t_bulldozer.S
DAXPYKERNEL = daxpy_bulldozer.S
DDOTKERNEL = ddot_bulldozer.S
DCOPYKERNEL = dcopy_bulldozer.S
# SGEMM: 16x2 kernel; inner copies are the generic unroll-16 C routines,
# outer copies the unroll-2 bulldozer assembly.
SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
SGEMMONCOPY = gemm_ncopy_2_bulldozer.S
SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
# DGEMM: 8x2 kernel; all four copy routines are bulldozer assembly.
DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S
DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S
DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S
DGEMMONCOPY = gemm_ncopy_2_bulldozer.S
DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
# CGEMM: 4x2 kernel; all copy routines come from the generic C sources
# (unroll 4 for the inner copies, unroll 2 for the outer copies).
CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
# ZGEMM: 2x2 kernel. The inner-copy sources and object names are left empty;
# only the outer copies are listed.
# NOTE(review): presumably no separate inner copy is needed for the 2x2 kernel
# and the including makefile skips empty entries - confirm.
ZGEMMKERNEL = zgemm_kernel_2x2_bulldozer.S
ZGEMMINCOPY =
ZGEMMITCOPY =
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPYOBJ =
ZGEMMITCOPYOBJ =
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
# GEMM3M kernels use the barcelona assembly sources.
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
# TRSM: every variant uses the generic C implementation; the same four source
# files are shared by the S, D, C and Z groups.
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

View File

@@ -69,7 +69,7 @@
#endif
movaps %xmm0, ALPHA
#else
movaps %xmm3, ALPHA
movq 40(%rsp), X
movq 48(%rsp), INCX
@@ -79,6 +79,10 @@
SAVEREGISTERS
#ifdef WINDOWS_ABI
movaps %xmm3, ALPHA
#endif
shufps $0, ALPHA, ALPHA
leaq (, INCX, SIZE), INCX

View File

@@ -69,7 +69,6 @@
#endif
movaps %xmm0, ALPHA
#else
movaps %xmm3, ALPHA
movq 40(%rsp), X
movq 48(%rsp), INCX
@@ -79,6 +78,10 @@
SAVEREGISTERS
#ifdef WINDOWS_ABI
movaps %xmm3, ALPHA
#endif
unpcklpd ALPHA, ALPHA
leaq (, INCX, SIZE), INCX

File diff suppressed because it is too large Load Diff

View File

@@ -47,14 +47,22 @@
#ifndef WINDOWS_ABI
#define STACKSIZE 64
#define STACKSIZE 128
#define OLD_INCX 8 + STACKSIZE(%rsp)
#define OLD_Y 16 + STACKSIZE(%rsp)
#define OLD_INCY 24 + STACKSIZE(%rsp)
#define OLD_BUFFER 32 + STACKSIZE(%rsp)
#define ALPHA 48 (%rsp)
#define MMM 64(%rsp)
#define NN 72(%rsp)
#define AA 80(%rsp)
#define XX 88(%rsp)
#define LDAX 96(%rsp)
#define ALPHAR 104(%rsp)
#define ALPHAI 112(%rsp)
#define M %rdi
#define N %rsi
#define A %rcx
@@ -66,7 +74,7 @@
#else
#define STACKSIZE 256
#define STACKSIZE 288
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
#define OLD_A 48 + STACKSIZE(%rsp)
@@ -78,6 +86,14 @@
#define OLD_BUFFER 96 + STACKSIZE(%rsp)
#define ALPHA 224 (%rsp)
#define MMM 232(%rsp)
#define NN 240(%rsp)
#define AA 248(%rsp)
#define XX 256(%rsp)
#define LDAX 264(%rsp)
#define ALPHAR 272(%rsp)
#define ALPHAI 280(%rsp)
#define M %rcx
#define N %rdx
#define A %r8
@@ -142,9 +158,37 @@
movaps %xmm3, %xmm0
movss OLD_ALPHA_I, %xmm1
#endif
movq A, AA
movq N, NN
movq M, MMM
movq LDA, LDAX
movq X, XX
movq OLD_Y, Y
movss %xmm0,ALPHAR
movss %xmm1,ALPHAI
.L0t:
xorq I,I
addq $1,I
salq $20,I
subq I,MMM
movq I,M
movss ALPHAR,%xmm0
movss ALPHAI,%xmm1
jge .L00t
movq MMM,M
addq I,M
jle .L999x
.L00t:
movq AA, A
movq NN, N
movq LDAX, LDA
movq XX, X
movq OLD_INCX, INCX
movq OLD_Y, Y
# movq OLD_Y, Y
movq OLD_INCY, INCY
movq OLD_BUFFER, BUFFER
@@ -4274,6 +4318,11 @@
ALIGN_3
.L999:
movq M, I
salq $ZBASE_SHIFT,I
addq I,AA
jmp .L0t
.L999x:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
movq 16(%rsp), %r12

View File

@@ -47,13 +47,19 @@
#ifndef WINDOWS_ABI
#define STACKSIZE 64
#define STACKSIZE 128
#define OLD_INCX 8 + STACKSIZE(%rsp)
#define OLD_Y 16 + STACKSIZE(%rsp)
#define OLD_INCY 24 + STACKSIZE(%rsp)
#define OLD_BUFFER 32 + STACKSIZE(%rsp)
#define ALPHA 48 (%rsp)
#define MMM 64(%rsp)
#define NN 72(%rsp)
#define AA 80(%rsp)
#define LDAX 88(%rsp)
#define ALPHAR 96(%rsp)
#define ALPHAI 104(%rsp)
#define M %rdi
#define N %rsi
@@ -66,7 +72,7 @@
#else
#define STACKSIZE 256
#define STACKSIZE 288
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
#define OLD_A 48 + STACKSIZE(%rsp)
@@ -78,6 +84,13 @@
#define OLD_BUFFER 96 + STACKSIZE(%rsp)
#define ALPHA 224 (%rsp)
#define MMM 232(%rsp)
#define NN 240(%rsp)
#define AA 248(%rsp)
#define LDAX 256(%rsp)
#define ALPHAR 264(%rsp)
#define ALPHAI 272(%rsp)
#define M %rcx
#define N %rdx
#define A %r8
@@ -144,6 +157,32 @@
movss OLD_ALPHA_I, %xmm1
#endif
movq A, AA
movq N, NN
movq M, MMM
movq LDA, LDAX
movss %xmm0,ALPHAR
movss %xmm1,ALPHAI
.L0t:
xorq I,I
addq $1,I
salq $20,I
subq I,MMM
movq I,M
movss ALPHAR,%xmm0
movss ALPHAI,%xmm1
jge .L00t
movq MMM,M
addq I,M
jle .L999x
.L00t:
movq AA, A
movq NN, N
movq LDAX, LDA
movq OLD_INCX, INCX
movq OLD_Y, Y
movq OLD_INCY, INCY
@@ -4350,6 +4389,11 @@
ALIGN_3
.L999:
movq M, I
salq $ZBASE_SHIFT,I
addq I,AA
jmp .L0t
.L999x:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
movq 16(%rsp), %r12

View File

@@ -0,0 +1,408 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
/* Register assignment for the AXPY kernel.
   Non-Windows: M, X, INCX and Y are used straight from their argument
   registers; INCY lives in the ARG2 register and is loaded from the stack
   by the prologue. */
#ifndef WINDOWS_ABI
#define M ARG1
#define X ARG4
#define INCX ARG5
#define Y ARG6
#define INCY ARG2
#else
/* Windows: only M is used from its incoming register; X, INCX, Y and INCY
   are all (re)loaded from the stack by the prologue. */
#define M ARG1
#define X ARG2
#define INCX ARG3
#define Y ARG4
#define INCY %r10
#endif
/* Read cursor over Y in the strided path (Y itself is the write cursor). */
#define YY %r11
/* alpha; the prologue duplicates it into both 64-bit lanes. */
#define ALPHA %xmm15
/* Prefetch distance, used as a byte displacement from X and Y. */
#define A_PRE 640
#include "l1param.h"
/* AXPY kernel entry: Y[i] = alpha * X[i] + Y[i] for i in [0, M).
   In:  M (element count), alpha (xmm0, or xmm3 on Windows), X/INCX, Y/INCY.
   Out: %rax = 0.
   This section loads the arguments, converts the increments to byte strides,
   dispatches to the strided path (.L40) unless both strides are one element,
   and peels a single element when Y is not 16-byte aligned. */
PROLOGUE
PROFCODE
#ifndef WINDOWS_ABI
/* INCY is the only argument read from the stack; its slot depends on
   XDOUBLE. */
#ifndef XDOUBLE
movq 8(%rsp), INCY
#else
movq 24(%rsp), INCY
#endif
vmovups %xmm0, ALPHA
#else
/* Stack arguments must be read before SAVEREGISTERS changes the frame. */
movq 40(%rsp), X
movq 48(%rsp), INCX
movq 56(%rsp), Y
movq 64(%rsp), INCY
#endif
SAVEREGISTERS
#ifdef WINDOWS_ABI
/* ALPHA is %xmm15, which is callee-saved under the Windows x64 ABI. It must
   only be overwritten after SAVEREGISTERS has spilled the caller's value
   (SAVEREGISTERS comes from common.h; the sibling axpy_sse/axpy_sse2 kernels
   load alpha at this same point). %xmm3 is still intact here. */
vmovups %xmm3, ALPHA
#endif
/* Broadcast alpha into both lanes for the packed FMAs. */
unpcklpd ALPHA, ALPHA
/* Element increments -> byte strides. */
leaq (, INCX, SIZE), INCX
leaq (, INCY, SIZE), INCY
/* Nothing to do for M <= 0. */
testq M, M
jle .L47
/* Any non-unit stride takes the strided path. */
cmpq $SIZE, INCX
jne .L40
cmpq $SIZE, INCY
jne .L40
/* If Y is not 16-byte aligned, process one element so it becomes aligned. */
testq $SIZE, Y
je .L10
movsd (X), %xmm0
mulsd ALPHA, %xmm0
addsd (Y), %xmm0
movsd %xmm0, (Y)
addq $1 * SIZE, X
addq $1 * SIZE, Y
decq M
jle .L19
ALIGN_4
/* Unit-stride path. X and Y are advanced by 16 elements up front so that the
   current 16-element chunk is addressed with offsets -16..-2 and the first
   half of the next chunk with offsets 0..6. */
.L10:
subq $-16 * SIZE, X
subq $-16 * SIZE, Y
/* %rax = number of full 16-element chunks. */
movq M, %rax
sarq $4, %rax
jle .L13
/* Preload the first 8 X elements of the first chunk. */
vmovups -16 * SIZE(X), %xmm0
vmovups -14 * SIZE(X), %xmm1
vmovups -12 * SIZE(X), %xmm2
vmovups -10 * SIZE(X), %xmm3
decq %rax
jle .L12
ALIGN_3
/* Main loop, 16 elements per iteration. xmm0-3 hold the first 8 X elements
   on entry; each vfmaddpd computes X*alpha + Y and the result is stored back
   to Y. The first 8 X elements of the next chunk are loaded (offsets 0..6)
   before the pointers advance. */
.L11:
prefetchnta A_PRE(Y)
vmovups -8 * SIZE(X), %xmm4
vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0
vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1
vmovups -6 * SIZE(X), %xmm5
vmovups -4 * SIZE(X), %xmm6
vfmaddpd -12 * SIZE(Y), ALPHA, %xmm2 , %xmm2
vfmaddpd -10 * SIZE(Y), ALPHA, %xmm3 , %xmm3
vmovups -2 * SIZE(X), %xmm7
vmovups %xmm0, -16 * SIZE(Y)
vmovups %xmm1, -14 * SIZE(Y)
prefetchnta A_PRE(X)
nop
vmovups %xmm2, -12 * SIZE(Y)
vmovups %xmm3, -10 * SIZE(Y)
prefetchnta A_PRE+64(Y)
vmovups 0 * SIZE(X), %xmm0
vfmaddpd -8 * SIZE(Y), ALPHA, %xmm4 , %xmm4
vfmaddpd -6 * SIZE(Y), ALPHA, %xmm5 , %xmm5
vmovups 2 * SIZE(X), %xmm1
vmovups 4 * SIZE(X), %xmm2
vfmaddpd -4 * SIZE(Y), ALPHA, %xmm6 , %xmm6
vfmaddpd -2 * SIZE(Y), ALPHA, %xmm7 , %xmm7
vmovups 6 * SIZE(X), %xmm3
vmovups %xmm4, -8 * SIZE(Y)
vmovups %xmm5, -6 * SIZE(Y)
prefetchnta A_PRE+64(X)
nop
vmovups %xmm6, -4 * SIZE(Y)
vmovups %xmm7, -2 * SIZE(Y)
subq $-16 * SIZE, Y
subq $-16 * SIZE, X
decq %rax
jg .L11
ALIGN_3
/* Last full chunk: same arithmetic as .L11 but without prefetches and
   without loading anything from the following chunk. */
.L12:
vmovups -8 * SIZE(X), %xmm4
vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0
vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1
vmovups -6 * SIZE(X), %xmm5
vmovups -4 * SIZE(X), %xmm6
vfmaddpd -12 * SIZE(Y), ALPHA, %xmm2 , %xmm2
vfmaddpd -10 * SIZE(Y), ALPHA, %xmm3 , %xmm3
vmovups -2 * SIZE(X), %xmm7
vmovups %xmm0, -16 * SIZE(Y)
vmovups %xmm1, -14 * SIZE(Y)
vmovups %xmm2, -12 * SIZE(Y)
vmovups %xmm3, -10 * SIZE(Y)
vfmaddpd -8 * SIZE(Y), ALPHA, %xmm4 , %xmm4
vfmaddpd -6 * SIZE(Y), ALPHA, %xmm5 , %xmm5
vfmaddpd -4 * SIZE(Y), ALPHA, %xmm6 , %xmm6
vfmaddpd -2 * SIZE(Y), ALPHA, %xmm7 , %xmm7
vmovups %xmm4, -8 * SIZE(Y)
vmovups %xmm5, -6 * SIZE(Y)
vmovups %xmm6, -4 * SIZE(Y)
vmovups %xmm7, -2 * SIZE(Y)
subq $-16 * SIZE, Y
subq $-16 * SIZE, X
ALIGN_3
/* Tails: the low four bits of M select an 8-, 4-, 2- and 1-element step.
   The and result is never negative, so each jle is taken exactly when the
   tested bit is clear. */
.L13:
movq M, %rax
andq $8, %rax
jle .L14
ALIGN_3
vmovups -16 * SIZE(X), %xmm0
vmovups -14 * SIZE(X), %xmm1
vmovups -12 * SIZE(X), %xmm2
vmovups -10 * SIZE(X), %xmm3
vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0
vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1
vfmaddpd -12 * SIZE(Y), ALPHA, %xmm2 , %xmm2
vfmaddpd -10 * SIZE(Y), ALPHA, %xmm3 , %xmm3
vmovups %xmm0, -16 * SIZE(Y)
vmovups %xmm1, -14 * SIZE(Y)
vmovups %xmm2, -12 * SIZE(Y)
vmovups %xmm3, -10 * SIZE(Y)
addq $8 * SIZE, X
addq $8 * SIZE, Y
ALIGN_3
/* 4-element tail. */
.L14:
movq M, %rax
andq $4, %rax
jle .L15
ALIGN_3
vmovups -16 * SIZE(X), %xmm0
vmovups -14 * SIZE(X), %xmm1
vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0
vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1
vmovups %xmm0, -16 * SIZE(Y)
vmovups %xmm1, -14 * SIZE(Y)
addq $4 * SIZE, X
addq $4 * SIZE, Y
ALIGN_3
/* 2-element tail. */
.L15:
movq M, %rax
andq $2, %rax
jle .L16
ALIGN_3
vmovups -16 * SIZE(X), %xmm0
vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0
vmovups %xmm0, -16 * SIZE(Y)
addq $2 * SIZE, X
addq $2 * SIZE, Y
ALIGN_3
/* Final single element, done with scalar load/FMA/store. */
.L16:
movq M, %rax
andq $1, %rax
jle .L19
ALIGN_3
vmovsd -16 * SIZE(X), %xmm0
vfmaddsd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0
vmovsd %xmm0, -16 * SIZE(Y)
ALIGN_3
/* Unit-stride exit: return 0. */
.L19:
xorq %rax,%rax
RESTOREREGISTERS
ret
ALIGN_3
/* Strided path (INCX or INCY is not one element). YY walks Y for reads while
   Y itself is used for the write-back, so eight Y values can be loaded before
   any result is stored. */
.L40:
movq Y, YY
movq M, %rax
// If incx == 0 or incy == 0, skip the unrolled loop and run the scalar loop
// at .L46 with %rax = M. (With incy == 0 the batched Y loads above the stores
// would presumably read stale values -- the scalar loop updates in order.)
cmpq $0, INCX
je .L46
cmpq $0, INCY
je .L46
/* %rax = number of 8-element groups. */
sarq $3, %rax
jle .L45
prefetchnta 512(X)
prefetchnta 512+64(X)
prefetchnta 512+128(X)
prefetchnta 512+192(X)
prefetchnta 512(Y)
prefetchnta 512+64(Y)
prefetchnta 512+128(Y)
prefetchnta 512+192(Y)
ALIGN_3
/* 8 elements per iteration: pairs of X (and of Y via YY) are gathered into
   the low/high lanes of one register, combined with a packed FMA, and the two
   lanes are scattered back to Y one stride apart. */
.L41:
vmovsd 0 * SIZE(X), %xmm0
addq INCX, X
vmovhpd 0 * SIZE(X), %xmm0 , %xmm0
addq INCX, X
vmovsd 0 * SIZE(YY), %xmm6
addq INCY, YY
vmovhpd 0 * SIZE(YY), %xmm6 , %xmm6
addq INCY, YY
vmovsd 0 * SIZE(X), %xmm1
addq INCX, X
vmovhpd 0 * SIZE(X), %xmm1 , %xmm1
addq INCX, X
vmovsd 0 * SIZE(YY), %xmm7
addq INCY, YY
vmovhpd 0 * SIZE(YY), %xmm7 , %xmm7
addq INCY, YY
vfmaddpd %xmm6 , ALPHA , %xmm0 , %xmm0
vmovsd 0 * SIZE(X), %xmm2
addq INCX, X
vmovhpd 0 * SIZE(X), %xmm2 , %xmm2
addq INCX, X
vmovsd 0 * SIZE(YY), %xmm8
addq INCY, YY
vmovhpd 0 * SIZE(YY), %xmm8 , %xmm8
addq INCY, YY
vfmaddpd %xmm7 , ALPHA , %xmm1 , %xmm1
vmovsd 0 * SIZE(X), %xmm3
addq INCX, X
vmovhpd 0 * SIZE(X), %xmm3 , %xmm3
addq INCX, X
vfmaddpd %xmm8 , ALPHA , %xmm2 , %xmm2
vmovsd 0 * SIZE(YY), %xmm9
addq INCY, YY
vmovhpd 0 * SIZE(YY), %xmm9 , %xmm9
addq INCY, YY
vmovsd %xmm0, 0 * SIZE(Y)
addq INCY, Y
vmovhpd %xmm0, 0 * SIZE(Y)
addq INCY, Y
vmovsd %xmm1, 0 * SIZE(Y)
addq INCY, Y
vmovhpd %xmm1, 0 * SIZE(Y)
addq INCY, Y
vmovsd %xmm2, 0 * SIZE(Y)
addq INCY, Y
vmovhpd %xmm2, 0 * SIZE(Y)
addq INCY, Y
vfmaddpd %xmm9 , ALPHA , %xmm3 , %xmm3
vmovsd %xmm3, 0 * SIZE(Y)
addq INCY, Y
vmovhpd %xmm3, 0 * SIZE(Y)
addq INCY, Y
decq %rax
jg .L41
ALIGN_3
/* Remaining M mod 8 elements. */
.L45:
movq M, %rax
andq $7, %rax
jle .L47
ALIGN_3
/* Scalar loop: one element per iteration, %rax iterations. */
.L46:
vmovsd (X), %xmm0
addq INCX, X
vfmaddsd (Y) , ALPHA , %xmm0 , %xmm0
vmovsd %xmm0, (Y)
addq INCY, Y
decq %rax
jg .L46
ALIGN_3
/* Strided / empty-input exit: return 0. */
.L47:
xorq %rax, %rax
RESTOREREGISTERS
ret
EPILOGUE

View File

@@ -0,0 +1,291 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
/* Register assignment for the COPY kernel. The register names in the trailing
   comments are presumably the non-Windows ones; ARG1..ARG5 themselves come
   from common.h. */
#define M ARG1 /* rdi */
#define X ARG2 /* rsi */
#define INCX ARG3 /* rdx */
#define Y ARG4 /* rcx */
#ifndef WINDOWS_ABI
#define INCY ARG5 /* r8 */
#else
/* Windows: INCY is loaded from the stack into %r10 by the prologue. */
#define INCY %r10
#endif
#include "l1param.h"
/* Unaligned 16-byte load of OFFSET(ADDR) into REG. */
#define VLOAD(OFFSET, ADDR, REG) vmovups OFFSET(ADDR), REG
/* NOTE(review): VSHUFPD_1 is not referenced anywhere in this kernel. */
#define VSHUFPD_1(REG1 , REG2) vshufpd $0x01, REG1, REG2, REG2
/* Prefetch distances, used as byte displacements from X (A_PRE) and Y (B_PRE). */
#define A_PRE 640
#define B_PRE 640
/* COPY kernel entry: Y[i] = X[i] for i in [0, M).
   In:  M (element count), X/INCX (source), Y/INCY (destination).
   Out: %rax = 0.
   This section loads INCY on Windows, converts the increments to byte
   strides, rejects empty input, dispatches to the strided path (.L40) unless
   both strides are one element, and peels a single element when X is not
   16-byte aligned. */
PROLOGUE
PROFCODE
#ifdef WINDOWS_ABI
movq 40(%rsp), INCY
#endif
SAVEREGISTERS
/* Element increments -> byte strides. */
leaq (, INCX, SIZE), INCX
leaq (, INCY, SIZE), INCY
/* Nothing to copy for M <= 0. Without this guard the alignment peel below
   would still move one element when M == 0 and X is misaligned, and a
   negative M could hit the bit-tested tails. The AXPY and DOT kernels perform
   the same early exit. */
testq M, M
jle .L47
/* Any non-unit stride takes the strided path. */
cmpq $SIZE, INCX
jne .L40
cmpq $SIZE, INCY
jne .L40
/* If X is not 16-byte aligned, copy one element so it becomes aligned. */
testq $SIZE, X
je .L10
vmovsd (X), %xmm0
vmovsd %xmm0, (Y)
addq $1 * SIZE, X
addq $1 * SIZE, Y
decq M
jle .L19
ALIGN_4
/* Unit-stride path. X and Y are advanced by 16 elements up front so that the
   current 16-element chunk is addressed with offsets -16..-2 and the next
   chunk with offsets 0..14. */
.L10:
subq $-16 * SIZE, X
subq $-16 * SIZE, Y
/* %rax = number of full 16-element chunks. */
movq M, %rax
sarq $4, %rax
jle .L13
/* Preload the whole first chunk into xmm0-xmm7. */
vmovups -16 * SIZE(X), %xmm0
vmovups -14 * SIZE(X), %xmm1
vmovups -12 * SIZE(X), %xmm2
vmovups -10 * SIZE(X), %xmm3
vmovups -8 * SIZE(X), %xmm4
vmovups -6 * SIZE(X), %xmm5
vmovups -4 * SIZE(X), %xmm6
vmovups -2 * SIZE(X), %xmm7
decq %rax
jle .L12
ALIGN_4
/* Main loop: store the chunk held in xmm0-xmm7 to Y while loading the next
   chunk of X into the same registers, then advance both pointers. */
.L11:
prefetchnta A_PRE(X)
nop
vmovups %xmm0, -16 * SIZE(Y)
vmovups %xmm1, -14 * SIZE(Y)
prefetchnta B_PRE(Y)
nop
vmovups %xmm2, -12 * SIZE(Y)
vmovups %xmm3, -10 * SIZE(Y)
VLOAD( 0 * SIZE, X, %xmm0)
VLOAD( 2 * SIZE, X, %xmm1)
VLOAD( 4 * SIZE, X, %xmm2)
VLOAD( 6 * SIZE, X, %xmm3)
prefetchnta A_PRE+64(X)
nop
vmovups %xmm4, -8 * SIZE(Y)
vmovups %xmm5, -6 * SIZE(Y)
prefetchnta B_PRE+64(Y)
nop
vmovups %xmm6, -4 * SIZE(Y)
vmovups %xmm7, -2 * SIZE(Y)
VLOAD( 8 * SIZE, X, %xmm4)
VLOAD(10 * SIZE, X, %xmm5)
subq $-16 * SIZE, Y
VLOAD(12 * SIZE, X, %xmm6)
VLOAD(14 * SIZE, X, %xmm7)
subq $-16 * SIZE, X
decq %rax
jg .L11
ALIGN_3
/* Drain: store the last preloaded chunk. */
.L12:
vmovups %xmm0, -16 * SIZE(Y)
vmovups %xmm1, -14 * SIZE(Y)
vmovups %xmm2, -12 * SIZE(Y)
vmovups %xmm3, -10 * SIZE(Y)
vmovups %xmm4, -8 * SIZE(Y)
vmovups %xmm5, -6 * SIZE(Y)
vmovups %xmm6, -4 * SIZE(Y)
vmovups %xmm7, -2 * SIZE(Y)
subq $-16 * SIZE, Y
subq $-16 * SIZE, X
ALIGN_3
/* Tails: the low four bits of M select an 8-, 4-, 2- and 1-element copy.
   testq with a single-bit mask never sets SF, so each jle is taken exactly
   when the tested bit is clear. */
.L13:
testq $8, M
jle .L14
ALIGN_3
vmovups -16 * SIZE(X), %xmm0
vmovups -14 * SIZE(X), %xmm1
vmovups -12 * SIZE(X), %xmm2
vmovups -10 * SIZE(X), %xmm3
vmovups %xmm0, -16 * SIZE(Y)
vmovups %xmm1, -14 * SIZE(Y)
vmovups %xmm2, -12 * SIZE(Y)
vmovups %xmm3, -10 * SIZE(Y)
addq $8 * SIZE, X
addq $8 * SIZE, Y
ALIGN_3
/* 4-element tail. */
.L14:
testq $4, M
jle .L15
ALIGN_3
vmovups -16 * SIZE(X), %xmm0
vmovups -14 * SIZE(X), %xmm1
vmovups %xmm0, -16 * SIZE(Y)
vmovups %xmm1, -14 * SIZE(Y)
addq $4 * SIZE, X
addq $4 * SIZE, Y
ALIGN_3
/* 2-element tail. */
.L15:
testq $2, M
jle .L16
ALIGN_3
vmovups -16 * SIZE(X), %xmm0
vmovups %xmm0, -16 * SIZE(Y)
addq $2 * SIZE, X
addq $2 * SIZE, Y
ALIGN_3
/* Final single element. */
.L16:
testq $1, M
jle .L19
ALIGN_3
vmovsd -16 * SIZE(X), %xmm0
vmovsd %xmm0, -16 * SIZE(Y)
ALIGN_3
/* Unit-stride exit: return 0. */
.L19:
xorq %rax,%rax
RESTOREREGISTERS
ret
ALIGN_3
/* Strided path (INCX or INCY is not one element).
   %rax = number of 8-element groups. */
.L40:
movq M, %rax
sarq $3, %rax
jle .L45
ALIGN_3
/* 8 elements per iteration: all eight loads from X are issued first, then
   the eight stores to Y. */
.L41:
vmovsd (X), %xmm0
addq INCX, X
vmovsd (X), %xmm4
addq INCX, X
vmovsd (X), %xmm1
addq INCX, X
vmovsd (X), %xmm5
addq INCX, X
vmovsd (X), %xmm2
addq INCX, X
vmovsd (X), %xmm6
addq INCX, X
vmovsd (X), %xmm3
addq INCX, X
vmovsd (X), %xmm7
addq INCX, X
vmovsd %xmm0, (Y)
addq INCY, Y
vmovsd %xmm4, (Y)
addq INCY, Y
vmovsd %xmm1, (Y)
addq INCY, Y
vmovsd %xmm5, (Y)
addq INCY, Y
vmovsd %xmm2, (Y)
addq INCY, Y
vmovsd %xmm6, (Y)
addq INCY, Y
vmovsd %xmm3, (Y)
addq INCY, Y
vmovsd %xmm7, (Y)
addq INCY, Y
decq %rax
jg .L41
ALIGN_3
/* Remaining M mod 8 elements. */
.L45:
movq M, %rax
andq $7, %rax
jle .L47
ALIGN_3
/* Scalar loop: one element per iteration. */
.L46:
vmovsd (X), %xmm0
addq INCX, X
vmovsd %xmm0, (Y)
addq INCY, Y
decq %rax
jg .L46
ALIGN_3
/* Strided exit: return 0. */
.L47:
xorq %rax, %rax
RESTOREREGISTERS
ret
EPILOGUE

View File

@@ -0,0 +1,311 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
/* Register assignment for the DOT kernel. The register names in the trailing
   comments are presumably the non-Windows ones; ARG1..ARG5 themselves come
   from common.h. */
#define N ARG1 /* rdi */
#define X ARG2 /* rsi */
#define INCX ARG3 /* rdx */
#define Y ARG4 /* rcx */
#ifndef WINDOWS_ABI
#define INCY ARG5 /* r8 */
#else
/* Windows: INCY is loaded from the stack into %r10 by the prologue. */
#define INCY %r10
#endif
/* Prefetch distance, used as a byte displacement from X and Y. */
#define A_PRE 512
#include "l1param.h"
/* DOT kernel entry: returns sum of X[i] * Y[i] for i in [0, N) in the low
   lane of %xmm0.
   This section loads INCY on Windows, converts the increments to byte
   strides, clears the four accumulators xmm0-xmm3, exits early for N <= 0,
   dispatches to the strided path (.L50) unless both strides are one element,
   and peels a single element when Y is not 16-byte aligned. */
PROLOGUE
PROFCODE
#ifdef WINDOWS_ABI
movq 40(%rsp), INCY
#endif
SAVEREGISTERS
/* Element increments -> byte strides. */
leaq (, INCX, SIZE), INCX
leaq (, INCY, SIZE), INCY
/* Four independent packed accumulators, summed together at .L999. */
vxorps %xmm0, %xmm0 , %xmm0
vxorps %xmm1, %xmm1 , %xmm1
vxorps %xmm2, %xmm2 , %xmm2
vxorps %xmm3, %xmm3 , %xmm3
/* N <= 0: the reduction at .L999 then yields 0. */
cmpq $0, N
jle .L999
cmpq $SIZE, INCX
jne .L50
cmpq $SIZE, INCY
jne .L50
/* Unit stride: bias both pointers by 16 elements so the loops below can use
   offsets -16..-2 for the current chunk and 0..14 for the next one. */
subq $-16 * SIZE, X
subq $-16 * SIZE, Y
/* If Y (after the bias, which keeps 16-byte alignment unchanged) is not
   16-byte aligned, fold one product into the low lane of xmm0 first. */
testq $SIZE, Y
je .L10
vmovsd -16 * SIZE(X), %xmm0
vmulsd -16 * SIZE(Y), %xmm0 , %xmm0
addq $1 * SIZE, X
addq $1 * SIZE, Y
decq N
ALIGN_2
/* Unit-stride path. %rax = number of full 16-element chunks. */
.L10:
movq N, %rax
sarq $4, %rax
jle .L14
/* Preload the whole first chunk of X into xmm4-xmm11. */
vmovups -16 * SIZE(X), %xmm4
vmovups -14 * SIZE(X), %xmm5
vmovups -12 * SIZE(X), %xmm6
vmovups -10 * SIZE(X), %xmm7
vmovups -8 * SIZE(X), %xmm8
vmovups -6 * SIZE(X), %xmm9
vmovups -4 * SIZE(X), %xmm10
vmovups -2 * SIZE(X), %xmm11
decq %rax
jle .L12
ALIGN_3
/* Main loop: each vfmaddpd adds X*Y (Y read from memory) into one of the four
   accumulators; the next chunk of X is loaded into xmm4-xmm11 (offsets
   0..14) before the pointers advance. */
.L11:
prefetchnta A_PRE(Y)
vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0
vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1
prefetchnta A_PRE(X)
vfmaddpd %xmm2 , -12 * SIZE(Y), %xmm6 , %xmm2
vfmaddpd %xmm3 , -10 * SIZE(Y), %xmm7 , %xmm3
vmovups 0 * SIZE(X), %xmm4
vfmaddpd %xmm0 , -8 * SIZE(Y), %xmm8 , %xmm0
vfmaddpd %xmm1 , -6 * SIZE(Y), %xmm9 , %xmm1
vmovups 2 * SIZE(X), %xmm5
vmovups 4 * SIZE(X), %xmm6
vfmaddpd %xmm2 , -4 * SIZE(Y), %xmm10, %xmm2
vfmaddpd %xmm3 , -2 * SIZE(Y), %xmm11, %xmm3
vmovups 6 * SIZE(X), %xmm7
prefetchnta A_PRE+64(Y)
vmovups 8 * SIZE(X), %xmm8
vmovups 10 * SIZE(X), %xmm9
prefetchnta A_PRE+64(X)
vmovups 12 * SIZE(X), %xmm10
vmovups 14 * SIZE(X), %xmm11
subq $-16 * SIZE, X
subq $-16 * SIZE, Y
decq %rax
jg .L11
ALIGN_3
/* Drain: accumulate the last preloaded chunk. */
.L12:
vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0
vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1
vfmaddpd %xmm2 , -12 * SIZE(Y), %xmm6 , %xmm2
vfmaddpd %xmm3 , -10 * SIZE(Y), %xmm7 , %xmm3
vfmaddpd %xmm0 , -8 * SIZE(Y), %xmm8 , %xmm0
vfmaddpd %xmm1 , -6 * SIZE(Y), %xmm9 , %xmm1
vfmaddpd %xmm2 , -4 * SIZE(Y), %xmm10, %xmm2
vfmaddpd %xmm3 , -2 * SIZE(Y), %xmm11, %xmm3
subq $-16 * SIZE, X
subq $-16 * SIZE, Y
ALIGN_3
/* Tails: skip straight to the reduction when N mod 16 == 0, otherwise the
   low four bits of N select an 8-, 4-, 2- and 1-element step. */
.L14:
testq $15, N
jle .L999
testq $8, N
jle .L15
vmovups -16 * SIZE(X), %xmm4
vmovups -14 * SIZE(X), %xmm5
vmovups -12 * SIZE(X), %xmm6
vmovups -10 * SIZE(X), %xmm7
vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0
vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1
vfmaddpd %xmm2 , -12 * SIZE(Y), %xmm6 , %xmm2
vfmaddpd %xmm3 , -10 * SIZE(Y), %xmm7 , %xmm3
addq $8 * SIZE, X
addq $8 * SIZE, Y
ALIGN_3
/* 4-element tail. */
.L15:
testq $4, N
jle .L16
vmovups -16 * SIZE(X), %xmm4
vmovups -14 * SIZE(X), %xmm5
vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0
vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1
addq $4 * SIZE, X
addq $4 * SIZE, Y
ALIGN_3
/* 2-element tail. */
.L16:
testq $2, N
jle .L17
vmovups -16 * SIZE(X), %xmm4
vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0
addq $2 * SIZE, X
addq $2 * SIZE, Y
ALIGN_3
/* Final single element. The vmovsd loads clear the upper lane of xmm4/xmm5,
   so the packed FMA adds 0 to the upper lane of the accumulator. */
.L17:
testq $1, N
jle .L999
vmovsd -16 * SIZE(X), %xmm4
vmovsd -16 * SIZE(Y), %xmm5
vfmaddpd %xmm0, %xmm4 , %xmm5 , %xmm0
jmp .L999
ALIGN_3
/* Strided path (INCX or INCY is not one element); X and Y are not biased
   here. %rax = number of 8-element groups. */
.L50:
movq N, %rax
sarq $3, %rax
jle .L55
ALIGN_3
/* 8 elements per iteration in two batches of four. Each element is loaded
   with vmovsd (upper lane cleared), so the packed FMAs only change the low
   lane of the accumulators xmm0-xmm3. */
.L53:
vmovsd 0 * SIZE(X), %xmm4
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm8
addq INCY, Y
vmovsd 0 * SIZE(X), %xmm5
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm9
addq INCY, Y
vmovsd 0 * SIZE(X), %xmm6
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm10
addq INCY, Y
vmovsd 0 * SIZE(X), %xmm7
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm11
addq INCY, Y
vfmaddpd %xmm0 , %xmm4 , %xmm8 , %xmm0
vfmaddpd %xmm1 , %xmm5 , %xmm9 , %xmm1
vfmaddpd %xmm2 , %xmm6 , %xmm10, %xmm2
vfmaddpd %xmm3 , %xmm7 , %xmm11, %xmm3
vmovsd 0 * SIZE(X), %xmm4
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm8
addq INCY, Y
vmovsd 0 * SIZE(X), %xmm5
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm9
addq INCY, Y
vmovsd 0 * SIZE(X), %xmm6
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm10
addq INCY, Y
vmovsd 0 * SIZE(X), %xmm7
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm11
addq INCY, Y
vfmaddpd %xmm0 , %xmm4 , %xmm8 , %xmm0
vfmaddpd %xmm1 , %xmm5 , %xmm9 , %xmm1
vfmaddpd %xmm2 , %xmm6 , %xmm10, %xmm2
vfmaddpd %xmm3 , %xmm7 , %xmm11, %xmm3
decq %rax
jg .L53
ALIGN_3
/* Remaining N mod 8 elements. */
.L55:
movq N, %rax
andq $7, %rax
jle .L999
ALIGN_3
/* Scalar loop: one product per iteration, accumulated into xmm0. */
.L56:
vmovsd 0 * SIZE(X), %xmm4
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm8
addq INCY, Y
vfmaddpd %xmm0 , %xmm4 , %xmm8 , %xmm0
decq %rax
jg .L56
ALIGN_3
/* Common exit: add the four accumulators together, then add the two lanes
   horizontally so the result ends up in xmm0. */
.L999:
vaddpd %xmm1, %xmm0 , %xmm0
vaddpd %xmm3, %xmm2 , %xmm2
vaddpd %xmm2, %xmm0 , %xmm0
vhaddpd %xmm0, %xmm0 , %xmm0
RESTOREREGISTERS
ret
EPILOGUE

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,667 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
#define VMOVUPS_A1(OFF, ADDR, REGS) vmovups OFF(ADDR), REGS
#define VMOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) vmovups OFF(ADDR, BASE, SCALE), REGS
#define A_PRE 256
/* Register assignment for the transposed-copy routine. The trailing comments
   name the physical register behind each ARGn. */
#ifndef WINDOWS_ABI
#define N ARG1 /* rdi */
#define M ARG2 /* rsi */
#define A ARG3 /* rdx */
#define LDA ARG4 /* rcx */
#define B ARG5 /* r8 */
#define AO1 %r9
#define AO2 %r10
#define LDA3 %r11
#define M8 %r12
#else
#define N ARG1 /* rcx */
#define M ARG2 /* rdx */
#define A ARG3 /* r8 */
#define LDA ARG4 /* r9 */
/* B is passed on the stack. The extra 56 bytes match the seven registers the
   prologue pushes on Windows before B is loaded (rdi, rsi, r15, r14, r13,
   r12, rbp). */
#define OLD_B 40 + 56(%rsp)
#define B %r12
#define AO1 %rsi
#define AO2 %rdi
#define LDA3 %r10
#define M8 %r11
#endif
#define I %rax
#define B0 %rbp
#define B1 %r13
#define B2 %r14
#define B3 %r15
PROLOGUE
PROFCODE
#ifdef WINDOWS_ABI
pushq %rdi
pushq %rsi
#endif
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbp
#ifdef WINDOWS_ABI
movq OLD_B, B
#endif
subq $-16 * SIZE, B
movq M, B1
movq M, B2
movq M, B3
andq $-8, B1
andq $-4, B2
andq $-2, B3
imulq N, B1
imulq N, B2
imulq N, B3
leaq (B, B1, SIZE), B1
leaq (B, B2, SIZE), B2
leaq (B, B3, SIZE), B3
leaq (,LDA, SIZE), LDA
leaq (LDA, LDA, 2), LDA3
leaq (, N, SIZE), M8
cmpq $8, N
jl .L20
ALIGN_4
.L11:
subq $8, N
movq A, AO1
leaq (A, LDA, 4), AO2
leaq (A, LDA, 8), A
movq B, B0
addq $64 * SIZE, B
movq M, I
sarq $3, I
jle .L14
ALIGN_4
.L13:
prefetchnta A_PRE(AO1)
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
VMOVUPS_A1(4 * SIZE, AO1, %xmm2)
VMOVUPS_A1(6 * SIZE, AO1, %xmm3)
vmovups %xmm0, -16 * SIZE(B0)
vmovups %xmm1, -14 * SIZE(B0)
vmovups %xmm2, -12 * SIZE(B0)
vmovups %xmm3, -10 * SIZE(B0)
prefetchnta A_PRE(AO1, LDA, 1)
VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm0)
VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm1)
VMOVUPS_A2(4 * SIZE, AO1, LDA, 1, %xmm2)
VMOVUPS_A2(6 * SIZE, AO1, LDA, 1, %xmm3)
vmovups %xmm0, -8 * SIZE(B0)
vmovups %xmm1, -6 * SIZE(B0)
vmovups %xmm2, -4 * SIZE(B0)
vmovups %xmm3, -2 * SIZE(B0)
prefetchnta A_PRE(AO1, LDA, 2)
VMOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm0)
VMOVUPS_A2(2 * SIZE, AO1, LDA, 2, %xmm1)
VMOVUPS_A2(4 * SIZE, AO1, LDA, 2, %xmm2)
VMOVUPS_A2(6 * SIZE, AO1, LDA, 2, %xmm3)
vmovups %xmm0, 0 * SIZE(B0)
vmovups %xmm1, 2 * SIZE(B0)
vmovups %xmm2, 4 * SIZE(B0)
vmovups %xmm3, 6 * SIZE(B0)
prefetchnta A_PRE(AO1, LDA3, 1)
VMOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm0)
VMOVUPS_A2(2 * SIZE, AO1, LDA3, 1, %xmm1)
VMOVUPS_A2(4 * SIZE, AO1, LDA3, 1, %xmm2)
VMOVUPS_A2(6 * SIZE, AO1, LDA3, 1, %xmm3)
vmovups %xmm0, 8 * SIZE(B0)
vmovups %xmm1, 10 * SIZE(B0)
vmovups %xmm2, 12 * SIZE(B0)
vmovups %xmm3, 14 * SIZE(B0)
prefetchnta A_PRE(AO2)
VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
VMOVUPS_A1(2 * SIZE, AO2, %xmm1)
VMOVUPS_A1(4 * SIZE, AO2, %xmm2)
VMOVUPS_A1(6 * SIZE, AO2, %xmm3)
vmovups %xmm0, 16 * SIZE(B0)
vmovups %xmm1, 18 * SIZE(B0)
vmovups %xmm2, 20 * SIZE(B0)
vmovups %xmm3, 22 * SIZE(B0)
prefetchnta A_PRE(AO2, LDA, 1)
VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm0)
VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm1)
VMOVUPS_A2(4 * SIZE, AO2, LDA, 1, %xmm2)
VMOVUPS_A2(6 * SIZE, AO2, LDA, 1, %xmm3)
vmovups %xmm0, 24 * SIZE(B0)
vmovups %xmm1, 26 * SIZE(B0)
vmovups %xmm2, 28 * SIZE(B0)
vmovups %xmm3, 30 * SIZE(B0)
prefetchnta A_PRE(AO2, LDA, 2)
VMOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm0)
VMOVUPS_A2(2 * SIZE, AO2, LDA, 2, %xmm1)
VMOVUPS_A2(4 * SIZE, AO2, LDA, 2, %xmm2)
VMOVUPS_A2(6 * SIZE, AO2, LDA, 2, %xmm3)
vmovups %xmm0, 32 * SIZE(B0)
vmovups %xmm1, 34 * SIZE(B0)
vmovups %xmm2, 36 * SIZE(B0)
vmovups %xmm3, 38 * SIZE(B0)
prefetchnta A_PRE(AO2, LDA3, 1)
VMOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm0)
VMOVUPS_A2(2 * SIZE, AO2, LDA3, 1, %xmm1)
VMOVUPS_A2(4 * SIZE, AO2, LDA3, 1, %xmm2)
VMOVUPS_A2(6 * SIZE, AO2, LDA3, 1, %xmm3)
vmovups %xmm0, 40 * SIZE(B0)
vmovups %xmm1, 42 * SIZE(B0)
vmovups %xmm2, 44 * SIZE(B0)
vmovups %xmm3, 46 * SIZE(B0)
addq $8 * SIZE, AO1
addq $8 * SIZE, AO2
leaq (B0, M8, 8), B0
decq I
jg .L13
ALIGN_4
.L14:
testq $4, M
jle .L16
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm2)
VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm3)
vmovups %xmm0, -16 * SIZE(B1)
vmovups %xmm1, -14 * SIZE(B1)
vmovups %xmm2, -12 * SIZE(B1)
vmovups %xmm3, -10 * SIZE(B1)
VMOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm0)
VMOVUPS_A2(2 * SIZE, AO1, LDA, 2, %xmm1)
VMOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm2)
VMOVUPS_A2(2 * SIZE, AO1, LDA3, 1, %xmm3)
vmovups %xmm0, -8 * SIZE(B1)
vmovups %xmm1, -6 * SIZE(B1)
vmovups %xmm2, -4 * SIZE(B1)
vmovups %xmm3, -2 * SIZE(B1)
VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
VMOVUPS_A1(2 * SIZE, AO2, %xmm1)
VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm2)
VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm3)
vmovups %xmm0, 0 * SIZE(B1)
vmovups %xmm1, 2 * SIZE(B1)
vmovups %xmm2, 4 * SIZE(B1)
vmovups %xmm3, 6 * SIZE(B1)
VMOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm0)
VMOVUPS_A2(2 * SIZE, AO2, LDA, 2, %xmm1)
VMOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm2)
VMOVUPS_A2(2 * SIZE, AO2, LDA3, 1, %xmm3)
vmovups %xmm0, 8 * SIZE(B1)
vmovups %xmm1, 10 * SIZE(B1)
vmovups %xmm2, 12 * SIZE(B1)
vmovups %xmm3, 14 * SIZE(B1)
addq $4 * SIZE, AO1
addq $4 * SIZE, AO2
subq $-32 * SIZE, B1
ALIGN_4
.L16:
testq $2, M
jle .L18
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm1)
VMOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm2)
VMOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm3)
vmovups %xmm0, -16 * SIZE(B2)
vmovups %xmm1, -14 * SIZE(B2)
vmovups %xmm2, -12 * SIZE(B2)
vmovups %xmm3, -10 * SIZE(B2)
VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm1)
VMOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm2)
VMOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm3)
vmovups %xmm0, -8 * SIZE(B2)
vmovups %xmm1, -6 * SIZE(B2)
vmovups %xmm2, -4 * SIZE(B2)
vmovups %xmm3, -2 * SIZE(B2)
addq $2 * SIZE, AO1
addq $2 * SIZE, AO2
subq $-16 * SIZE, B2
ALIGN_4
.L18:
testq $1, M
jle .L19
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd 0 * SIZE(AO1, LDA), %xmm1
vmovsd 0 * SIZE(AO1, LDA, 2), %xmm2
vmovsd 0 * SIZE(AO1, LDA3), %xmm3
vunpcklpd %xmm1, %xmm0 , %xmm0
vunpcklpd %xmm3, %xmm2 , %xmm2
vmovups %xmm0, -16 * SIZE(B3)
vmovups %xmm2, -14 * SIZE(B3)
vmovsd 0 * SIZE(AO2), %xmm0
vmovsd 0 * SIZE(AO2, LDA), %xmm1
vmovsd 0 * SIZE(AO2, LDA, 2), %xmm2
vmovsd 0 * SIZE(AO2, LDA3), %xmm3
vunpcklpd %xmm1, %xmm0 , %xmm0
vunpcklpd %xmm3, %xmm2 , %xmm2
vmovups %xmm0, -12 * SIZE(B3)
vmovups %xmm2, -10 * SIZE(B3)
subq $-8 * SIZE, B3
ALIGN_4
.L19:
cmpq $8, N
jge .L11
ALIGN_4
.L20:
cmpq $4, N
jl .L30
subq $4, N
movq A, AO1
leaq (A, LDA, 2), AO2
leaq (A, LDA, 4), A
movq B, B0
addq $32 * SIZE, B
movq M, I
sarq $3, I
jle .L24
ALIGN_4
.L23:
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
VMOVUPS_A1(4 * SIZE, AO1, %xmm2)
VMOVUPS_A1(6 * SIZE, AO1, %xmm3)
vmovups %xmm0, -16 * SIZE(B0)
vmovups %xmm1, -14 * SIZE(B0)
vmovups %xmm2, -12 * SIZE(B0)
vmovups %xmm3, -10 * SIZE(B0)
VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm0)
VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm1)
VMOVUPS_A2(4 * SIZE, AO1, LDA, 1, %xmm2)
VMOVUPS_A2(6 * SIZE, AO1, LDA, 1, %xmm3)
vmovups %xmm0, -8 * SIZE(B0)
vmovups %xmm1, -6 * SIZE(B0)
vmovups %xmm2, -4 * SIZE(B0)
vmovups %xmm3, -2 * SIZE(B0)
VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
VMOVUPS_A1(2 * SIZE, AO2, %xmm1)
VMOVUPS_A1(4 * SIZE, AO2, %xmm2)
VMOVUPS_A1(6 * SIZE, AO2, %xmm3)
vmovups %xmm0, 0 * SIZE(B0)
vmovups %xmm1, 2 * SIZE(B0)
vmovups %xmm2, 4 * SIZE(B0)
vmovups %xmm3, 6 * SIZE(B0)
VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm0)
VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm1)
VMOVUPS_A2(4 * SIZE, AO2, LDA, 1, %xmm2)
VMOVUPS_A2(6 * SIZE, AO2, LDA, 1, %xmm3)
vmovups %xmm0, 8 * SIZE(B0)
vmovups %xmm1, 10 * SIZE(B0)
vmovups %xmm2, 12 * SIZE(B0)
vmovups %xmm3, 14 * SIZE(B0)
addq $8 * SIZE, AO1
addq $8 * SIZE, AO2
leaq (B0, M8, 8), B0
decq I
jg .L23
ALIGN_4
.L24:
testq $4, M
jle .L26
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm2)
VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm3)
vmovups %xmm0, -16 * SIZE(B1)
vmovups %xmm1, -14 * SIZE(B1)
vmovups %xmm2, -12 * SIZE(B1)
vmovups %xmm3, -10 * SIZE(B1)
VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
VMOVUPS_A1(2 * SIZE, AO2, %xmm1)
VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm2)
VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm3)
vmovups %xmm0, -8 * SIZE(B1)
vmovups %xmm1, -6 * SIZE(B1)
vmovups %xmm2, -4 * SIZE(B1)
vmovups %xmm3, -2 * SIZE(B1)
addq $4 * SIZE, AO1
addq $4 * SIZE, AO2
subq $-16 * SIZE, B1
ALIGN_4
.L26:
testq $2, M
jle .L28
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm1)
VMOVUPS_A1(0 * SIZE, AO2, %xmm2)
VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm3)
vmovups %xmm0, -16 * SIZE(B2)
vmovups %xmm1, -14 * SIZE(B2)
vmovups %xmm2, -12 * SIZE(B2)
vmovups %xmm3, -10 * SIZE(B2)
addq $2 * SIZE, AO1
addq $2 * SIZE, AO2
subq $-8 * SIZE, B2
ALIGN_4
.L28:
testq $1, M
jle .L30
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd 0 * SIZE(AO1, LDA), %xmm1
vmovsd 0 * SIZE(AO2), %xmm2
vmovsd 0 * SIZE(AO2, LDA), %xmm3
vunpcklpd %xmm1, %xmm0, %xmm0
vunpcklpd %xmm3, %xmm2, %xmm2
vmovups %xmm0, -16 * SIZE(B3)
vmovups %xmm2, -14 * SIZE(B3)
subq $-4 * SIZE, B3
ALIGN_4
.L30:
cmpq $2, N
jl .L40
subq $2, N
movq A, AO1
leaq (A, LDA), AO2
leaq (A, LDA, 2), A
movq B, B0
addq $16 * SIZE, B
movq M, I
sarq $3, I
jle .L34
ALIGN_4
.L33:
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
VMOVUPS_A1(4 * SIZE, AO1, %xmm2)
VMOVUPS_A1(6 * SIZE, AO1, %xmm3)
vmovups %xmm0, -16 * SIZE(B0)
vmovups %xmm1, -14 * SIZE(B0)
vmovups %xmm2, -12 * SIZE(B0)
vmovups %xmm3, -10 * SIZE(B0)
VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
VMOVUPS_A1(2 * SIZE, AO2, %xmm1)
VMOVUPS_A1(4 * SIZE, AO2, %xmm2)
VMOVUPS_A1(6 * SIZE, AO2, %xmm3)
vmovups %xmm0, -8 * SIZE(B0)
vmovups %xmm1, -6 * SIZE(B0)
vmovups %xmm2, -4 * SIZE(B0)
vmovups %xmm3, -2 * SIZE(B0)
addq $8 * SIZE, AO1
addq $8 * SIZE, AO2
leaq (B0, M8, 8), B0
decq I
jg .L33
ALIGN_4
.L34:
testq $4, M
jle .L36
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
VMOVUPS_A1(0 * SIZE, AO2, %xmm2)
VMOVUPS_A1(2 * SIZE, AO2, %xmm3)
vmovups %xmm0, -16 * SIZE(B1)
vmovups %xmm1, -14 * SIZE(B1)
vmovups %xmm2, -12 * SIZE(B1)
vmovups %xmm3, -10 * SIZE(B1)
addq $4 * SIZE, AO1
addq $4 * SIZE, AO2
subq $-8 * SIZE, B1
ALIGN_4
.L36:
testq $2, M
jle .L38
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(0 * SIZE, AO2, %xmm1)
vmovups %xmm0, -16 * SIZE(B2)
vmovups %xmm1, -14 * SIZE(B2)
addq $2 * SIZE, AO1
addq $2 * SIZE, AO2
subq $-4 * SIZE, B2
ALIGN_4
.L38:
testq $1, M
jle .L40
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd 0 * SIZE(AO2), %xmm1
vunpcklpd %xmm1, %xmm0, %xmm0
vmovups %xmm0, -16 * SIZE(B3)
subq $-2 * SIZE, B3
ALIGN_4
.L40:
cmpq $1, N
jl .L999
movq A, AO1
movq B, B0
movq M, I
sarq $3, I
jle .L44
ALIGN_4
.L43:
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
VMOVUPS_A1(4 * SIZE, AO1, %xmm2)
VMOVUPS_A1(6 * SIZE, AO1, %xmm3)
vmovups %xmm0, -16 * SIZE(B0)
vmovups %xmm1, -14 * SIZE(B0)
vmovups %xmm2, -12 * SIZE(B0)
vmovups %xmm3, -10 * SIZE(B0)
addq $8 * SIZE, AO1
leaq (B0, M8, 8), B0
decq I
jg .L43
ALIGN_4
.L44:
testq $4, M
jle .L45
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
vmovups %xmm0, -16 * SIZE(B1)
vmovups %xmm1, -14 * SIZE(B1)
addq $4 * SIZE, AO1
subq $-4 * SIZE, B1
ALIGN_4
.L45:
testq $2, M
jle .L46
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
vmovups %xmm0, -16 * SIZE(B2)
addq $2 * SIZE, AO1
subq $-2 * SIZE, B2
ALIGN_4
.L46:
testq $1, M
jle .L999
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd %xmm0, -16 * SIZE(B3)
jmp .L999
ALIGN_4
.L999:
popq %rbp
popq %r12
popq %r13
popq %r14
popq %r15
#ifdef WINDOWS_ABI
popq %rsi
popq %rdi
#endif
ret
EPILOGUE

View File

@@ -47,7 +47,7 @@
#ifndef WINDOWS_ABI
#define STACKSIZE 64
#define STACKSIZE 128
#define OLD_M %rdi
#define OLD_N %rsi
@@ -59,9 +59,14 @@
#define STACK_BUFFER 32 + STACKSIZE(%rsp)
#define ALPHA 48 (%rsp)
#define MMM 56(%rsp)
#define NN 64(%rsp)
#define AA 72(%rsp)
#define LDAX 80(%rsp)
#define XX 88(%rsp)
#else
#define STACKSIZE 256
#define STACKSIZE 288
#define OLD_M %rcx
#define OLD_N %rdx
@@ -74,6 +79,12 @@
#define STACK_BUFFER 88 + STACKSIZE(%rsp)
#define ALPHA 224 (%rsp)
#define MMM 232(%rsp)
#define NN 240(%rsp)
#define AA 248(%rsp)
#define LDAX 256(%rsp)
#define XX 264(%rsp)
#endif
#define LDA %r8
@@ -137,17 +148,42 @@
movq OLD_LDA, LDA
#endif
movq STACK_INCX, INCX
movq STACK_Y, Y
movq STACK_INCY, INCY
movq STACK_BUFFER, BUFFER
#ifndef WINDOWS_ABI
movsd %xmm0, ALPHA
#else
movsd %xmm3, ALPHA
#endif
movq STACK_Y, Y
movq A,AA
movq N,NN
movq M,MMM
movq LDA,LDAX
movq X,XX
.L0t:
xorq I,I
addq $1,I
salq $21,I
subq I,MMM
movq I,M
jge .L00t
movq MMM,M
addq I,M
jle .L999x
.L00t:
movq XX,X
movq AA,A
movq NN,N
movq LDAX,LDA
movq STACK_INCX, INCX
movq STACK_INCY, INCY
movq STACK_BUFFER, BUFFER
leaq -1(INCY), %rax
leaq (,INCX, SIZE), INCX
@@ -2815,6 +2851,12 @@
ALIGN_3
.L999:
leaq (, M, SIZE), %rax
addq %rax,AA
jmp .L0t
ALIGN_4
.L999x:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
movq 16(%rsp), %r12

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,360 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
/* Symbolic names for the operands of the copy routine below.            */
/* M, N, A, LDA and B are the incoming arguments; I and J are loop       */
/* counters; AO1/AO2 are the source pointers that walk A.                */
#ifndef WINDOWS_ABI
/* Non-Windows build: all five arguments arrive in registers. */
#define M ARG1 /* rdi */
#define N ARG2 /* rsi */
#define A ARG3 /* rdx */
#define LDA ARG4 /* rcx */
#define B ARG5 /* r8 */
#define I %r9
#else
/* Windows build: size of the local frame; the routine stores            */
/* xmm6-xmm15 at offsets 0..159 of it.                                   */
#define STACKSIZE 256
#define M ARG1 /* rcx */
#define N ARG2 /* rdx */
#define A ARG3 /* r8 */
#define LDA ARG4 /* r9 */
/* B is read from the caller's stack. The displacement skips the local   */
/* frame (STACKSIZE) and the four registers pushed in the prologue       */
/* (32 bytes); the leading 40 is presumably the return address plus the  */
/* 32-byte register home area - confirm against the ABI.                 */
#define OLD_B 40 + 32 + STACKSIZE(%rsp)
#define B %r14
#define I %r15
#endif
#define J %r10
#define AO1 %r11
#define AO2 %r12
/* AO3 and AO4 are defined but not referenced by the routine below. */
#define AO3 %r13
#define AO4 %rax
/* Copy routine: walks A two LDA-strided vectors at a time               */
/* (AO1 = A, AO2 = A + LDA) and writes their elements to B interleaved   */
/* as AO1[0], AO2[0], AO1[1], AO2[1], ...  M is the number of elements   */
/* taken from each vector and N the number of vectors. When N is odd     */
/* the last vector is copied to B unchanged. B is advanced contiguously  */
/* throughout. Built for single precision unless DOUBLE is defined.      */
PROLOGUE
PROFCODE
/* Save the general-purpose registers this routine uses as scratch;      */
/* r14/r15 are only used (as B and I) in the Windows build.              */
#ifdef WINDOWS_ABI
pushq %r15
pushq %r14
#endif
pushq %r13
pushq %r12
#ifdef WINDOWS_ABI
/* Reserve the local frame and save xmm6-xmm15 into it, then fetch the   */
/* stack-passed B argument.                                              */
subq $STACKSIZE, %rsp
vmovups %xmm6, 0(%rsp)
vmovups %xmm7, 16(%rsp)
vmovups %xmm8, 32(%rsp)
vmovups %xmm9, 48(%rsp)
vmovups %xmm10, 64(%rsp)
vmovups %xmm11, 80(%rsp)
vmovups %xmm12, 96(%rsp)
vmovups %xmm13, 112(%rsp)
vmovups %xmm14, 128(%rsp)
vmovups %xmm15, 144(%rsp)
movq OLD_B, B
#endif
/* Convert LDA from elements to bytes. */
leaq (,LDA, SIZE), LDA # Scaling
/* J = N / 2 : number of vector pairs; skip to the odd-vector tail when  */
/* there is no full pair.                                                */
movq N, J
sarq $1, J
jle .L20
ALIGN_4
/* .L01: start of one vector pair. AO1/AO2 point at the two vectors and  */
/* A is advanced by 2 * LDA for the next pair.                           */
.L01:
movq A, AO1
leaq (A, LDA), AO2
leaq (A, LDA, 2), A
/* I = M / 8 : number of 8-element groups. */
movq M, I
sarq $3, I
jle .L08
ALIGN_4
/* .L03: copy 8 elements from each vector (16 elements written to B). */
.L03:
#ifndef DOUBLE
/* Single precision: scalar loads alternating AO1/AO2, then scalar       */
/* stores to 16 consecutive positions of B.                              */
vmovss 0 * SIZE(AO1), %xmm0
vmovss 0 * SIZE(AO2), %xmm1
vmovss 1 * SIZE(AO1), %xmm2
vmovss 1 * SIZE(AO2), %xmm3
vmovss 2 * SIZE(AO1), %xmm4
vmovss 2 * SIZE(AO2), %xmm5
vmovss 3 * SIZE(AO1), %xmm6
vmovss 3 * SIZE(AO2), %xmm7
vmovss 4 * SIZE(AO1), %xmm8
vmovss 4 * SIZE(AO2), %xmm9
vmovss 5 * SIZE(AO1), %xmm10
vmovss 5 * SIZE(AO2), %xmm11
vmovss 6 * SIZE(AO1), %xmm12
vmovss 6 * SIZE(AO2), %xmm13
vmovss 7 * SIZE(AO1), %xmm14
vmovss 7 * SIZE(AO2), %xmm15
vmovss %xmm0, 0 * SIZE(B)
vmovss %xmm1, 1 * SIZE(B)
vmovss %xmm2, 2 * SIZE(B)
vmovss %xmm3, 3 * SIZE(B)
vmovss %xmm4, 4 * SIZE(B)
vmovss %xmm5, 5 * SIZE(B)
vmovss %xmm6, 6 * SIZE(B)
vmovss %xmm7, 7 * SIZE(B)
vmovss %xmm8, 8 * SIZE(B)
vmovss %xmm9, 9 * SIZE(B)
vmovss %xmm10, 10 * SIZE(B)
vmovss %xmm11, 11 * SIZE(B)
vmovss %xmm12, 12 * SIZE(B)
vmovss %xmm13, 13 * SIZE(B)
vmovss %xmm14, 14 * SIZE(B)
vmovss %xmm15, 15 * SIZE(B)
#else
/* Double precision: the low half of each xmm register is loaded from    */
/* AO1 and the high half from AO2, so every 16-byte store writes one     */
/* interleaved (AO1[k], AO2[k]) pair. Prefetches run 256 bytes ahead.    */
prefetchw 256(B)
prefetchnta 256(AO1)
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd 1 * SIZE(AO1), %xmm1
vmovsd 2 * SIZE(AO1), %xmm2
vmovsd 3 * SIZE(AO1), %xmm3
vmovsd 4 * SIZE(AO1), %xmm4
vmovsd 5 * SIZE(AO1), %xmm5
vmovsd 6 * SIZE(AO1), %xmm6
vmovsd 7 * SIZE(AO1), %xmm7
prefetchnta 256(AO2)
vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0
vmovhpd 1 * SIZE(AO2), %xmm1 , %xmm1
vmovhpd 2 * SIZE(AO2), %xmm2 , %xmm2
vmovhpd 3 * SIZE(AO2), %xmm3 , %xmm3
vmovhpd 4 * SIZE(AO2), %xmm4 , %xmm4
vmovhpd 5 * SIZE(AO2), %xmm5 , %xmm5
vmovhpd 6 * SIZE(AO2), %xmm6 , %xmm6
vmovhpd 7 * SIZE(AO2), %xmm7 , %xmm7
prefetchw 256+64(B)
vmovups %xmm0, 0 * SIZE(B)
vmovups %xmm1, 2 * SIZE(B)
vmovups %xmm2, 4 * SIZE(B)
vmovups %xmm3, 6 * SIZE(B)
vmovups %xmm4, 8 * SIZE(B)
vmovups %xmm5, 10 * SIZE(B)
vmovups %xmm6, 12 * SIZE(B)
vmovups %xmm7, 14 * SIZE(B)
#endif
/* Advance both sources by 8 elements and B by 16 elements. */
addq $8 * SIZE, AO1
addq $8 * SIZE, AO2
subq $-16 * SIZE, B
decq I
jg .L03
ALIGN_4
/* .L08: if bit 2 of M is set, copy one more group of 4 elements from    */
/* each vector.                                                          */
.L08:
testq $4 , M
je .L14
ALIGN_4
.L13:
#ifndef DOUBLE
vmovss 0 * SIZE(AO1), %xmm0
vmovss 0 * SIZE(AO2), %xmm1
vmovss 1 * SIZE(AO1), %xmm2
vmovss 1 * SIZE(AO2), %xmm3
vmovss 2 * SIZE(AO1), %xmm4
vmovss 2 * SIZE(AO2), %xmm5
vmovss 3 * SIZE(AO1), %xmm6
vmovss 3 * SIZE(AO2), %xmm7
vmovss %xmm0, 0 * SIZE(B)
vmovss %xmm1, 1 * SIZE(B)
vmovss %xmm2, 2 * SIZE(B)
vmovss %xmm3, 3 * SIZE(B)
vmovss %xmm4, 4 * SIZE(B)
vmovss %xmm5, 5 * SIZE(B)
vmovss %xmm6, 6 * SIZE(B)
vmovss %xmm7, 7 * SIZE(B)
#else
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd 1 * SIZE(AO1), %xmm1
vmovsd 2 * SIZE(AO1), %xmm2
vmovsd 3 * SIZE(AO1), %xmm3
vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0
vmovhpd 1 * SIZE(AO2), %xmm1 , %xmm1
vmovhpd 2 * SIZE(AO2), %xmm2 , %xmm2
vmovhpd 3 * SIZE(AO2), %xmm3 , %xmm3
vmovups %xmm0, 0 * SIZE(B)
vmovups %xmm1, 2 * SIZE(B)
vmovups %xmm2, 4 * SIZE(B)
vmovups %xmm3, 6 * SIZE(B)
#endif
addq $4 * SIZE, AO1
addq $4 * SIZE, AO2
subq $-8 * SIZE, B
ALIGN_4
/* .L14: I = M & 3 : leftover elements, copied one pair at a time.       */
/* The AND result is 0..3, so jle is taken only when it is zero.         */
.L14:
movq M, I
andq $3, I
jle .L16
ALIGN_4
.L15:
#ifndef DOUBLE
vmovss 0 * SIZE(AO1), %xmm0
vmovss 0 * SIZE(AO2), %xmm1
vmovss %xmm0, 0 * SIZE(B)
vmovss %xmm1, 1 * SIZE(B)
#else
vmovsd 0 * SIZE(AO1), %xmm0
vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0
vmovups %xmm0, 0 * SIZE(B)
#endif
addq $SIZE, AO1
addq $SIZE, AO2
addq $2 * SIZE, B
decq I
jg .L15
ALIGN_4
/* .L16: next vector pair. */
.L16:
decq J
jg .L01
ALIGN_4
/* .L20: odd-N tail. With an immediate of 1 the test result is 0 or 1,   */
/* so jle is taken only when the low bit of N is clear.                  */
.L20:
testq $1, N
jle .L999
movq A, AO1
/* I = M / 4 : copy the single remaining vector 4 elements at a time. */
movq M, I
sarq $2, I
jle .L34
ALIGN_4
.L33:
#ifndef DOUBLE
/* One 16-byte move carries the 4 single-precision elements. */
vmovups 0 * SIZE(AO1), %xmm0
vmovups %xmm0, 0 * SIZE(B)
#else
/* Two 16-byte moves carry the 4 double-precision elements. */
vmovups 0 * SIZE(AO1), %xmm0
vmovups 2 * SIZE(AO1), %xmm1
vmovups %xmm0, 0 * SIZE(B)
vmovups %xmm1, 2 * SIZE(B)
#endif
addq $4 * SIZE, AO1
subq $-4 * SIZE, B
decq I
jg .L33
ALIGN_4
/* .L34: I = M & 3 : leftover elements of the single vector. */
.L34:
movq M, I
andq $3, I
jle .L999
ALIGN_4
.L35:
#ifndef DOUBLE
vmovss 0 * SIZE(AO1), %xmm0
vmovss %xmm0, 0 * SIZE(B)
#else
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd %xmm0, 0 * SIZE(B)
#endif
addq $SIZE, AO1
addq $1 * SIZE, B
decq I
jg .L35
ALIGN_4
/* .L999: common exit. Restore registers in the reverse order of the     */
/* prologue and return.                                                  */
.L999:
#ifdef WINDOWS_ABI
vmovups 0(%rsp), %xmm6
vmovups 16(%rsp), %xmm7
vmovups 32(%rsp), %xmm8
vmovups 48(%rsp), %xmm9
vmovups 64(%rsp), %xmm10
vmovups 80(%rsp), %xmm11
vmovups 96(%rsp), %xmm12
vmovups 112(%rsp), %xmm13
vmovups 128(%rsp), %xmm14
vmovups 144(%rsp), %xmm15
addq $STACKSIZE, %rsp
#endif
popq %r12
popq %r13
#ifdef WINDOWS_ABI
popq %r14
popq %r15
#endif
ret
EPILOGUE

View File

@@ -0,0 +1,374 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
/* Symbolic names for the operands of the copy routine below.            */
/* M, N, A, LDA and B are the incoming arguments; I and J are loop       */
/* counters; AO1/AO2 are source pointers; BO is the running destination  */
/* pointer, BO1 the destination for the odd-N leftovers, and M8 holds    */
/* M scaled to bytes.                                                    */
#ifndef WINDOWS_ABI
/* Non-Windows build: all five arguments arrive in registers. */
#define M ARG1 /* rdi */
#define N ARG2 /* rsi */
#define A ARG3 /* rdx */
#define LDA ARG4 /* rcx */
#define B ARG5 /* r8 */
#define I %r10
#define J %rbp
#define AO1 %r9
#define AO2 %r15
/* AO3 and AO4 are defined but not referenced by the routine below. */
#define AO3 %r11
#define AO4 %r14
#define BO1 %r13
#define M8 %rbx
#define BO %rax
#else
/* Windows build: size of the local frame; the routine stores            */
/* xmm6-xmm15 at offsets 0..159 of it.                                   */
#define STACKSIZE 256
#define M ARG1 /* rcx */
#define N ARG2 /* rdx */
#define A ARG3 /* r8 */
#define LDA ARG4 /* r9 */
/* B is read from the caller's stack. The displacement skips the local   */
/* frame (STACKSIZE) and the eight registers pushed in the prologue      */
/* (64 bytes); the leading 40 is presumably the return address plus the  */
/* 32-byte register home area - confirm against the ABI.                 */
#define OLD_B 40 + 64 + STACKSIZE(%rsp)
#define B %rdi
#define I %r10
#define J %r11
#define AO1 %r12
#define AO2 %r13
/* AO3 and AO4 are defined but not referenced by the routine below. */
#define AO3 %r14
#define AO4 %r15
#define BO1 %rsi
#define M8 %rbp
#define BO %rax
#endif
/* Copy routine: walks A two LDA-strided vectors at a time               */
/* (AO1 = A, AO2 = A + LDA; M counts vectors, N counts elements per      */
/* vector). For every 2 elements along the vectors it writes a group of  */
/* 4 values to B (2 from AO1 followed by 2 from AO2); successive groups  */
/* of one vector pair are 2 * M elements apart in B, and each new pair   */
/* starts 4 elements further into B. When N is odd the last element of   */
/* each vector goes to a separate area starting at                       */
/* B + (N & ~1) * M elements. Built for single precision unless DOUBLE   */
/* is defined.                                                           */
PROLOGUE
PROFCODE
/* Save the general-purpose registers used as scratch (rdi/rsi only in   */
/* the Windows build, where they serve as B and BO1).                    */
#ifdef WINDOWS_ABI
pushq %rdi
pushq %rsi
#endif
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbp
pushq %rbx
#ifdef WINDOWS_ABI
/* Reserve the local frame and save xmm6-xmm15 into it, then fetch the   */
/* stack-passed B argument.                                              */
subq $STACKSIZE, %rsp
vmovups %xmm6, 0(%rsp)
vmovups %xmm7, 16(%rsp)
vmovups %xmm8, 32(%rsp)
vmovups %xmm9, 48(%rsp)
vmovups %xmm10, 64(%rsp)
vmovups %xmm11, 80(%rsp)
vmovups %xmm12, 96(%rsp)
vmovups %xmm13, 112(%rsp)
vmovups %xmm14, 128(%rsp)
vmovups %xmm15, 144(%rsp)
movq OLD_B, B
#endif
/* BO1 = B + (N & ~1) * M elements: destination of the odd-N leftovers.  */
/* %rax is only a temporary here; it is reused as BO afterwards.         */
movq N, %rax
andq $-2, %rax
imulq M, %rax
leaq (B, %rax, SIZE), BO1
/* Convert LDA to bytes and keep M in bytes in M8. */
leaq (, LDA, SIZE), LDA
leaq (, M, SIZE), M8
/* J = M / 2 : number of vector pairs. */
movq M, J
sarq $1, J
jle .L20
ALIGN_4
/* .L01: start of one vector pair. A advances by 2 * LDA; BO starts at   */
/* the current B and B advances by 4 elements for the next pair.         */
.L01:
movq A, AO1
leaq (A, LDA ), AO2
leaq (A, LDA, 2), A
movq B, BO
addq $4 * SIZE, B
/* I = N / 8 : number of 8-element groups along the vectors. */
movq N, I
sarq $3, I
jle .L10
ALIGN_4
/* .L08: copy 8 elements from each vector as four 4-value groups; BO     */
/* steps by 2 * M8 bytes after each group.                               */
.L08:
#ifndef DOUBLE
/* Single precision: each 8-byte vmovsd moves 2 elements. */
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd 2 * SIZE(AO1), %xmm2
vmovsd 4 * SIZE(AO1), %xmm4
vmovsd 6 * SIZE(AO1), %xmm6
vmovsd 0 * SIZE(AO2), %xmm1
vmovsd 2 * SIZE(AO2), %xmm3
vmovsd 4 * SIZE(AO2), %xmm5
vmovsd 6 * SIZE(AO2), %xmm7
vmovsd %xmm0, 0 * SIZE(BO)
vmovsd %xmm1, 2 * SIZE(BO)
leaq (BO, M8, 2), BO
vmovsd %xmm2, 0 * SIZE(BO)
vmovsd %xmm3, 2 * SIZE(BO)
leaq (BO, M8, 2), BO
vmovsd %xmm4, 0 * SIZE(BO)
vmovsd %xmm5, 2 * SIZE(BO)
leaq (BO, M8, 2), BO
vmovsd %xmm6, 0 * SIZE(BO)
vmovsd %xmm7, 2 * SIZE(BO)
leaq (BO, M8, 2), BO
#else
/* Double precision: each 16-byte vmovups moves 2 elements. */
prefetchnta 256(AO1)
prefetchnta 256(AO2)
vmovups 0 * SIZE(AO1), %xmm0
vmovups 2 * SIZE(AO1), %xmm2
vmovups 4 * SIZE(AO1), %xmm4
vmovups 6 * SIZE(AO1), %xmm6
vmovups 0 * SIZE(AO2), %xmm1
vmovups 2 * SIZE(AO2), %xmm3
vmovups 4 * SIZE(AO2), %xmm5
vmovups 6 * SIZE(AO2), %xmm7
vmovups %xmm0, 0 * SIZE(BO)
vmovups %xmm1, 2 * SIZE(BO)
leaq (BO, M8, 2), BO
vmovups %xmm2, 0 * SIZE(BO)
vmovups %xmm3, 2 * SIZE(BO)
leaq (BO, M8, 2), BO
vmovups %xmm4, 0 * SIZE(BO)
vmovups %xmm5, 2 * SIZE(BO)
leaq (BO, M8, 2), BO
vmovups %xmm6, 0 * SIZE(BO)
vmovups %xmm7, 2 * SIZE(BO)
leaq (BO, M8, 2), BO
#endif
addq $8 * SIZE, AO1
addq $8 * SIZE, AO2
decq I
jg .L08
ALIGN_4
/* .L10: if bit 2 of N is set, copy 4 more elements from each vector.    */
/* The test result is non-negative, so jle is taken only when the bit    */
/* is clear.                                                             */
.L10:
testq $4, N
jle .L12
#ifndef DOUBLE
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd 2 * SIZE(AO1), %xmm2
vmovsd 0 * SIZE(AO2), %xmm1
vmovsd 2 * SIZE(AO2), %xmm3
vmovsd %xmm0, 0 * SIZE(BO)
vmovsd %xmm1, 2 * SIZE(BO)
leaq (BO, M8, 2), BO
vmovsd %xmm2, 0 * SIZE(BO)
vmovsd %xmm3, 2 * SIZE(BO)
leaq (BO, M8, 2), BO
#else
vmovups 0 * SIZE(AO1), %xmm0
vmovups 2 * SIZE(AO1), %xmm2
vmovups 0 * SIZE(AO2), %xmm1
vmovups 2 * SIZE(AO2), %xmm3
vmovups %xmm0, 0 * SIZE(BO)
vmovups %xmm1, 2 * SIZE(BO)
leaq (BO, M8, 2), BO
vmovups %xmm2, 0 * SIZE(BO)
vmovups %xmm3, 2 * SIZE(BO)
leaq (BO, M8, 2), BO
#endif
addq $4 * SIZE, AO1
addq $4 * SIZE, AO2
ALIGN_4
/* .L12: if bit 1 of N is set, copy 2 more elements from each vector. */
.L12:
testq $2, N
jle .L14
#ifndef DOUBLE
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd 0 * SIZE(AO2), %xmm1
vmovsd %xmm0, 0 * SIZE(BO)
vmovsd %xmm1, 2 * SIZE(BO)
#else
vmovups 0 * SIZE(AO1), %xmm0
vmovups 0 * SIZE(AO2), %xmm1
vmovups %xmm0, 0 * SIZE(BO)
vmovups %xmm1, 2 * SIZE(BO)
#endif
leaq (BO, M8, 2), BO
addq $2 * SIZE, AO1
addq $2 * SIZE, AO2
ALIGN_4
/* .L14: if N is odd, the last element of each vector is written as a    */
/* 2-value pair to the leftover area at BO1.                             */
.L14:
testq $1, N
jle .L19
#ifndef DOUBLE
vmovss 0 * SIZE(AO1), %xmm0
vmovss 0 * SIZE(AO2), %xmm1
vmovss %xmm0, 0 * SIZE(BO1)
vmovss %xmm1, 1 * SIZE(BO1)
#else
vmovsd 0 * SIZE(AO1), %xmm0
vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0
vmovups %xmm0, 0 * SIZE(BO1)
#endif
addq $2 * SIZE, BO1
ALIGN_4
/* .L19: next vector pair. */
.L19:
decq J
jg .L01
ALIGN_4
/* .L20: odd-M tail - one remaining vector, handled only when the low    */
/* bit of M is set.                                                      */
.L20:
testq $1, M
jle .L999
ALIGN_4
.L31:
movq A, AO1
movq B, BO
/* I = N / 2 : copy the vector 2 elements at a time, stepping BO by      */
/* 2 * M8 bytes per step.                                                */
movq N, I
sarq $1, I
jle .L33
ALIGN_4
.L32:
#ifndef DOUBLE
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd %xmm0, 0 * SIZE(BO)
#else
vmovups 0 * SIZE(AO1), %xmm0
vmovups %xmm0, 0 * SIZE(BO)
#endif
addq $2 * SIZE, AO1
leaq (BO, M8, 2), BO
decq I
jg .L32
ALIGN_4
/* .L33: if N is odd, the final element goes to the leftover area. */
.L33:
testq $1, N
jle .L999
#ifndef DOUBLE
vmovss 0 * SIZE(AO1), %xmm0
vmovss %xmm0, 0 * SIZE(BO1)
#else
vmovsd 0 * SIZE(AO1), %xmm0
vmovsd %xmm0, 0 * SIZE(BO1)
#endif
addq $1 * SIZE, BO1
ALIGN_4
/* .L999: common exit. Restore registers in the reverse order of the     */
/* prologue and return.                                                  */
.L999:
#ifdef WINDOWS_ABI
vmovups 0(%rsp), %xmm6
vmovups 16(%rsp), %xmm7
vmovups 32(%rsp), %xmm8
vmovups 48(%rsp), %xmm9
vmovups 64(%rsp), %xmm10
vmovups 80(%rsp), %xmm11
vmovups 96(%rsp), %xmm12
vmovups 112(%rsp), %xmm13
vmovups 128(%rsp), %xmm14
vmovups 144(%rsp), %xmm15
addq $STACKSIZE, %rsp
#endif
popq %rbx
popq %rbp
popq %r12
popq %r13
popq %r14
popq %r15
#ifdef WINDOWS_ABI
popq %rsi
popq %rdi
#endif
ret
EPILOGUE

File diff suppressed because it is too large Load Diff

View File

@@ -47,7 +47,7 @@
#ifndef WINDOWS_ABI
#define STACKSIZE 64
#define STACKSIZE 128
#define OLD_M %rdi
#define OLD_N %rsi
@@ -58,10 +58,14 @@
#define STACK_INCY 24 + STACKSIZE(%rsp)
#define STACK_BUFFER 32 + STACKSIZE(%rsp)
#define ALPHA 48 (%rsp)
#define MMM 56(%rsp)
#define NN 64(%rsp)
#define AA 72(%rsp)
#define LDAX 80(%rsp)
#define XX 96(%rsp)
#else
#define STACKSIZE 256
#define STACKSIZE 288
#define OLD_M %rcx
#define OLD_N %rdx
@@ -74,6 +78,12 @@
#define STACK_BUFFER 88 + STACKSIZE(%rsp)
#define ALPHA 224 (%rsp)
#define MMM 232(%rsp)
#define NN 240(%rsp)
#define AA 248(%rsp)
#define LDAX 256(%rsp)
#define XX 264(%rsp)
#endif
#define LDA %r8
@@ -137,17 +147,41 @@
movq OLD_LDA, LDA
#endif
movq STACK_INCX, INCX
movq STACK_Y, Y
movq STACK_INCY, INCY
movq STACK_BUFFER, BUFFER
#ifndef WINDOWS_ABI
movss %xmm0, ALPHA
#else
movss %xmm3, ALPHA
#endif
movq M,MMM
movq A,AA
movq N,NN
movq LDA,LDAX
movq X,XX
movq STACK_Y, Y
.L0t:
xorq I,I
addq $1,I
salq $22,I
subq I,MMM
movq I,M
jge .L00t
movq MMM,M
addq I,M
jle .L999x
.L00t:
movq AA,A
movq NN,N
movq LDAX,LDA
movq XX,X
movq STACK_INCX, INCX
movq STACK_INCY, INCY
movq STACK_BUFFER, BUFFER
leaq (,INCX, SIZE), INCX
leaq (,INCY, SIZE), INCY
leaq (,LDA, SIZE), LDA
@@ -5990,6 +6024,12 @@
ALIGN_3
.L999:
leaq (,M,SIZE),%rax
addq %rax,AA
jmp .L0t
ALIGN_4
.L999x:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
movq 16(%rsp), %r12

View File

@@ -63,7 +63,7 @@
#else
#define STACKSIZE 256
#define STACKSIZE 288
#define OLD_M %rcx
#define OLD_N %rdx
@@ -74,10 +74,10 @@
#define STACK_Y 72 + STACKSIZE(%rsp)
#define STACK_INCY 80 + STACKSIZE(%rsp)
#define STACK_BUFFER 88 + STACKSIZE(%rsp)
#define MMM 216(%rsp)
#define NN 224(%rsp)
#define AA 232(%rsp)
#define LDAX 240(%rsp)
#define MMM 232(%rsp)
#define NN 240(%rsp)
#define AA 248(%rsp)
#define LDAX 256(%rsp)
#endif

View File

@@ -76,7 +76,7 @@
#define movsd movlps
#endif
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)

View File

@@ -76,7 +76,7 @@
#define movsd movlpd
#endif
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)

View File

@@ -76,7 +76,7 @@
#define movsd movlps
#endif
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)

View File

@@ -76,7 +76,7 @@
#define movsd movlpd
#endif
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)

File diff suppressed because it is too large Load Diff

View File

@@ -1385,7 +1385,7 @@ ALIGN_5
EXTRA_DY $1, yvec15, xvec7;
EXTRA_DY $1, yvec14, xvec6;
EXTRA_DY $1, yvec13, xvec5;
EXTRA_DY $2, yvec12, xvec4;
EXTRA_DY $1, yvec12, xvec4;
#ifndef TRMMKERNEL
LDL_DX 0*SIZE(C0), xvec0, xvec0;
LDH_DX 1*SIZE(C0), xvec0, xvec0;
@@ -1406,8 +1406,8 @@ STL_DX xvec7, 2*SIZE(C0, ldc, 1);
STH_DX xvec7, 3*SIZE(C0, ldc, 1);
STL_DX xvec13, 0*SIZE(C0, ldc, 1);
STH_DX xvec13, 1*SIZE(C0, ldc, 1);
STL_DX xvec6, 2*SIZE(C0);
STH_DX xvec6, 3*SIZE(C0);
STL_DX xvec5, 2*SIZE(C0);
STH_DX xvec5, 3*SIZE(C0);
#ifndef TRMMKERNEL
LDL_DX 0*SIZE(C1), xvec0, xvec0;
LDH_DX 1*SIZE(C1), xvec0, xvec0;

View File

@@ -42,7 +42,7 @@
#ifndef WINDOWS_ABI
#define STACKSIZE 64
#define STACKSIZE 128
#define OLD_INCX 8 + STACKSIZE(%rsp)
#define OLD_Y 16 + STACKSIZE(%rsp)
@@ -50,7 +50,15 @@
#define OLD_BUFFER 32 + STACKSIZE(%rsp)
#define ALPHA_R 48 (%rsp)
#define ALPHA_I 56 (%rsp)
#define MMM 64(%rsp)
#define NN 72(%rsp)
#define AA 80(%rsp)
#define XX 88(%rsp)
#define LDAX 96(%rsp)
#define ALPHAR 104(%rsp)
#define ALPHAI 112(%rsp)
#define M %rdi
#define N %rsi
#define A %rcx
@@ -62,7 +70,7 @@
#else
#define STACKSIZE 256
#define STACKSIZE 304
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
#define OLD_A 48 + STACKSIZE(%rsp)
@@ -75,6 +83,14 @@
#define ALPHA_R 224 (%rsp)
#define ALPHA_I 232 (%rsp)
/* Windows frame slots for the row-chunking state (remaining rows,      */
/* saved N / A / X / LDA and the saved alpha). Each name needs its own   */
/* 8-byte slot: MMM used to be 232(%rsp), the same address as ALPHA_I,   */
/* so a store to either one overwrote the other. The slots now start at  */
/* 240(%rsp), and STACKSIZE is 304 so that the last slot (288..295)      */
/* still lies inside the local frame. Caller-stack arguments are         */
/* addressed relative to STACKSIZE and follow the change automatically.  */
#define MMM 240(%rsp)
#define NN 248(%rsp)
#define AA 256(%rsp)
#define XX 264(%rsp)
#define LDAX 272(%rsp)
#define ALPHAR 280(%rsp)
#define ALPHAI 288(%rsp)
#define M %rcx
#define N %rdx
#define A %r8
@@ -136,8 +152,37 @@
movsd OLD_ALPHA_I, %xmm1
#endif
movq OLD_INCX, INCX
movq A, AA
movq N, NN
movq M, MMM
movq LDA, LDAX
movq X, XX
movq OLD_Y, Y
movsd %xmm0,ALPHAR
movsd %xmm1,ALPHAI
.L0t:
xorq I,I
addq $1,I
salq $18,I
subq I,MMM
movq I,M
movsd ALPHAR,%xmm0
movsd ALPHAI,%xmm1
jge .L00t
movq MMM,M
addq I,M
jle .L999x
.L00t:
movq AA, A
movq NN, N
movq LDAX, LDA
movq XX, X
movq OLD_INCX, INCX
# movq OLD_Y, Y
movq OLD_INCY, INCY
movq OLD_BUFFER, BUFFER
@@ -2673,6 +2718,12 @@
ALIGN_3
.L999:
movq M, I
salq $ZBASE_SHIFT,I
addq I,AA
jmp .L0t
.L999x:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
movq 16(%rsp), %r12

View File

@@ -42,13 +42,20 @@
#ifndef WINDOWS_ABI
#define STACKSIZE 64
#define STACKSIZE 128
#define OLD_INCX 8 + STACKSIZE(%rsp)
#define OLD_Y 16 + STACKSIZE(%rsp)
#define OLD_INCY 24 + STACKSIZE(%rsp)
#define OLD_BUFFER 32 + STACKSIZE(%rsp)
#define MMM 64(%rsp)
#define NN 72(%rsp)
#define AA 80(%rsp)
#define LDAX 88(%rsp)
#define ALPHAR 96(%rsp)
#define ALPHAI 104(%rsp)
#define M %rdi
#define N %rsi
#define A %rcx
@@ -60,7 +67,7 @@
#else
#define STACKSIZE 256
#define STACKSIZE 288
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
#define OLD_A 48 + STACKSIZE(%rsp)
@@ -71,6 +78,13 @@
#define OLD_INCY 88 + STACKSIZE(%rsp)
#define OLD_BUFFER 96 + STACKSIZE(%rsp)
#define MMM 232(%rsp)
#define NN 240(%rsp)
#define AA 248(%rsp)
#define LDAX 256(%rsp)
#define ALPHAR 264(%rsp)
#define ALPHAI 272(%rsp)
#define M %rcx
#define N %rdx
#define A %r8
@@ -135,6 +149,32 @@
movsd OLD_ALPHA_I, %xmm1
#endif
movq A, AA
movq N, NN
movq M, MMM
movq LDA, LDAX
movsd %xmm0,ALPHAR
movsd %xmm1,ALPHAI
.L0t:
xorq I,I
addq $1,I
salq $19,I
subq I,MMM
movq I,M
movsd ALPHAR,%xmm0
movsd ALPHAI,%xmm1
jge .L00t
movq MMM,M
addq I,M
jle .L999x
.L00t:
movq AA, A
movq NN, N
movq LDAX, LDA
movq OLD_INCX, INCX
movq OLD_Y, Y
movq OLD_INCY, INCY
@@ -2405,6 +2445,12 @@
ALIGN_3
.L999:
movq M, I
salq $ZBASE_SHIFT,I
addq I,AA
jmp .L0t
.L999x:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
movq 16(%rsp), %r12

View File

@@ -160,7 +160,7 @@
#define a3 %xmm14
#define xt1 %xmm15
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define MOVDDUP(a, b, c) movddup a(b), c
#define MOVDDUP2(a, b, c) movddup a##b, c
#else

View File

@@ -76,7 +76,7 @@
#define movsd movlpd
#endif
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)
@@ -167,7 +167,7 @@
#define a3 %xmm14
#define xt1 %xmm15
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BARCELONA_OPTIMIZATION)
#define MOVDDUP(a, b, c) movddup a(b), c
#define MOVDDUP2(a, b, c) movddup a##b, c
#else

View File

@@ -76,7 +76,7 @@
#define movsd movlpd
#endif
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)
@@ -166,7 +166,7 @@
#define xt1 %xmm14
#define xt2 %xmm15
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BARCELONA_OPTIMIZATION)
#define MOVDDUP(a, b, c) movddup a(b), c
#define MOVDDUP2(a, b, c) movddup a##b, c
#else

View File

@@ -76,7 +76,7 @@
#define movsd movlpd
#endif
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)
@@ -166,7 +166,7 @@
#define a3 %xmm14
#define xt1 %xmm15
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BARCELONA_OPTIMIZATION)
#define MOVDDUP(a, b, c) movddup a(b), c
#define MOVDDUP2(a, b, c) movddup a##b, c
#else