diff --git a/kernel/zarch/damax.c b/kernel/zarch/damax.c
index 37008f702..2598145c3 100644
--- a/kernel/zarch/damax.c
+++ b/kernel/zarch/damax.c
@@ -33,27 +33,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) {
   FLOAT amax;
 
-  __asm__("vl %%v0,0(%[x])\n\t"
+  __asm__("vl %%v0,0(%[x]),3\n\t"
           "srlg %[n],%[n],5\n\t"
           "xgr %%r1,%%r1\n\t"
           "0:\n\t"
           "pfd 1, 1024(%%r1,%[x])\n\t"
-          "vl %%v16,0(%%r1,%[x])\n\t"
-          "vl %%v17,16(%%r1,%[x])\n\t"
-          "vl %%v18,32(%%r1,%[x])\n\t"
-          "vl %%v19,48(%%r1,%[x])\n\t"
-          "vl %%v20,64(%%r1,%[x])\n\t"
-          "vl %%v21,80(%%r1,%[x])\n\t"
-          "vl %%v22,96(%%r1,%[x])\n\t"
-          "vl %%v23,112(%%r1,%[x])\n\t"
-          "vl %%v24,128(%%r1,%[x])\n\t"
-          "vl %%v25,144(%%r1,%[x])\n\t"
-          "vl %%v26,160(%%r1,%[x])\n\t"
-          "vl %%v27,176(%%r1,%[x])\n\t"
-          "vl %%v28,192(%%r1,%[x])\n\t"
-          "vl %%v29,208(%%r1,%[x])\n\t"
-          "vl %%v30,224(%%r1,%[x])\n\t"
-          "vl %%v31,240(%%r1,%[x])\n\t"
+          "vl %%v16,0(%%r1,%[x]),3\n\t"
+          "vl %%v17,16(%%r1,%[x]),3\n\t"
+          "vl %%v18,32(%%r1,%[x]),3\n\t"
+          "vl %%v19,48(%%r1,%[x]),3\n\t"
+          "vl %%v20,64(%%r1,%[x]),3\n\t"
+          "vl %%v21,80(%%r1,%[x]),3\n\t"
+          "vl %%v22,96(%%r1,%[x]),3\n\t"
+          "vl %%v23,112(%%r1,%[x]),3\n\t"
+          "vl %%v24,128(%%r1,%[x]),3\n\t"
+          "vl %%v25,144(%%r1,%[x]),3\n\t"
+          "vl %%v26,160(%%r1,%[x]),3\n\t"
+          "vl %%v27,176(%%r1,%[x]),3\n\t"
+          "vl %%v28,192(%%r1,%[x]),3\n\t"
+          "vl %%v29,208(%%r1,%[x]),3\n\t"
+          "vl %%v30,224(%%r1,%[x]),3\n\t"
+          "vl %%v31,240(%%r1,%[x]),3\n\t"
           "vfmaxdb %%v16,%%v16,%%v24,8\n\t"
           "vfmaxdb %%v17,%%v17,%%v25,8\n\t"
           "vfmaxdb %%v18,%%v18,%%v26,8\n\t"
diff --git a/kernel/zarch/damax_z13.c b/kernel/zarch/damax_z13.c
index 530d6e5bb..f7e11c3ce 100644
--- a/kernel/zarch/damax_z13.c
+++ b/kernel/zarch/damax_z13.c
@@ -33,20 +33,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) {
   FLOAT amax;
 
-  __asm__("vl %%v0,0(%[x])\n\t"
+  __asm__("vl %%v0,0(%[x]),3\n\t"
           "vflpdb %%v0,%%v0\n\t"
           "srlg %[n],%[n],5\n\t"
           "xgr %%r1,%%r1\n\t"
           "0:\n\t"
           "pfd 1, 1024(%%r1,%[x])\n\t"
-          "vl %%v16,0(%%r1,%[x])\n\t"
-          "vl %%v17,16(%%r1,%[x])\n\t"
-          "vl %%v18,32(%%r1,%[x])\n\t"
-          "vl %%v19,48(%%r1,%[x])\n\t"
-          "vl %%v20,64(%%r1,%[x])\n\t"
-          "vl %%v21,80(%%r1,%[x])\n\t"
-          "vl %%v22,96(%%r1,%[x])\n\t"
-          "vl %%v23,112(%%r1,%[x])\n\t"
+          "vl %%v16,0(%%r1,%[x]),3\n\t"
+          "vl %%v17,16(%%r1,%[x]),3\n\t"
+          "vl %%v18,32(%%r1,%[x]),3\n\t"
+          "vl %%v19,48(%%r1,%[x]),3\n\t"
+          "vl %%v20,64(%%r1,%[x]),3\n\t"
+          "vl %%v21,80(%%r1,%[x]),3\n\t"
+          "vl %%v22,96(%%r1,%[x]),3\n\t"
+          "vl %%v23,112(%%r1,%[x]),3\n\t"
           "vflpdb %%v16, %%v16\n\t"
           "vflpdb %%v17, %%v17\n\t"
           "vflpdb %%v18, %%v18\n\t"
@@ -71,14 +71,14 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) {
           "vsel %%v30,%%v28,%%v29,%%v30\n\t"
           "vfchdb %%v31,%%v30,%%v0\n\t"
           "vsel %%v0,%%v30,%%v0,%%v31\n\t"
-          "vl %%v16,128(%%r1,%[x])\n\t"
-          "vl %%v17,144(%%r1,%[x])\n\t"
-          "vl %%v18,160(%%r1,%[x])\n\t"
-          "vl %%v19,176(%%r1,%[x])\n\t"
-          "vl %%v20,192(%%r1,%[x])\n\t"
-          "vl %%v21,208(%%r1,%[x])\n\t"
-          "vl %%v22,224(%%r1,%[x])\n\t"
-          "vl %%v23,240(%%r1,%[x])\n\t"
+          "vl %%v16,128(%%r1,%[x]),3\n\t"
+          "vl %%v17,144(%%r1,%[x]),3\n\t"
+          "vl %%v18,160(%%r1,%[x]),3\n\t"
+          "vl %%v19,176(%%r1,%[x]),3\n\t"
+          "vl %%v20,192(%%r1,%[x]),3\n\t"
+          "vl %%v21,208(%%r1,%[x]),3\n\t"
+          "vl %%v22,224(%%r1,%[x]),3\n\t"
+          "vl %%v23,240(%%r1,%[x]),3\n\t"
           "vflpdb %%v16, %%v16\n\t"
           "vflpdb %%v17, %%v17\n\t"
           "vflpdb %%v18, %%v18\n\t"
diff --git a/kernel/zarch/damin.c b/kernel/zarch/damin.c
index a01791741..25f018c66 100644
--- a/kernel/zarch/damin.c
+++ b/kernel/zarch/damin.c
@@ -33,27 +33,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) {
   FLOAT amin;
 
-  __asm__("vl %%v0,0(%[x])\n\t"
+  __asm__("vl %%v0,0(%[x]),3\n\t"
          "srlg %[n],%[n],5\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 1, 1024(%%r1,%[x])\n\t"
-          "vl %%v16,0(%%r1,%[x])\n\t"
-          "vl %%v17,16(%%r1,%[x])\n\t"
-          "vl %%v18,32(%%r1,%[x])\n\t"
-          "vl %%v19,48(%%r1,%[x])\n\t"
-          "vl %%v20,64(%%r1,%[x])\n\t"
-          "vl %%v21,80(%%r1,%[x])\n\t"
-          "vl %%v22,96(%%r1,%[x])\n\t"
-          "vl %%v23,112(%%r1,%[x])\n\t"
-          "vl %%v24,128(%%r1,%[x])\n\t"
-          "vl %%v25,144(%%r1,%[x])\n\t"
-          "vl %%v26,160(%%r1,%[x])\n\t"
-          "vl %%v27,176(%%r1,%[x])\n\t"
-          "vl %%v28,192(%%r1,%[x])\n\t"
-          "vl %%v29,208(%%r1,%[x])\n\t"
-          "vl %%v30,224(%%r1,%[x])\n\t"
-          "vl %%v31,240(%%r1,%[x])\n\t"
+          "vl %%v16,0(%%r1,%[x]),3\n\t"
+          "vl %%v17,16(%%r1,%[x]),3\n\t"
+          "vl %%v18,32(%%r1,%[x]),3\n\t"
+          "vl %%v19,48(%%r1,%[x]),3\n\t"
+          "vl %%v20,64(%%r1,%[x]),3\n\t"
+          "vl %%v21,80(%%r1,%[x]),3\n\t"
+          "vl %%v22,96(%%r1,%[x]),3\n\t"
+          "vl %%v23,112(%%r1,%[x]),3\n\t"
+          "vl %%v24,128(%%r1,%[x]),3\n\t"
+          "vl %%v25,144(%%r1,%[x]),3\n\t"
+          "vl %%v26,160(%%r1,%[x]),3\n\t"
+          "vl %%v27,176(%%r1,%[x]),3\n\t"
+          "vl %%v28,192(%%r1,%[x]),3\n\t"
+          "vl %%v29,208(%%r1,%[x]),3\n\t"
+          "vl %%v30,224(%%r1,%[x]),3\n\t"
+          "vl %%v31,240(%%r1,%[x]),3\n\t"
           "vfmindb %%v16,%%v16,%%v24,8\n\t"
           "vfmindb %%v17,%%v17,%%v25,8\n\t"
           "vfmindb %%v18,%%v18,%%v26,8\n\t"
diff --git a/kernel/zarch/damin_z13.c b/kernel/zarch/damin_z13.c
index 2172b6d6f..091aceb37 100644
--- a/kernel/zarch/damin_z13.c
+++ b/kernel/zarch/damin_z13.c
@@ -33,20 +33,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) {
   FLOAT amin;
 
-  __asm__("vl %%v0,0(%[x])\n\t"
+  __asm__("vl %%v0,0(%[x]),3\n\t"
           "vflpdb %%v0,%%v0\n\t"
           "srlg %[n],%[n],5\n\t"
           "xgr %%r1,%%r1\n\t"
           "0:\n\t"
           "pfd 1, 1024(%%r1,%[x])\n\t"
-          "vl %%v16,0(%%r1,%[x])\n\t"
-          "vl %%v17,16(%%r1,%[x])\n\t"
-          "vl %%v18,32(%%r1,%[x])\n\t"
-          "vl %%v19,48(%%r1,%[x])\n\t"
-          "vl %%v20,64(%%r1,%[x])\n\t"
-          "vl %%v21,80(%%r1,%[x])\n\t"
-          "vl %%v22,96(%%r1,%[x])\n\t"
-          "vl %%v23,112(%%r1,%[x])\n\t"
+          "vl %%v16,0(%%r1,%[x]),3\n\t"
+          "vl %%v17,16(%%r1,%[x]),3\n\t"
+          "vl %%v18,32(%%r1,%[x]),3\n\t"
+          "vl %%v19,48(%%r1,%[x]),3\n\t"
+          "vl %%v20,64(%%r1,%[x]),3\n\t"
+          "vl %%v21,80(%%r1,%[x]),3\n\t"
+          "vl %%v22,96(%%r1,%[x]),3\n\t"
+          "vl %%v23,112(%%r1,%[x]),3\n\t"
           "vflpdb %%v16, %%v16\n\t"
           "vflpdb %%v17, %%v17\n\t"
           "vflpdb %%v18, %%v18\n\t"
@@ -71,14 +71,14 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) {
           "vsel %%v30,%%v28,%%v29,%%v30\n\t"
           "vfchdb %%v31,%%v0,%%v30\n\t"
           "vsel %%v0,%%v30,%%v0,%%v31\n\t"
-          "vl %%v16,128(%%r1,%[x])\n\t"
-          "vl %%v17,144(%%r1,%[x])\n\t"
-          "vl %%v18,160(%%r1,%[x])\n\t"
-          "vl %%v19,176(%%r1,%[x])\n\t"
-          "vl %%v20,192(%%r1,%[x])\n\t"
-          "vl %%v21,208(%%r1,%[x])\n\t"
-          "vl %%v22,224(%%r1,%[x])\n\t"
-          "vl %%v23,240(%%r1,%[x])\n\t"
+          "vl %%v16,128(%%r1,%[x]),3\n\t"
+          "vl %%v17,144(%%r1,%[x]),3\n\t"
+          "vl %%v18,160(%%r1,%[x]),3\n\t"
+          "vl %%v19,176(%%r1,%[x]),3\n\t"
+          "vl %%v20,192(%%r1,%[x]),3\n\t"
+          "vl %%v21,208(%%r1,%[x]),3\n\t"
+          "vl %%v22,224(%%r1,%[x]),3\n\t"
+          "vl %%v23,240(%%r1,%[x]),3\n\t"
           "vflpdb %%v16, %%v16\n\t"
           "vflpdb %%v17, %%v17\n\t"
           "vflpdb %%v18, %%v18\n\t"
diff --git a/kernel/zarch/dasum.c b/kernel/zarch/dasum.c
index 9f69a9931..641949963 100644
--- a/kernel/zarch/dasum.c
+++ b/kernel/zarch/dasum.c
@@ -45,14 +45,14 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) {
           "xgr %%r1,%%r1\n\t"
           "0:\n\t"
           "pfd 1, 1024(%%r1,%[x])\n\t"
-          "vl %%v16, 0(%%r1,%[x])\n\t"
-          "vl %%v17, 16(%%r1,%[x])\n\t"
-          "vl %%v18, 32(%%r1,%[x])\n\t"
-          "vl %%v19, 48(%%r1,%[x])\n\t"
-          "vl %%v20, 64(%%r1,%[x])\n\t"
-          "vl %%v21, 80(%%r1,%[x])\n\t"
-          "vl %%v22, 96(%%r1,%[x])\n\t"
-          "vl %%v23, 112(%%r1,%[x])\n\t"
+          "vl %%v16, 0(%%r1,%[x]),3\n\t"
+          "vl %%v17, 16(%%r1,%[x]),3\n\t"
+          "vl %%v18, 32(%%r1,%[x]),3\n\t"
+          "vl %%v19, 48(%%r1,%[x]),3\n\t"
+          "vl %%v20, 64(%%r1,%[x]),3\n\t"
+          "vl %%v21, 80(%%r1,%[x]),3\n\t"
+          "vl %%v22, 96(%%r1,%[x]),3\n\t"
+          "vl %%v23, 112(%%r1,%[x]),3\n\t"
           "vflpdb %%v16, %%v16\n\t"
           "vflpdb %%v17, %%v17\n\t"
           "vflpdb %%v18, %%v18\n\t"
@@ -69,14 +69,14 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) {
           "vfadb %%v29,%%v29,%%v21\n\t"
           "vfadb %%v30,%%v30,%%v22\n\t"
           "vfadb %%v31,%%v31,%%v23\n\t"
-          "vl %%v16, 128(%%r1,%[x])\n\t"
-          "vl %%v17, 144(%%r1,%[x])\n\t"
-          "vl %%v18, 160(%%r1,%[x])\n\t"
-          "vl %%v19, 176(%%r1,%[x])\n\t"
-          "vl %%v20, 192(%%r1,%[x])\n\t"
-          "vl %%v21, 208(%%r1,%[x])\n\t"
-          "vl %%v22, 224(%%r1,%[x])\n\t"
-          "vl %%v23, 240(%%r1,%[x])\n\t"
+          "vl %%v16, 128(%%r1,%[x]),3\n\t"
+          "vl %%v17, 144(%%r1,%[x]),3\n\t"
+          "vl %%v18, 160(%%r1,%[x]),3\n\t"
+          "vl %%v19, 176(%%r1,%[x]),3\n\t"
+          "vl %%v20, 192(%%r1,%[x]),3\n\t"
+          "vl %%v21, 208(%%r1,%[x]),3\n\t"
+          "vl %%v22, 224(%%r1,%[x]),3\n\t"
+          "vl %%v23, 240(%%r1,%[x]),3\n\t"
           "vflpdb %%v16, %%v16\n\t"
           "vflpdb %%v17, %%v17\n\t"
           "vflpdb %%v18, %%v18\n\t"
diff --git a/kernel/zarch/daxpy.c b/kernel/zarch/daxpy.c
index 179ef8834..c02ad0aac 100644
--- a/kernel/zarch/daxpy.c
+++ b/kernel/zarch/daxpy.c
@@ -34,22 +34,22 @@ static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
           "0:\n\t"
           "pfd 1, 1024(%%r1,%[x])\n\t"
           "pfd 2, 1024(%%r1,%[y])\n\t"
-          "vl %%v16,0(%%r1,%[x])\n\t"
-          "vl %%v17,16(%%r1,%[x])\n\t"
-          "vl %%v18,32(%%r1,%[x])\n\t"
-          "vl %%v19,48(%%r1,%[x])\n\t"
-          "vl %%v20,0(%%r1,%[y])\n\t"
-          "vl %%v21,16(%%r1,%[y])\n\t"
-          "vl %%v22,32(%%r1,%[y])\n\t"
-          "vl %%v23,48(%%r1,%[y])\n\t"
-          "vl %%v24,64(%%r1,%[x])\n\t"
-          "vl %%v25,80(%%r1,%[x])\n\t"
-          "vl %%v26,96(%%r1,%[x])\n\t"
-          "vl %%v27,112(%%r1,%[x])\n\t"
-          "vl %%v28,64(%%r1,%[y])\n\t"
-          "vl %%v29,80(%%r1,%[y])\n\t"
-          "vl %%v30,96(%%r1,%[y])\n\t"
-          "vl %%v31,112(%%r1,%[y])\n\t"
+          "vl %%v16,0(%%r1,%[x]),3\n\t"
+          "vl %%v17,16(%%r1,%[x]),3\n\t"
+          "vl %%v18,32(%%r1,%[x]),3\n\t"
+          "vl %%v19,48(%%r1,%[x]),3\n\t"
+          "vl %%v20,0(%%r1,%[y]),3\n\t"
+          "vl %%v21,16(%%r1,%[y]),3\n\t"
+          "vl %%v22,32(%%r1,%[y]),3\n\t"
+          "vl %%v23,48(%%r1,%[y]),3\n\t"
+          "vl %%v24,64(%%r1,%[x]),3\n\t"
+          "vl %%v25,80(%%r1,%[x]),3\n\t"
+          "vl %%v26,96(%%r1,%[x]),3\n\t"
+          "vl %%v27,112(%%r1,%[x]),3\n\t"
+          "vl %%v28,64(%%r1,%[y]),3\n\t"
+          "vl %%v29,80(%%r1,%[y]),3\n\t"
+          "vl %%v30,96(%%r1,%[y]),3\n\t"
+          "vl %%v31,112(%%r1,%[y]),3\n\t"
           "vfmadb %%v16,%%v0,%%v16,%%v20\n\t"
           "vfmadb %%v17,%%v0,%%v17,%%v21\n\t"
           "vfmadb %%v18,%%v0,%%v18,%%v22\n\t"
@@ -58,30 +58,30 @@ static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
           "vfmadb %%v25,%%v0,%%v25,%%v29\n\t"
           "vfmadb %%v26,%%v0,%%v26,%%v30\n\t"
           "vfmadb %%v27,%%v0,%%v27,%%v31\n\t"
-          "vst %%v16,0(%%r1,%[y])\n\t"
-          "vst %%v17,16(%%r1,%[y])\n\t"
-          "vst %%v18,32(%%r1,%[y])\n\t"
-          "vst %%v19,48(%%r1,%[y])\n\t"
-          "vst %%v24,64(%%r1,%[y])\n\t"
-          "vst %%v25,80(%%r1,%[y])\n\t"
-          "vst %%v26,96(%%r1,%[y])\n\t"
-          "vst %%v27,112(%%r1,%[y])\n\t"
-          "vl %%v16,128(%%r1,%[x])\n\t"
-          "vl %%v17,144(%%r1,%[x])\n\t"
-          "vl %%v18,160(%%r1,%[x])\n\t"
-          "vl %%v19,176(%%r1,%[x])\n\t"
-          "vl %%v20,128(%%r1,%[y])\n\t"
-          "vl %%v21,144(%%r1,%[y])\n\t"
-          "vl %%v22,160(%%r1,%[y])\n\t"
-          "vl %%v23,176(%%r1,%[y])\n\t"
-          "vl %%v24,192(%%r1,%[x])\n\t"
-          "vl %%v25,208(%%r1,%[x])\n\t"
-          "vl %%v26,224(%%r1,%[x])\n\t"
-          "vl %%v27,240(%%r1,%[x])\n\t"
-          "vl %%v28,192(%%r1,%[y])\n\t"
-          "vl %%v29,208(%%r1,%[y])\n\t"
-          "vl %%v30,224(%%r1,%[y])\n\t"
-          "vl %%v31,240(%%r1,%[y])\n\t"
+          "vst %%v16,0(%%r1,%[y]),3\n\t"
+          "vst %%v17,16(%%r1,%[y]),3\n\t"
+          "vst %%v18,32(%%r1,%[y]),3\n\t"
+          "vst %%v19,48(%%r1,%[y]),3\n\t"
+          "vst %%v24,64(%%r1,%[y]),3\n\t"
+          "vst %%v25,80(%%r1,%[y]),3\n\t"
+          "vst %%v26,96(%%r1,%[y]),3\n\t"
+          "vst %%v27,112(%%r1,%[y]),3\n\t"
+          "vl %%v16,128(%%r1,%[x]),3\n\t"
+          "vl %%v17,144(%%r1,%[x]),3\n\t"
+          "vl %%v18,160(%%r1,%[x]),3\n\t"
+          "vl %%v19,176(%%r1,%[x]),3\n\t"
+          "vl %%v20,128(%%r1,%[y]),3\n\t"
+          "vl %%v21,144(%%r1,%[y]),3\n\t"
+          "vl %%v22,160(%%r1,%[y]),3\n\t"
+          "vl %%v23,176(%%r1,%[y]),3\n\t"
+          "vl %%v24,192(%%r1,%[x]),3\n\t"
+          "vl %%v25,208(%%r1,%[x]),3\n\t"
+          "vl %%v26,224(%%r1,%[x]),3\n\t"
+          "vl %%v27,240(%%r1,%[x]),3\n\t"
+          "vl %%v28,192(%%r1,%[y]),3\n\t"
+          "vl %%v29,208(%%r1,%[y]),3\n\t"
+          "vl %%v30,224(%%r1,%[y]),3\n\t"
+          "vl %%v31,240(%%r1,%[y]),3\n\t"
           "vfmadb %%v16,%%v0,%%v16,%%v20\n\t"
           "vfmadb %%v17,%%v0,%%v17,%%v21\n\t"
           "vfmadb %%v18,%%v0,%%v18,%%v22\n\t"
@@ -90,14 +90,14 @@ static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
           "vfmadb %%v25,%%v0,%%v25,%%v29\n\t"
           "vfmadb %%v26,%%v0,%%v26,%%v30\n\t"
           "vfmadb %%v27,%%v0,%%v27,%%v31\n\t"
-          "vst %%v16,128(%%r1,%[y])\n\t"
-          "vst %%v17,144(%%r1,%[y])\n\t"
-          "vst %%v18,160(%%r1,%[y])\n\t"
-          "vst %%v19,176(%%r1,%[y])\n\t"
-          "vst %%v24,192(%%r1,%[y])\n\t"
-          "vst %%v25,208(%%r1,%[y])\n\t"
-          "vst %%v26,224(%%r1,%[y])\n\t"
-          "vst %%v27,240(%%r1,%[y])\n\t"
+          "vst %%v16,128(%%r1,%[y]),3\n\t"
+ "vst %%v17,144(%%r1,%[y]),3\n\t" + "vst %%v18,160(%%r1,%[y]),3\n\t" + "vst %%v19,176(%%r1,%[y]),3\n\t" + "vst %%v24,192(%%r1,%[y]),3\n\t" + "vst %%v25,208(%%r1,%[y]),3\n\t" + "vst %%v26,224(%%r1,%[y]),3\n\t" + "vst %%v27,240(%%r1,%[y]),3\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) diff --git a/kernel/zarch/ddot.c b/kernel/zarch/ddot.c index f5f601717..0dd8ed08a 100644 --- a/kernel/zarch/ddot.c +++ b/kernel/zarch/ddot.c @@ -43,22 +43,22 @@ static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { "0:\n\t" "pfd 1,1024(%%r1,%[x])\n\t" "pfd 1,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[y])\n\t" - "vl %%v25,16(%%r1,%[y])\n\t" - "vl %%v26,32(%%r1,%[y])\n\t" - "vl %%v27,48(%%r1,%[y])\n\t" - "vl %%v28,64(%%r1,%[y])\n\t" - "vl %%v29,80(%%r1,%[y])\n\t" - "vl %%v30,96(%%r1,%[y])\n\t" - "vl %%v31,112(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v24,0(%%r1,%[y]),3\n\t" + "vl %%v25,16(%%r1,%[y]),3\n\t" + "vl %%v26,32(%%r1,%[y]),3\n\t" + "vl %%v27,48(%%r1,%[y]),3\n\t" + "vl %%v28,64(%%r1,%[y]),3\n\t" + "vl %%v29,80(%%r1,%[y]),3\n\t" + "vl %%v30,96(%%r1,%[y]),3\n\t" + "vl %%v31,112(%%r1,%[y]),3\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" diff --git a/kernel/zarch/dgemv_n_4.c b/kernel/zarch/dgemv_n_4.c index c93ff9b54..87ed6ecd1 100644 --- a/kernel/zarch/dgemv_n_4.c +++ b/kernel/zarch/dgemv_n_4.c @@ -52,26 +52,26 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "pfd 1,1024(%%r1,%[ap2])\n\t" "pfd 1,1024(%%r1,%[ap3])\n\t" "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,0(%%r1,%[ap2])\n\t" - "vl %%v19,0(%%r1,%[ap3])\n\t" - "vl %%v20,16(%%r1,%[ap0])\n\t" - "vl %%v21,16(%%r1,%[ap1])\n\t" - "vl %%v22,16(%%r1,%[ap2])\n\t" - "vl %%v23,16(%%r1,%[ap3])\n\t" - "vl %%v24,32(%%r1,%[ap0])\n\t" - "vl %%v25,32(%%r1,%[ap1])\n\t" - "vl %%v26,32(%%r1,%[ap2])\n\t" - "vl %%v27,32(%%r1,%[ap3])\n\t" - "vl %%v28,48(%%r1,%[ap0])\n\t" - "vl %%v29,48(%%r1,%[ap1])\n\t" - "vl %%v30,48(%%r1,%[ap2])\n\t" - "vl %%v31,48(%%r1,%[ap3])\n\t" - "vl %%v4,0(%%r1,%[y])\n\t" - "vl %%v5,16(%%r1,%[y])\n\t" - "vl %%v6,32(%%r1,%[y])\n\t" - "vl %%v7,48(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0]),3\n\t" + "vl %%v17,0(%%r1,%[ap1]),3\n\t" + "vl %%v18,0(%%r1,%[ap2]),3\n\t" + "vl %%v19,0(%%r1,%[ap3]),3\n\t" + "vl %%v20,16(%%r1,%[ap0]),3\n\t" + "vl %%v21,16(%%r1,%[ap1]),3\n\t" + "vl %%v22,16(%%r1,%[ap2]),3\n\t" + "vl %%v23,16(%%r1,%[ap3]),3\n\t" + "vl %%v24,32(%%r1,%[ap0]),3\n\t" + "vl %%v25,32(%%r1,%[ap1]),3\n\t" + "vl %%v26,32(%%r1,%[ap2]),3\n\t" + "vl %%v27,32(%%r1,%[ap3]),3\n\t" + "vl %%v28,48(%%r1,%[ap0]),3\n\t" + "vl %%v29,48(%%r1,%[ap1]),3\n\t" + "vl %%v30,48(%%r1,%[ap2]),3\n\t" + "vl %%v31,48(%%r1,%[ap3]),3\n\t" + "vl %%v4,0(%%r1,%[y]),3\n\t" + "vl %%v5,16(%%r1,%[y]),3\n\t" + "vl %%v6,32(%%r1,%[y]),3\n\t" + "vl %%v7,48(%%r1,%[y]),3\n\t" "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" @@ -88,30 +88,30 @@ static void 
@@ -88,30 +88,30 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
           "vfmadb %%v5,%%v23,%%v3,%%v5\n\t"
           "vfmadb %%v6,%%v27,%%v3,%%v6\n\t"
           "vfmadb %%v7,%%v31,%%v3,%%v7\n\t"
-          "vst %%v4,0(%%r1,%[y])\n\t"
-          "vst %%v5,16(%%r1,%[y])\n\t"
-          "vst %%v6,32(%%r1,%[y])\n\t"
-          "vst %%v7,48(%%r1,%[y])\n\t"
-          "vl %%v16,64(%%r1,%[ap0])\n\t"
-          "vl %%v17,64(%%r1,%[ap1])\n\t"
-          "vl %%v18,64(%%r1,%[ap2])\n\t"
-          "vl %%v19,64(%%r1,%[ap3])\n\t"
-          "vl %%v20,80(%%r1,%[ap0])\n\t"
-          "vl %%v21,80(%%r1,%[ap1])\n\t"
-          "vl %%v22,80(%%r1,%[ap2])\n\t"
-          "vl %%v23,80(%%r1,%[ap3])\n\t"
-          "vl %%v24,96(%%r1,%[ap0])\n\t"
-          "vl %%v25,96(%%r1,%[ap1])\n\t"
-          "vl %%v26,96(%%r1,%[ap2])\n\t"
-          "vl %%v27,96(%%r1,%[ap3])\n\t"
-          "vl %%v28,112(%%r1,%[ap0])\n\t"
-          "vl %%v29,112(%%r1,%[ap1])\n\t"
-          "vl %%v30,112(%%r1,%[ap2])\n\t"
-          "vl %%v31,112(%%r1,%[ap3])\n\t"
-          "vl %%v4,64(%%r1,%[y])\n\t"
-          "vl %%v5,80(%%r1,%[y])\n\t"
-          "vl %%v6,96(%%r1,%[y])\n\t"
-          "vl %%v7,112(%%r1,%[y])\n\t"
+          "vst %%v4,0(%%r1,%[y]),3\n\t"
+          "vst %%v5,16(%%r1,%[y]),3\n\t"
+          "vst %%v6,32(%%r1,%[y]),3\n\t"
+          "vst %%v7,48(%%r1,%[y]),3\n\t"
+          "vl %%v16,64(%%r1,%[ap0]),3\n\t"
+          "vl %%v17,64(%%r1,%[ap1]),3\n\t"
+          "vl %%v18,64(%%r1,%[ap2]),3\n\t"
+          "vl %%v19,64(%%r1,%[ap3]),3\n\t"
+          "vl %%v20,80(%%r1,%[ap0]),3\n\t"
+          "vl %%v21,80(%%r1,%[ap1]),3\n\t"
+          "vl %%v22,80(%%r1,%[ap2]),3\n\t"
+          "vl %%v23,80(%%r1,%[ap3]),3\n\t"
+          "vl %%v24,96(%%r1,%[ap0]),3\n\t"
+          "vl %%v25,96(%%r1,%[ap1]),3\n\t"
+          "vl %%v26,96(%%r1,%[ap2]),3\n\t"
+          "vl %%v27,96(%%r1,%[ap3]),3\n\t"
+          "vl %%v28,112(%%r1,%[ap0]),3\n\t"
+          "vl %%v29,112(%%r1,%[ap1]),3\n\t"
+          "vl %%v30,112(%%r1,%[ap2]),3\n\t"
+          "vl %%v31,112(%%r1,%[ap3]),3\n\t"
+          "vl %%v4,64(%%r1,%[y]),3\n\t"
+          "vl %%v5,80(%%r1,%[y]),3\n\t"
+          "vl %%v6,96(%%r1,%[y]),3\n\t"
+          "vl %%v7,112(%%r1,%[y]),3\n\t"
           "vfmadb %%v4,%%v16,%%v0,%%v4\n\t"
           "vfmadb %%v5,%%v20,%%v0,%%v5\n\t"
           "vfmadb %%v6,%%v24,%%v0,%%v6\n\t"
@@ -128,10 +128,10 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
           "vfmadb %%v5,%%v23,%%v3,%%v5\n\t"
           "vfmadb %%v6,%%v27,%%v3,%%v6\n\t"
           "vfmadb %%v7,%%v31,%%v3,%%v7\n\t"
-          "vst %%v4,64(%%r1,%[y])\n\t"
-          "vst %%v5,80(%%r1,%[y])\n\t"
-          "vst %%v6,96(%%r1,%[y])\n\t"
-          "vst %%v7,112(%%r1,%[y])\n\t"
+          "vst %%v4,64(%%r1,%[y]),3\n\t"
+          "vst %%v5,80(%%r1,%[y]),3\n\t"
+          "vst %%v6,96(%%r1,%[y]),3\n\t"
+          "vst %%v7,112(%%r1,%[y]),3\n\t"
           "agfi %%r1,128\n\t"
           "brctg %%r0,0b\n\t"
           "1:\n\t"
@@ -141,16 +141,16 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
           "jz 3f\n\t"
           "srlg %%r0,%%r0,2\n\t"
           "2:\n\t"
-          "vl %%v16,0(%%r1,%[ap0])\n\t"
-          "vl %%v17,0(%%r1,%[ap1])\n\t"
-          "vl %%v18,0(%%r1,%[ap2])\n\t"
-          "vl %%v19,0(%%r1,%[ap3])\n\t"
-          "vl %%v20,16(%%r1,%[ap0])\n\t"
-          "vl %%v21,16(%%r1,%[ap1])\n\t"
-          "vl %%v22,16(%%r1,%[ap2])\n\t"
-          "vl %%v23,16(%%r1,%[ap3])\n\t"
-          "vl %%v4,0(%%r1,%[y])\n\t"
-          "vl %%v5,16(%%r1,%[y])\n\t"
+          "vl %%v16,0(%%r1,%[ap0]),3\n\t"
+          "vl %%v17,0(%%r1,%[ap1]),3\n\t"
+          "vl %%v18,0(%%r1,%[ap2]),3\n\t"
+          "vl %%v19,0(%%r1,%[ap3]),3\n\t"
+          "vl %%v20,16(%%r1,%[ap0]),3\n\t"
+          "vl %%v21,16(%%r1,%[ap1]),3\n\t"
+          "vl %%v22,16(%%r1,%[ap2]),3\n\t"
+          "vl %%v23,16(%%r1,%[ap3]),3\n\t"
+          "vl %%v4,0(%%r1,%[y]),3\n\t"
+          "vl %%v5,16(%%r1,%[y]),3\n\t"
           "vfmadb %%v4,%%v16,%%v0,%%v4\n\t"
           "vfmadb %%v5,%%v20,%%v0,%%v5\n\t"
           "vfmadb %%v4,%%v17,%%v1,%%v4\n\t"
@@ -159,8 +159,8 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
           "vfmadb %%v5,%%v22,%%v2,%%v5\n\t"
           "vfmadb %%v4,%%v19,%%v3,%%v4\n\t"
           "vfmadb %%v5,%%v23,%%v3,%%v5\n\t"
-          "vst %%v4,0(%%r1,%[y])\n\t"
-          "vst %%v5,16(%%r1,%[y])\n\t"
+          "vst %%v4,0(%%r1,%[y]),3\n\t"
+          "vst %%v5,16(%%r1,%[y]),3\n\t"
           "agfi %%r1,32\n\t"
           "brctg %%r0,2b\n\t"
           "3:\n\t"
@@ -193,30 +193,30 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
           "pfd 1,1024(%%r1,%[ap0])\n\t"
           "pfd 1,1024(%%r1,%[ap1])\n\t"
           "pfd 2,1024(%%r1,%[y])\n\t"
-          "vl %%v16,0(%%r1,%[ap0])\n\t"
-          "vl %%v17,0(%%r1,%[ap1])\n\t"
-          "vl %%v18,16(%%r1,%[ap0])\n\t"
-          "vl %%v19,16(%%r1,%[ap1])\n\t"
-          "vl %%v20,32(%%r1,%[ap0])\n\t"
-          "vl %%v21,32(%%r1,%[ap1])\n\t"
-          "vl %%v22,48(%%r1,%[ap0])\n\t"
-          "vl %%v23,48(%%r1,%[ap1])\n\t"
-          "vl %%v24,64(%%r1,%[ap0])\n\t"
-          "vl %%v25,64(%%r1,%[ap1])\n\t"
-          "vl %%v26,80(%%r1,%[ap0])\n\t"
-          "vl %%v27,80(%%r1,%[ap1])\n\t"
-          "vl %%v28,96(%%r1,%[ap0])\n\t"
-          "vl %%v29,96(%%r1,%[ap1])\n\t"
-          "vl %%v30,112(%%r1,%[ap0])\n\t"
-          "vl %%v31,112(%%r1,%[ap1])\n\t"
-          "vl %%v2,0(%%r1,%[y])\n\t"
-          "vl %%v3,16(%%r1,%[y])\n\t"
-          "vl %%v4,32(%%r1,%[y])\n\t"
-          "vl %%v5,48(%%r1,%[y])\n\t"
-          "vl %%v6,64(%%r1,%[y])\n\t"
-          "vl %%v7,80(%%r1,%[y])\n\t"
-          "vl %%v8,96(%%r1,%[y])\n\t"
-          "vl %%v9,112(%%r1,%[y])\n\t"
+          "vl %%v16,0(%%r1,%[ap0]),3\n\t"
+          "vl %%v17,0(%%r1,%[ap1]),3\n\t"
+          "vl %%v18,16(%%r1,%[ap0]),3\n\t"
+          "vl %%v19,16(%%r1,%[ap1]),3\n\t"
+          "vl %%v20,32(%%r1,%[ap0]),3\n\t"
+          "vl %%v21,32(%%r1,%[ap1]),3\n\t"
+          "vl %%v22,48(%%r1,%[ap0]),3\n\t"
+          "vl %%v23,48(%%r1,%[ap1]),3\n\t"
+          "vl %%v24,64(%%r1,%[ap0]),3\n\t"
+          "vl %%v25,64(%%r1,%[ap1]),3\n\t"
+          "vl %%v26,80(%%r1,%[ap0]),3\n\t"
+          "vl %%v27,80(%%r1,%[ap1]),3\n\t"
+          "vl %%v28,96(%%r1,%[ap0]),3\n\t"
+          "vl %%v29,96(%%r1,%[ap1]),3\n\t"
+          "vl %%v30,112(%%r1,%[ap0]),3\n\t"
+          "vl %%v31,112(%%r1,%[ap1]),3\n\t"
+          "vl %%v2,0(%%r1,%[y]),3\n\t"
+          "vl %%v3,16(%%r1,%[y]),3\n\t"
+          "vl %%v4,32(%%r1,%[y]),3\n\t"
+          "vl %%v5,48(%%r1,%[y]),3\n\t"
+          "vl %%v6,64(%%r1,%[y]),3\n\t"
+          "vl %%v7,80(%%r1,%[y]),3\n\t"
+          "vl %%v8,96(%%r1,%[y]),3\n\t"
+          "vl %%v9,112(%%r1,%[y]),3\n\t"
           "vfmadb %%v2,%%v16,%%v0,%%v2\n\t"
           "vfmadb %%v3,%%v18,%%v0,%%v3\n\t"
           "vfmadb %%v4,%%v20,%%v0,%%v4\n\t"
@@ -233,14 +233,14 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
           "vfmadb %%v7,%%v27,%%v1,%%v7\n\t"
           "vfmadb %%v8,%%v29,%%v1,%%v8\n\t"
           "vfmadb %%v9,%%v31,%%v1,%%v9\n\t"
-          "vst %%v2,0(%%r1,%[y])\n\t"
-          "vst %%v3,16(%%r1,%[y])\n\t"
-          "vst %%v4,32(%%r1,%[y])\n\t"
-          "vst %%v5,48(%%r1,%[y])\n\t"
-          "vst %%v6,64(%%r1,%[y])\n\t"
-          "vst %%v7,80(%%r1,%[y])\n\t"
-          "vst %%v8,96(%%r1,%[y])\n\t"
-          "vst %%v9,112(%%r1,%[y])\n\t"
+          "vst %%v2,0(%%r1,%[y]),3\n\t"
+          "vst %%v3,16(%%r1,%[y]),3\n\t"
+          "vst %%v4,32(%%r1,%[y]),3\n\t"
+          "vst %%v5,48(%%r1,%[y]),3\n\t"
+          "vst %%v6,64(%%r1,%[y]),3\n\t"
+          "vst %%v7,80(%%r1,%[y]),3\n\t"
+          "vst %%v8,96(%%r1,%[y]),3\n\t"
+          "vst %%v9,112(%%r1,%[y]),3\n\t"
           "agfi %%r1,128\n\t"
           "brctg %%r0,0b\n\t"
           "1:\n\t"
@@ -250,18 +250,18 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
           "jz 3f\n\t"
           "srlg %%r0,%%r0,2\n\t"
           "2:\n\t"
-          "vl %%v16,0(%%r1,%[ap0])\n\t"
-          "vl %%v17,0(%%r1,%[ap1])\n\t"
-          "vl %%v18,16(%%r1,%[ap0])\n\t"
-          "vl %%v19,16(%%r1,%[ap1])\n\t"
-          "vl %%v2,0(%%r1,%[y])\n\t"
-          "vl %%v3,16(%%r1,%[y])\n\t"
+          "vl %%v16,0(%%r1,%[ap0]),3\n\t"
+          "vl %%v17,0(%%r1,%[ap1]),3\n\t"
+          "vl %%v18,16(%%r1,%[ap0]),3\n\t"
+          "vl %%v19,16(%%r1,%[ap1]),3\n\t"
+          "vl %%v2,0(%%r1,%[y]),3\n\t"
+          "vl %%v3,16(%%r1,%[y]),3\n\t"
           "vfmadb %%v2,%%v16,%%v0,%%v2\n\t"
           "vfmadb %%v3,%%v18,%%v0,%%v3\n\t"
           "vfmadb %%v2,%%v17,%%v1,%%v2\n\t"
           "vfmadb %%v3,%%v19,%%v1,%%v3\n\t"
-          "vst %%v2,0(%%r1,%[y])\n\t"
-          "vst %%v3,16(%%r1,%[y])\n\t"
+          "vst %%v2,0(%%r1,%[y]),3\n\t"
+          "vst %%v3,16(%%r1,%[y]),3\n\t"
           "agfi %%r1,32\n\t"
           "brctg %%r0,2b\n\t"
           "3:\n\t"
@@ -289,22 +289,22 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y,
           "0:\n\t"
           "pfd 1,1024(%%r1,%[a0])\n\t"
           "pfd 2,1024(%%r1,%[y])\n\t"
-          "vl %%v16,0(%%r1,%[a0])\n\t"
-          "vl %%v17,16(%%r1,%[a0])\n\t"
-          "vl %%v18,32(%%r1,%[a0])\n\t"
-          "vl %%v19,48(%%r1,%[a0])\n\t"
-          "vl %%v20,64(%%r1,%[a0])\n\t"
-          "vl %%v21,80(%%r1,%[a0])\n\t"
-          "vl %%v22,96(%%r1,%[a0])\n\t"
-          "vl %%v23,112(%%r1,%[a0])\n\t"
-          "vl %%v24,0(%%r1,%[y])\n\t"
-          "vl %%v25,16(%%r1,%[y])\n\t"
-          "vl %%v26,32(%%r1,%[y])\n\t"
-          "vl %%v27,48(%%r1,%[y])\n\t"
-          "vl %%v28,64(%%r1,%[y])\n\t"
-          "vl %%v29,80(%%r1,%[y])\n\t"
-          "vl %%v30,96(%%r1,%[y])\n\t"
-          "vl %%v31,112(%%r1,%[y])\n\t"
+          "vl %%v16,0(%%r1,%[a0]),3\n\t"
+          "vl %%v17,16(%%r1,%[a0]),3\n\t"
+          "vl %%v18,32(%%r1,%[a0]),3\n\t"
+          "vl %%v19,48(%%r1,%[a0]),3\n\t"
+          "vl %%v20,64(%%r1,%[a0]),3\n\t"
+          "vl %%v21,80(%%r1,%[a0]),3\n\t"
+          "vl %%v22,96(%%r1,%[a0]),3\n\t"
+          "vl %%v23,112(%%r1,%[a0]),3\n\t"
+          "vl %%v24,0(%%r1,%[y]),3\n\t"
+          "vl %%v25,16(%%r1,%[y]),3\n\t"
+          "vl %%v26,32(%%r1,%[y]),3\n\t"
+          "vl %%v27,48(%%r1,%[y]),3\n\t"
+          "vl %%v28,64(%%r1,%[y]),3\n\t"
+          "vl %%v29,80(%%r1,%[y]),3\n\t"
+          "vl %%v30,96(%%r1,%[y]),3\n\t"
+          "vl %%v31,112(%%r1,%[y]),3\n\t"
           "vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
           "vfmadb %%v25,%%v17,%%v0,%%v25\n\t"
           "vfmadb %%v26,%%v18,%%v0,%%v26\n\t"
@@ -313,14 +313,14 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y,
           "vfmadb %%v29,%%v21,%%v0,%%v29\n\t"
           "vfmadb %%v30,%%v22,%%v0,%%v30\n\t"
           "vfmadb %%v31,%%v23,%%v0,%%v31\n\t"
-          "vst %%v24,0(%%r1,%[y])\n\t"
-          "vst %%v25,16(%%r1,%[y])\n\t"
-          "vst %%v26,32(%%r1,%[y])\n\t"
-          "vst %%v27,48(%%r1,%[y])\n\t"
-          "vst %%v28,64(%%r1,%[y])\n\t"
-          "vst %%v29,80(%%r1,%[y])\n\t"
-          "vst %%v30,96(%%r1,%[y])\n\t"
-          "vst %%v31,112(%%r1,%[y])\n\t"
+          "vst %%v24,0(%%r1,%[y]),3\n\t"
+          "vst %%v25,16(%%r1,%[y]),3\n\t"
+          "vst %%v26,32(%%r1,%[y]),3\n\t"
+          "vst %%v27,48(%%r1,%[y]),3\n\t"
+          "vst %%v28,64(%%r1,%[y]),3\n\t"
+          "vst %%v29,80(%%r1,%[y]),3\n\t"
+          "vst %%v30,96(%%r1,%[y]),3\n\t"
+          "vst %%v31,112(%%r1,%[y]),3\n\t"
           "agfi %%r1,128\n\t"
           "brctg %%r0,0b\n\t"
           "1:\n\t"
@@ -330,14 +330,14 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y,
           "jz 3f\n\t"
           "srlg %%r0,%%r0,2\n\t"
           "2:\n\t"
-          "vl %%v16,0(%%r1,%[a0])\n\t"
-          "vl %%v17,16(%%r1,%[a0])\n\t"
-          "vl %%v18,0(%%r1,%[y])\n\t"
-          "vl %%v19,16(%%r1,%[y])\n\t"
+          "vl %%v16,0(%%r1,%[a0]),3\n\t"
+          "vl %%v17,16(%%r1,%[a0]),3\n\t"
+          "vl %%v18,0(%%r1,%[y]),3\n\t"
+          "vl %%v19,16(%%r1,%[y]),3\n\t"
           "vfmadb %%v18,%%v16,%%v0,%%v18\n\t"
           "vfmadb %%v19,%%v17,%%v0,%%v19\n\t"
-          "vst %%v18,0(%%r1,%[y])\n\t"
-          "vst %%v19,16(%%r1,%[y])\n\t"
+          "vst %%v18,0(%%r1,%[y]),3\n\t"
+          "vst %%v19,16(%%r1,%[y]),3\n\t"
           "agfi %%r1,32\n\t"
           "brctg %%r0,2b\n\t"
           "3:\n\t"
diff --git a/kernel/zarch/dgemv_t_4.c b/kernel/zarch/dgemv_t_4.c
index 24680cf1b..9fd3c09d6 100644
--- a/kernel/zarch/dgemv_t_4.c
+++ b/kernel/zarch/dgemv_t_4.c
@@ -50,77 +50,77 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
           "pfd 1,1024(%%r1,%[ap2])\n\t"
           "pfd 1,1024(%%r1,%[ap3])\n\t"
           "pfd 1,1024(%%r1,%[x])\n\t"
-          "vl %%v16,0(%%r1,%[x])\n\t"
-          "vl %%v17,16(%%r1,%[x])\n\t"
-          "vl %%v18,32(%%r1,%[x])\n\t"
-          "vl %%v19,48(%%r1,%[x])\n\t"
-          "vl %%v20,64(%%r1,%[x])\n\t"
-          "vl %%v21,80(%%r1,%[x])\n\t"
-          "vl %%v22,96(%%r1,%[x])\n\t"
-          "vl %%v23,112(%%r1,%[x])\n\t"
-          "vl %%v24,0(%%r1,%[ap0])\n\t"
+          "vl %%v16,0(%%r1,%[x]),3\n\t"
+          "vl %%v17,16(%%r1,%[x]),3\n\t"
+          "vl %%v18,32(%%r1,%[x]),3\n\t"
+          "vl %%v19,48(%%r1,%[x]),3\n\t"
+          "vl %%v20,64(%%r1,%[x]),3\n\t"
+          "vl %%v21,80(%%r1,%[x]),3\n\t"
+          "vl %%v22,96(%%r1,%[x]),3\n\t"
+          "vl %%v23,112(%%r1,%[x]),3\n\t"
+          "vl %%v24,0(%%r1,%[ap0]),3\n\t"
           "vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
-          "vl %%v25,0(%%r1,%[ap1])\n\t"
+          "vl %%v25,0(%%r1,%[ap1]),3\n\t"
           "vfmadb %%v1,%%v16,%%v25,%%v1\n\t"
-          "vl %%v26,0(%%r1,%[ap2])\n\t"
+          "vl %%v26,0(%%r1,%[ap2]),3\n\t"
           "vfmadb %%v2,%%v16,%%v26,%%v2\n\t"
-          "vl %%v27,0(%%r1,%[ap3])\n\t"
+          "vl %%v27,0(%%r1,%[ap3]),3\n\t"
           "vfmadb %%v3,%%v16,%%v27,%%v3\n\t"
-          "vl %%v28,16(%%r1,%[ap0])\n\t"
+          "vl %%v28,16(%%r1,%[ap0]),3\n\t"
           "vfmadb %%v4,%%v17,%%v28,%%v4\n\t"
-          "vl %%v29,16(%%r1,%[ap1])\n\t"
+          "vl %%v29,16(%%r1,%[ap1]),3\n\t"
           "vfmadb %%v5,%%v17,%%v29,%%v5\n\t"
-          "vl %%v30,16(%%r1,%[ap2])\n\t"
+          "vl %%v30,16(%%r1,%[ap2]),3\n\t"
           "vfmadb %%v6,%%v17,%%v30,%%v6\n\t"
-          "vl %%v31,16(%%r1,%[ap3])\n\t"
+          "vl %%v31,16(%%r1,%[ap3]),3\n\t"
           "vfmadb %%v7,%%v17,%%v31,%%v7\n\t"
-          "vl %%v24,32(%%r1,%[ap0])\n\t"
+          "vl %%v24,32(%%r1,%[ap0]),3\n\t"
           "vfmadb %%v0,%%v18,%%v24,%%v0\n\t"
-          "vl %%v25,32(%%r1,%[ap1])\n\t"
+          "vl %%v25,32(%%r1,%[ap1]),3\n\t"
           "vfmadb %%v1,%%v18,%%v25,%%v1\n\t"
-          "vl %%v26,32(%%r1,%[ap2])\n\t"
+          "vl %%v26,32(%%r1,%[ap2]),3\n\t"
           "vfmadb %%v2,%%v18,%%v26,%%v2\n\t"
-          "vl %%v27,32(%%r1,%[ap3])\n\t"
+          "vl %%v27,32(%%r1,%[ap3]),3\n\t"
           "vfmadb %%v3,%%v18,%%v27,%%v3\n\t"
-          "vl %%v28,48(%%r1,%[ap0])\n\t"
+          "vl %%v28,48(%%r1,%[ap0]),3\n\t"
           "vfmadb %%v4,%%v19,%%v28,%%v4\n\t"
-          "vl %%v29,48(%%r1,%[ap1])\n\t"
+          "vl %%v29,48(%%r1,%[ap1]),3\n\t"
           "vfmadb %%v5,%%v19,%%v29,%%v5\n\t"
-          "vl %%v30,48(%%r1,%[ap2])\n\t"
+          "vl %%v30,48(%%r1,%[ap2]),3\n\t"
           "vfmadb %%v6,%%v19,%%v30,%%v6\n\t"
-          "vl %%v31,48(%%r1,%[ap3])\n\t"
+          "vl %%v31,48(%%r1,%[ap3]),3\n\t"
           "vfmadb %%v7,%%v19,%%v31,%%v7\n\t"
-          "vl %%v24,64(%%r1,%[ap0])\n\t"
+          "vl %%v24,64(%%r1,%[ap0]),3\n\t"
           "vfmadb %%v0,%%v20,%%v24,%%v0\n\t"
-          "vl %%v25,64(%%r1,%[ap1])\n\t"
+          "vl %%v25,64(%%r1,%[ap1]),3\n\t"
           "vfmadb %%v1,%%v20,%%v25,%%v1\n\t"
-          "vl %%v26,64(%%r1,%[ap2])\n\t"
+          "vl %%v26,64(%%r1,%[ap2]),3\n\t"
           "vfmadb %%v2,%%v20,%%v26,%%v2\n\t"
-          "vl %%v27,64(%%r1,%[ap3])\n\t"
+          "vl %%v27,64(%%r1,%[ap3]),3\n\t"
           "vfmadb %%v3,%%v20,%%v27,%%v3\n\t"
-          "vl %%v28,80(%%r1,%[ap0])\n\t"
+          "vl %%v28,80(%%r1,%[ap0]),3\n\t"
           "vfmadb %%v4,%%v21,%%v28,%%v4\n\t"
-          "vl %%v29,80(%%r1,%[ap1])\n\t"
+          "vl %%v29,80(%%r1,%[ap1]),3\n\t"
           "vfmadb %%v5,%%v21,%%v29,%%v5\n\t"
-          "vl %%v30,80(%%r1,%[ap2])\n\t"
+          "vl %%v30,80(%%r1,%[ap2]),3\n\t"
           "vfmadb %%v6,%%v21,%%v30,%%v6\n\t"
-          "vl %%v31,80(%%r1,%[ap3])\n\t"
+          "vl %%v31,80(%%r1,%[ap3]),3\n\t"
           "vfmadb %%v7,%%v21,%%v31,%%v7\n\t"
-          "vl %%v24,96(%%r1,%[ap0])\n\t"
+          "vl %%v24,96(%%r1,%[ap0]),3\n\t"
           "vfmadb %%v0,%%v22,%%v24,%%v0\n\t"
-          "vl %%v25,96(%%r1,%[ap1])\n\t"
+          "vl %%v25,96(%%r1,%[ap1]),3\n\t"
           "vfmadb %%v1,%%v22,%%v25,%%v1\n\t"
-          "vl %%v26,96(%%r1,%[ap2])\n\t"
+          "vl %%v26,96(%%r1,%[ap2]),3\n\t"
           "vfmadb %%v2,%%v22,%%v26,%%v2\n\t"
-          "vl %%v27,96(%%r1,%[ap3])\n\t"
+          "vl %%v27,96(%%r1,%[ap3]),3\n\t"
           "vfmadb %%v3,%%v22,%%v27,%%v3\n\t"
-          "vl %%v28,112(%%r1,%[ap0])\n\t"
+          "vl %%v28,112(%%r1,%[ap0]),3\n\t"
           "vfmadb %%v4,%%v23,%%v28,%%v4\n\t"
-          "vl %%v29,112(%%r1,%[ap1])\n\t"
+          "vl %%v29,112(%%r1,%[ap1]),3\n\t"
           "vfmadb %%v5,%%v23,%%v29,%%v5\n\t"
-          "vl %%v30,112(%%r1,%[ap2])\n\t"
+          "vl %%v30,112(%%r1,%[ap2]),3\n\t"
           "vfmadb %%v6,%%v23,%%v30,%%v6\n\t"
-          "vl %%v31,112(%%r1,%[ap3])\n\t"
+          "vl %%v31,112(%%r1,%[ap3]),3\n\t"
           "vfmadb %%v7,%%v23,%%v31,%%v7\n\t"
           "agfi %%r1,128\n\t"
           "brctg %%r0,0b\n\t"
@@ -131,23 +131,23 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
           "jz 3f\n\t"
           "srlg %%r0,%%r0,2\n\t"
           "2:\n\t"
-          "vl %%v16,0(%%r1,%[x])\n\t"
-          "vl %%v17,16(%%r1,%[x])\n\t"
-          "vl %%v24,0(%%r1,%[ap0])\n\t"
+          "vl %%v16,0(%%r1,%[x]),3\n\t"
+          "vl %%v17,16(%%r1,%[x]),3\n\t"
+          "vl %%v24,0(%%r1,%[ap0]),3\n\t"
           "vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
-          "vl %%v25,0(%%r1,%[ap1])\n\t"
+          "vl %%v25,0(%%r1,%[ap1]),3\n\t"
           "vfmadb %%v1,%%v16,%%v25,%%v1\n\t"
-          "vl %%v26,0(%%r1,%[ap2])\n\t"
+          "vl %%v26,0(%%r1,%[ap2]),3\n\t"
           "vfmadb %%v2,%%v16,%%v26,%%v2\n\t"
-          "vl %%v27,0(%%r1,%[ap3])\n\t"
+          "vl %%v27,0(%%r1,%[ap3]),3\n\t"
           "vfmadb %%v3,%%v16,%%v27,%%v3\n\t"
-          "vl %%v28,16(%%r1,%[ap0])\n\t"
+          "vl %%v28,16(%%r1,%[ap0]),3\n\t"
           "vfmadb %%v4,%%v17,%%v28,%%v4\n\t"
-          "vl %%v29,16(%%r1,%[ap1])\n\t"
+          "vl %%v29,16(%%r1,%[ap1]),3\n\t"
           "vfmadb %%v5,%%v17,%%v29,%%v5\n\t"
-          "vl %%v30,16(%%r1,%[ap2])\n\t"
+          "vl %%v30,16(%%r1,%[ap2]),3\n\t"
           "vfmadb %%v6,%%v17,%%v30,%%v6\n\t"
-          "vl %%v31,16(%%r1,%[ap3])\n\t"
+          "vl %%v31,16(%%r1,%[ap3]),3\n\t"
           "vfmadb %%v7,%%v17,%%v31,%%v7\n\t"
           "agfi %%r1,32\n\t"
           "brctg %%r0,2b\n\t"
@@ -198,45 +198,45 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
           "pfd 1,1024(%%r1,%[ap0])\n\t"
           "pfd 1,1024(%%r1,%[ap1])\n\t"
           "pfd 1,1024(%%r1,%[x])\n\t"
-          "vl %%v16,0(%%r1,%[x])\n\t"
-          "vl %%v17,16(%%r1,%[x])\n\t"
-          "vl %%v18,32(%%r1,%[x])\n\t"
-          "vl %%v19,48(%%r1,%[x])\n\t"
-          "vl %%v20,64(%%r1,%[x])\n\t"
-          "vl %%v21,80(%%r1,%[x])\n\t"
-          "vl %%v22,96(%%r1,%[x])\n\t"
-          "vl %%v23,112(%%r1,%[x])\n\t"
-          "vl %%v24,0(%%r1,%[ap0])\n\t"
+          "vl %%v16,0(%%r1,%[x]),3\n\t"
+          "vl %%v17,16(%%r1,%[x]),3\n\t"
+          "vl %%v18,32(%%r1,%[x]),3\n\t"
+          "vl %%v19,48(%%r1,%[x]),3\n\t"
+          "vl %%v20,64(%%r1,%[x]),3\n\t"
+          "vl %%v21,80(%%r1,%[x]),3\n\t"
+          "vl %%v22,96(%%r1,%[x]),3\n\t"
+          "vl %%v23,112(%%r1,%[x]),3\n\t"
+          "vl %%v24,0(%%r1,%[ap0]),3\n\t"
           "vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
-          "vl %%v25,0(%%r1,%[ap1])\n\t"
+          "vl %%v25,0(%%r1,%[ap1]),3\n\t"
           "vfmadb %%v1,%%v16,%%v25,%%v1\n\t"
-          "vl %%v26,16(%%r1,%[ap0])\n\t"
+          "vl %%v26,16(%%r1,%[ap0]),3\n\t"
           "vfmadb %%v2,%%v17,%%v26,%%v2\n\t"
-          "vl %%v27,16(%%r1,%[ap1])\n\t"
+          "vl %%v27,16(%%r1,%[ap1]),3\n\t"
           "vfmadb %%v3,%%v17,%%v27,%%v3\n\t"
-          "vl %%v28,32(%%r1,%[ap0])\n\t"
+          "vl %%v28,32(%%r1,%[ap0]),3\n\t"
           "vfmadb %%v4,%%v18,%%v28,%%v4\n\t"
-          "vl %%v29,32(%%r1,%[ap1])\n\t"
+          "vl %%v29,32(%%r1,%[ap1]),3\n\t"
           "vfmadb %%v5,%%v18,%%v29,%%v5\n\t"
-          "vl %%v30,48(%%r1,%[ap0])\n\t"
+          "vl %%v30,48(%%r1,%[ap0]),3\n\t"
           "vfmadb %%v6,%%v19,%%v30,%%v6\n\t"
-          "vl %%v31,48(%%r1,%[ap1])\n\t"
+          "vl %%v31,48(%%r1,%[ap1]),3\n\t"
           "vfmadb %%v7,%%v19,%%v31,%%v7\n\t"
-          "vl %%v24,64(%%r1,%[ap0])\n\t"
+          "vl %%v24,64(%%r1,%[ap0]),3\n\t"
           "vfmadb %%v0,%%v20,%%v24,%%v0\n\t"
-          "vl %%v25,64(%%r1,%[ap1])\n\t"
+          "vl %%v25,64(%%r1,%[ap1]),3\n\t"
           "vfmadb %%v1,%%v20,%%v25,%%v1\n\t"
-          "vl %%v26,80(%%r1,%[ap0])\n\t"
+          "vl %%v26,80(%%r1,%[ap0]),3\n\t"
           "vfmadb %%v2,%%v21,%%v26,%%v2\n\t"
-          "vl %%v27,80(%%r1,%[ap1])\n\t"
+          "vl %%v27,80(%%r1,%[ap1]),3\n\t"
           "vfmadb %%v3,%%v21,%%v27,%%v3\n\t"
-          "vl %%v28,96(%%r1,%[ap0])\n\t"
+          "vl %%v28,96(%%r1,%[ap0]),3\n\t"
           "vfmadb %%v4,%%v22,%%v28,%%v4\n\t"
-          "vl %%v29,96(%%r1,%[ap1])\n\t"
+          "vl %%v29,96(%%r1,%[ap1]),3\n\t"
           "vfmadb %%v5,%%v22,%%v29,%%v5\n\t"
-          "vl %%v30,112(%%r1,%[ap0])\n\t"
+          "vl %%v30,112(%%r1,%[ap0]),3\n\t"
           "vfmadb %%v6,%%v23,%%v30,%%v6\n\t"
-          "vl %%v31,112(%%r1,%[ap1])\n\t"
+          "vl %%v31,112(%%r1,%[ap1]),3\n\t"
           "vfmadb %%v7,%%v23,%%v31,%%v7\n\t"
           "agfi %%r1,128\n\t"
           "brctg %%r0,0b\n\t"
@@ -247,15 +247,15 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
           "jz 3f\n\t"
           "srlg %%r0,%%r0,2\n\t"
           "2:\n\t"
-          "vl %%v16,0(%%r1,%[x])\n\t"
-          "vl %%v17,16(%%r1,%[x])\n\t"
-          "vl %%v24,0(%%r1,%[ap0])\n\t"
+          "vl %%v16,0(%%r1,%[x]),3\n\t"
+          "vl %%v17,16(%%r1,%[x]),3\n\t"
+          "vl %%v24,0(%%r1,%[ap0]),3\n\t"
           "vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
-          "vl %%v25,0(%%r1,%[ap1])\n\t"
+          "vl %%v25,0(%%r1,%[ap1]),3\n\t"
           "vfmadb %%v1,%%v16,%%v25,%%v1\n\t"
-          "vl %%v26,16(%%r1,%[ap0])\n\t"
+          "vl %%v26,16(%%r1,%[ap0]),3\n\t"
           "vfmadb %%v2,%%v17,%%v26,%%v2\n\t"
-          "vl %%v27,16(%%r1,%[ap1])\n\t"
+          "vl %%v27,16(%%r1,%[ap1]),3\n\t"
           "vfmadb %%v3,%%v17,%%v27,%%v3\n\t"
           "agfi %%r1,32\n\t"
           "brctg %%r0,2b\n\t"
@@ -299,29 +299,29 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) {
           "0:\n\t"
           "pfd 1,1024(%%r1,%[a0])\n\t"
           "pfd 1,1024(%%r1,%[x])\n\t"
-          "vl %%v16,0(%%r1,%[x])\n\t"
-          "vl %%v17,16(%%r1,%[x])\n\t"
-          "vl %%v18,32(%%r1,%[x])\n\t"
-          "vl %%v19,48(%%r1,%[x])\n\t"
-          "vl %%v20,64(%%r1,%[x])\n\t"
-          "vl %%v21,80(%%r1,%[x])\n\t"
-          "vl %%v22,96(%%r1,%[x])\n\t"
-          "vl %%v23,112(%%r1,%[x])\n\t"
-          "vl %%v24,0(%%r1,%[a0])\n\t"
+          "vl %%v16,0(%%r1,%[x]),3\n\t"
+          "vl %%v17,16(%%r1,%[x]),3\n\t"
+          "vl %%v18,32(%%r1,%[x]),3\n\t"
+          "vl %%v19,48(%%r1,%[x]),3\n\t"
+          "vl %%v20,64(%%r1,%[x]),3\n\t"
+          "vl %%v21,80(%%r1,%[x]),3\n\t"
+          "vl %%v22,96(%%r1,%[x]),3\n\t"
+          "vl %%v23,112(%%r1,%[x]),3\n\t"
+          "vl %%v24,0(%%r1,%[a0]),3\n\t"
           "vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
-          "vl %%v25,16(%%r1,%[a0])\n\t"
+          "vl %%v25,16(%%r1,%[a0]),3\n\t"
           "vfmadb %%v1,%%v17,%%v25,%%v1\n\t"
-          "vl %%v26,32(%%r1,%[a0])\n\t"
+          "vl %%v26,32(%%r1,%[a0]),3\n\t"
           "vfmadb %%v2,%%v18,%%v26,%%v2\n\t"
-          "vl %%v27,48(%%r1,%[a0])\n\t"
+          "vl %%v27,48(%%r1,%[a0]),3\n\t"
           "vfmadb %%v3,%%v19,%%v27,%%v3\n\t"
-          "vl %%v28,64(%%r1,%[a0])\n\t"
+          "vl %%v28,64(%%r1,%[a0]),3\n\t"
           "vfmadb %%v4,%%v20,%%v28,%%v4\n\t"
-          "vl %%v29,80(%%r1,%[a0])\n\t"
+          "vl %%v29,80(%%r1,%[a0]),3\n\t"
           "vfmadb %%v5,%%v21,%%v29,%%v5\n\t"
-          "vl %%v30,96(%%r1,%[a0])\n\t"
+          "vl %%v30,96(%%r1,%[a0]),3\n\t"
           "vfmadb %%v6,%%v22,%%v30,%%v6\n\t"
-          "vl %%v31,112(%%r1,%[a0])\n\t"
+          "vl %%v31,112(%%r1,%[a0]),3\n\t"
           "vfmadb %%v7,%%v23,%%v31,%%v7\n\t"
           "agfi %%r1,128\n\t"
           "brctg %%r0,0b\n\t"
@@ -332,11 +332,11 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) {
           "jz 3f\n\t"
           "srlg %%r0,%%r0,2\n\t"
           "2:\n\t"
-          "vl %%v16,0(%%r1,%[x])\n\t"
-          "vl %%v17,16(%%r1,%[x])\n\t"
-          "vl %%v24,0(%%r1,%[a0])\n\t"
+          "vl %%v16,0(%%r1,%[x]),3\n\t"
+          "vl %%v17,16(%%r1,%[x]),3\n\t"
+          "vl %%v24,0(%%r1,%[a0]),3\n\t"
           "vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
-          "vl %%v25,16(%%r1,%[a0])\n\t"
+          "vl %%v25,16(%%r1,%[a0]),3\n\t"
           "vfmadb %%v1,%%v17,%%v25,%%v1\n\t"
           "agfi %%r1,32\n\t"
           "brctg %%r0,2b\n\t"
@@ -378,38 +378,38 @@ static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) {
           "0:\n\t"
           "pfd 1,1024(%%r1,%[src])\n\t"
           "pfd 2,1024(%%r1,%[dest])\n\t"
-          "vl %%v16,0(%%r1,%[src])\n\t"
-          "vl %%v17,16(%%r1,%[src])\n\t"
-          "vl %%v18,32(%%r1,%[src])\n\t"
-          "vl %%v19,48(%%r1,%[src])\n\t"
-          "vl %%v20,64(%%r1,%[src])\n\t"
-          "vl %%v21,80(%%r1,%[src])\n\t"
-          "vl %%v22,96(%%r1,%[src])\n\t"
-          "vl %%v23,112(%%r1,%[src])\n\t"
-          "vl %%v24, 0(%%r1,%[dest])\n\t"
+          "vl %%v16,0(%%r1,%[src]),3\n\t"
+          "vl %%v17,16(%%r1,%[src]),3\n\t"
+          "vl %%v18,32(%%r1,%[src]),3\n\t"
+          "vl %%v19,48(%%r1,%[src]),3\n\t"
+          "vl %%v20,64(%%r1,%[src]),3\n\t"
+          "vl %%v21,80(%%r1,%[src]),3\n\t"
+          "vl %%v22,96(%%r1,%[src]),3\n\t"
+          "vl %%v23,112(%%r1,%[src]),3\n\t"
+          "vl %%v24, 0(%%r1,%[dest]),3\n\t"
           "vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
-          "vst %%v24, 0(%%r1,%[dest])\n\t"
-          "vl %%v25, 16(%%r1,%[dest])\n\t"
+          "vst %%v24, 0(%%r1,%[dest]),3\n\t"
+          "vl %%v25, 16(%%r1,%[dest]),3\n\t"
           "vfmadb %%v25,%%v17,%%v0,%%v25\n\t"
-          "vst %%v25, 16(%%r1,%[dest])\n\t"
-          "vl %%v26, 32(%%r1,%[dest])\n\t"
+          "vst %%v25, 16(%%r1,%[dest]),3\n\t"
+          "vl %%v26, 32(%%r1,%[dest]),3\n\t"
           "vfmadb %%v26,%%v18,%%v0,%%v26\n\t"
-          "vst %%v26, 32(%%r1,%[dest])\n\t"
-          "vl %%v27, 48(%%r1,%[dest])\n\t"
+          "vst %%v26, 32(%%r1,%[dest]),3\n\t"
+          "vl %%v27, 48(%%r1,%[dest]),3\n\t"
           "vfmadb %%v27,%%v19,%%v0,%%v27\n\t"
-          "vst %%v27, 48(%%r1,%[dest])\n\t"
-          "vl %%v28, 64(%%r1,%[dest])\n\t"
+          "vst %%v27, 48(%%r1,%[dest]),3\n\t"
+          "vl %%v28, 64(%%r1,%[dest]),3\n\t"
           "vfmadb %%v28,%%v20,%%v0,%%v28\n\t"
-          "vst %%v28, 64(%%r1,%[dest])\n\t"
-          "vl %%v29, 80(%%r1,%[dest])\n\t"
+          "vst %%v28, 64(%%r1,%[dest]),3\n\t"
+          "vl %%v29, 80(%%r1,%[dest]),3\n\t"
           "vfmadb %%v29,%%v21,%%v0,%%v29\n\t"
-          "vst %%v29, 80(%%r1,%[dest])\n\t"
-          "vl %%v30, 96(%%r1,%[dest])\n\t"
+          "vst %%v29, 80(%%r1,%[dest]),3\n\t"
+          "vl %%v30, 96(%%r1,%[dest]),3\n\t"
           "vfmadb %%v30,%%v22,%%v0,%%v30\n\t"
-          "vst %%v30, 96(%%r1,%[dest])\n\t"
-          "vl %%v31, 112(%%r1,%[dest])\n\t"
+          "vst %%v30, 96(%%r1,%[dest]),3\n\t"
+          "vl %%v31, 112(%%r1,%[dest]),3\n\t"
           "vfmadb %%v31,%%v23,%%v0,%%v31\n\t"
-          "vst %%v31, 112(%%r1,%[dest])\n\t"
+          "vst %%v31, 112(%%r1,%[dest]),3\n\t"
           "agfi %%r1,128\n\t"
           "brctg %%r0,0b\n\t"
           "1:\n\t"
@@ -419,14 +419,14 @@ static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) {
           "jz 3f\n\t"
           "srlg %%r0,%%r0,2\n\t"
           "2:\n\t"
-          "vl %%v16,0(%%r1,%[src])\n\t"
-          "vl %%v17,16(%%r1,%[src])\n\t"
-          "vl %%v24, 0(%%r1,%[dest])\n\t"
+          "vl %%v16,0(%%r1,%[src]),3\n\t"
+          "vl %%v17,16(%%r1,%[src]),3\n\t"
+          "vl %%v24, 0(%%r1,%[dest]),3\n\t"
           "vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
-          "vst %%v24, 0(%%r1,%[dest])\n\t"
-          "vl %%v25, 16(%%r1,%[dest])\n\t"
+          "vst %%v24, 0(%%r1,%[dest]),3\n\t"
+          "vl %%v25, 16(%%r1,%[dest]),3\n\t"
           "vfmadb %%v25,%%v17,%%v0,%%v25\n\t"
-          "vst %%v25, 16(%%r1,%[dest])\n\t"
+          "vst %%v25, 16(%%r1,%[dest]),3\n\t"
           "agfi %%r1,32\n\t"
           "brctg %%r0,2b\n\t"
           "3:\n\t"
diff --git a/kernel/zarch/dmax.c b/kernel/zarch/dmax.c
index 65ed31f01..cc0f23c87 100644
--- a/kernel/zarch/dmax.c
+++ b/kernel/zarch/dmax.c
@@ -30,27 +30,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) {
   FLOAT max;
 
-  __asm__("vl %%v0,0(%[x])\n\t"
+  __asm__("vl %%v0,0(%[x]),3\n\t"
           "srlg %[n],%[n],5\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 1, 1024(%%r1,%[x])\n\t"
-          "vl %%v16,0(%%r1,%[x])\n\t"
-          "vl %%v17,16(%%r1,%[x])\n\t"
-          "vl %%v18,32(%%r1,%[x])\n\t"
-          "vl %%v19,48(%%r1,%[x])\n\t"
-          "vl %%v20,64(%%r1,%[x])\n\t"
-          "vl %%v21,80(%%r1,%[x])\n\t"
-          "vl %%v22,96(%%r1,%[x])\n\t"
-          "vl %%v23,112(%%r1,%[x])\n\t"
-          "vl %%v24,128(%%r1,%[x])\n\t"
-          "vl %%v25,144(%%r1,%[x])\n\t"
-          "vl %%v26,160(%%r1,%[x])\n\t"
-          "vl %%v27,176(%%r1,%[x])\n\t"
-          "vl %%v28,192(%%r1,%[x])\n\t"
-          "vl %%v29,208(%%r1,%[x])\n\t"
-          "vl %%v30,224(%%r1,%[x])\n\t"
-          "vl %%v31,240(%%r1,%[x])\n\t"
+          "vl %%v16,0(%%r1,%[x]),3\n\t"
+          "vl %%v17,16(%%r1,%[x]),3\n\t"
+          "vl %%v18,32(%%r1,%[x]),3\n\t"
+          "vl %%v19,48(%%r1,%[x]),3\n\t"
+          "vl %%v20,64(%%r1,%[x]),3\n\t"
+          "vl %%v21,80(%%r1,%[x]),3\n\t"
+          "vl %%v22,96(%%r1,%[x]),3\n\t"
+          "vl %%v23,112(%%r1,%[x]),3\n\t"
+          "vl %%v24,128(%%r1,%[x]),3\n\t"
+          "vl %%v25,144(%%r1,%[x]),3\n\t"
+          "vl %%v26,160(%%r1,%[x]),3\n\t"
+          "vl %%v27,176(%%r1,%[x]),3\n\t"
+          "vl %%v28,192(%%r1,%[x]),3\n\t"
+          "vl %%v29,208(%%r1,%[x]),3\n\t"
+          "vl %%v30,224(%%r1,%[x]),3\n\t"
+          "vl %%v31,240(%%r1,%[x]),3\n\t"
           "vfmaxdb %%v16,%%v16,%%v24,0\n\t"
           "vfmaxdb %%v17,%%v17,%%v25,0\n\t"
           "vfmaxdb %%v18,%%v18,%%v26,0\n\t"
diff --git a/kernel/zarch/dmax_z13.c b/kernel/zarch/dmax_z13.c
index 87bccbe55..83d827d35 100644
--- a/kernel/zarch/dmax_z13.c
+++ b/kernel/zarch/dmax_z13.c
@@ -30,19 +30,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) {
   FLOAT max;
 
-  __asm__("vl %%v0,0(%[x])\n\t"
+  __asm__("vl %%v0,0(%[x]),3\n\t"
           "srlg %[n],%[n],5\n\t"
           "xgr %%r1,%%r1\n\t"
           "0:\n\t"
           "pfd 1, 1024(%%r1,%[x])\n\t"
-          "vl %%v16,0(%%r1,%[x])\n\t"
-          "vl %%v17,16(%%r1,%[x])\n\t"
-          "vl %%v18,32(%%r1,%[x])\n\t"
-          "vl %%v19,48(%%r1,%[x])\n\t"
-          "vl %%v20,64(%%r1,%[x])\n\t"
-          "vl %%v21,80(%%r1,%[x])\n\t"
-          "vl %%v22,96(%%r1,%[x])\n\t"
-          "vl %%v23,112(%%r1,%[x])\n\t"
+          "vl %%v16,0(%%r1,%[x]),3\n\t"
+          "vl %%v17,16(%%r1,%[x]),3\n\t"
+          "vl %%v18,32(%%r1,%[x]),3\n\t"
+          "vl %%v19,48(%%r1,%[x]),3\n\t"
+          "vl %%v20,64(%%r1,%[x]),3\n\t"
+          "vl %%v21,80(%%r1,%[x]),3\n\t"
+          "vl %%v22,96(%%r1,%[x]),3\n\t"
+          "vl %%v23,112(%%r1,%[x]),3\n\t"
           "vfchdb %%v24,%%v16,%%v17\n\t"
           "vfchdb %%v25,%%v18,%%v19\n\t"
           "vfchdb %%v26,%%v20,%%v21\n\t"
@@ -59,14 +59,14 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) {
           "vsel %%v30,%%v28,%%v29,%%v30\n\t"
           "vfchdb %%v31,%%v30,%%v0\n\t"
           "vsel %%v0,%%v30,%%v0,%%v31\n\t"
-          "vl %%v16,128(%%r1,%[x])\n\t"
-          "vl %%v17,144(%%r1,%[x])\n\t"
-          "vl %%v18,160(%%r1,%[x])\n\t"
-          "vl %%v19,176(%%r1,%[x])\n\t"
-          "vl %%v20,192(%%r1,%[x])\n\t"
-          "vl %%v21,208(%%r1,%[x])\n\t"
-          "vl %%v22,224(%%r1,%[x])\n\t"
-          "vl %%v23,240(%%r1,%[x])\n\t"
+          "vl %%v16,128(%%r1,%[x]),3\n\t"
+          "vl %%v17,144(%%r1,%[x]),3\n\t"
+          "vl %%v18,160(%%r1,%[x]),3\n\t"
+          "vl %%v19,176(%%r1,%[x]),3\n\t"
+          "vl %%v20,192(%%r1,%[x]),3\n\t"
+          "vl %%v21,208(%%r1,%[x]),3\n\t"
+          "vl %%v22,224(%%r1,%[x]),3\n\t"
+          "vl %%v23,240(%%r1,%[x]),3\n\t"
           "vfchdb %%v24,%%v16,%%v17\n\t"
           "vfchdb %%v25,%%v18,%%v19\n\t"
           "vfchdb %%v26,%%v20,%%v21\n\t"
diff --git a/kernel/zarch/dmin.c b/kernel/zarch/dmin.c
index 518cc262c..754828b7c 100644
--- a/kernel/zarch/dmin.c
+++ b/kernel/zarch/dmin.c
@@ -30,27 +30,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) {
   FLOAT min;
 
-  __asm__("vl %%v0,0(%[x])\n\t"
+  __asm__("vl %%v0,0(%[x]),3\n\t"
           "srlg %[n],%[n],5\n\t"
           "xgr %%r1,%%r1\n\t"
           "0:\n\t"
           "pfd 1, 1024(%%r1,%[x])\n\t"
-          "vl %%v16,0(%%r1,%[x])\n\t"
-          "vl %%v17,16(%%r1,%[x])\n\t"
-          "vl %%v18,32(%%r1,%[x])\n\t"
-          "vl %%v19,48(%%r1,%[x])\n\t"
-          "vl %%v20,64(%%r1,%[x])\n\t"
-          "vl %%v21,80(%%r1,%[x])\n\t"
-          "vl %%v22,96(%%r1,%[x])\n\t"
-          "vl %%v23,112(%%r1,%[x])\n\t"
-          "vl %%v24,128(%%r1,%[x])\n\t"
-          "vl %%v25,144(%%r1,%[x])\n\t"
-          "vl %%v26,160(%%r1,%[x])\n\t"
-          "vl %%v27,176(%%r1,%[x])\n\t"
-          "vl %%v28,192(%%r1,%[x])\n\t"
-          "vl %%v29,208(%%r1,%[x])\n\t"
-          "vl %%v30,224(%%r1,%[x])\n\t"
-          "vl %%v31,240(%%r1,%[x])\n\t"
+          "vl %%v16,0(%%r1,%[x]),3\n\t"
+          "vl %%v17,16(%%r1,%[x]),3\n\t"
+          "vl %%v18,32(%%r1,%[x]),3\n\t"
+          "vl %%v19,48(%%r1,%[x]),3\n\t"
+          "vl %%v20,64(%%r1,%[x]),3\n\t"
+          "vl %%v21,80(%%r1,%[x]),3\n\t"
+          "vl %%v22,96(%%r1,%[x]),3\n\t"
+          "vl %%v23,112(%%r1,%[x]),3\n\t"
+          "vl %%v24,128(%%r1,%[x]),3\n\t"
+          "vl %%v25,144(%%r1,%[x]),3\n\t"
+          "vl %%v26,160(%%r1,%[x]),3\n\t"
+          "vl %%v27,176(%%r1,%[x]),3\n\t"
+          "vl %%v28,192(%%r1,%[x]),3\n\t"
+          "vl %%v29,208(%%r1,%[x]),3\n\t"
+          "vl %%v30,224(%%r1,%[x]),3\n\t"
+          "vl %%v31,240(%%r1,%[x]),3\n\t"
           "vfmindb %%v16,%%v16,%%v24,0\n\t"
           "vfmindb %%v17,%%v17,%%v25,0\n\t"
           "vfmindb %%v18,%%v18,%%v26,0\n\t"
diff --git a/kernel/zarch/dmin_z13.c b/kernel/zarch/dmin_z13.c
index 91561992f..ff0fca48c 100644
--- a/kernel/zarch/dmin_z13.c
+++ b/kernel/zarch/dmin_z13.c
@@ -30,19 +30,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) {
   FLOAT min;
 
-  __asm__("vl %%v0,0(%[x])\n\t"
+  __asm__("vl %%v0,0(%[x]),3\n\t"
           "srlg %[n],%[n],5\n\t"
           "xgr %%r1,%%r1\n\t"
           "0:\n\t"
           "pfd 1, 1024(%%r1,%[x])\n\t"
-          "vl %%v16,0(%%r1,%[x])\n\t"
-          "vl %%v17,16(%%r1,%[x])\n\t"
-          "vl %%v18,32(%%r1,%[x])\n\t"
-          "vl %%v19,48(%%r1,%[x])\n\t"
-          "vl %%v20,64(%%r1,%[x])\n\t"
-          "vl %%v21,80(%%r1,%[x])\n\t"
-          "vl %%v22,96(%%r1,%[x])\n\t"
-          "vl %%v23,112(%%r1,%[x])\n\t"
+          "vl %%v16,0(%%r1,%[x]),3\n\t"
+          "vl %%v17,16(%%r1,%[x]),3\n\t"
+          "vl %%v18,32(%%r1,%[x]),3\n\t"
+          "vl %%v19,48(%%r1,%[x]),3\n\t"
+          "vl %%v20,64(%%r1,%[x]),3\n\t"
+          "vl %%v21,80(%%r1,%[x]),3\n\t"
+          "vl %%v22,96(%%r1,%[x]),3\n\t"
+          "vl %%v23,112(%%r1,%[x]),3\n\t"
           "vfchdb %%v24,%%v17,%%v16\n\t"
           "vfchdb %%v25,%%v19,%%v18\n\t"
           "vfchdb %%v26,%%v21,%%v20\n\t"
@@ -59,14 +59,14 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) {
           "vsel %%v30,%%v28,%%v29,%%v30\n\t"
           "vfchdb %%v31,%%v0,%%v30\n\t"
           "vsel %%v0,%%v30,%%v0,%%v31\n\t"
-          "vl %%v16,128(%%r1,%[x])\n\t"
-          "vl %%v17,144(%%r1,%[x])\n\t"
-          "vl %%v18,160(%%r1,%[x])\n\t"
-          "vl %%v19,176(%%r1,%[x])\n\t"
-          "vl %%v20,192(%%r1,%[x])\n\t"
-          "vl %%v21,208(%%r1,%[x])\n\t"
-          "vl %%v22,224(%%r1,%[x])\n\t"
-          "vl %%v23,240(%%r1,%[x])\n\t"
+          "vl %%v16,128(%%r1,%[x]),3\n\t"
+          "vl %%v17,144(%%r1,%[x]),3\n\t"
+          "vl %%v18,160(%%r1,%[x]),3\n\t"
+          "vl %%v19,176(%%r1,%[x]),3\n\t"
+          "vl %%v20,192(%%r1,%[x]),3\n\t"
+          "vl %%v21,208(%%r1,%[x]),3\n\t"
+          "vl %%v22,224(%%r1,%[x]),3\n\t"
+          "vl %%v23,240(%%r1,%[x]),3\n\t"
           "vfchdb %%v24,%%v17,%%v16\n\t"
           "vfchdb %%v25,%%v19,%%v18\n\t"
           "vfchdb %%v26,%%v21,%%v20\n\t"
diff --git a/kernel/zarch/drot.c b/kernel/zarch/drot.c
index 8f0197f02..de2207fcd 100644
--- a/kernel/zarch/drot.c
+++ b/kernel/zarch/drot.c
@@ -35,14 +35,14 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
           "0:\n\t"
           "pfd 2, 1024(%%r1,%[x])\n\t"
           "pfd 2, 1024(%%r1,%[y])\n\t"
-          "vl %%v24, 0(%%r1,%[x])\n\t"
-          "vl %%v25, 16(%%r1,%[x])\n\t"
-          "vl %%v26, 32(%%r1,%[x])\n\t"
-          "vl %%v27, 48(%%r1,%[x])\n\t"
-          "vl %%v16, 0(%%r1,%[y])\n\t"
-          "vl %%v17, 16(%%r1,%[y])\n\t"
-          "vl %%v18, 32(%%r1,%[y])\n\t"
-          "vl %%v19, 48(%%r1,%[y])\n\t"
+          "vl %%v24, 0(%%r1,%[x]),3\n\t"
+          "vl %%v25, 16(%%r1,%[x]),3\n\t"
+          "vl %%v26, 32(%%r1,%[x]),3\n\t"
+          "vl %%v27, 48(%%r1,%[x]),3\n\t"
+          "vl %%v16, 0(%%r1,%[y]),3\n\t"
+          "vl %%v17, 16(%%r1,%[y]),3\n\t"
+          "vl %%v18, 32(%%r1,%[y]),3\n\t"
+          "vl %%v19, 48(%%r1,%[y]),3\n\t"
           "vfmdb %%v28,%%v24,%%v0\n\t"
           "vfmdb %%v29,%%v25,%%v0\n\t"
           "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@@ -60,22 +60,22 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
           "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
           "vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
           "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
-          "vst %%v28, 0(%%r1,%[x])\n\t"
-          "vst %%v29, 16(%%r1,%[x])\n\t"
-          "vst %%v30, 32(%%r1,%[x])\n\t"
-          "vst %%v31, 48(%%r1,%[x])\n\t"
-          "vst %%v20, 0(%%r1,%[y])\n\t"
-          "vst %%v21, 16(%%r1,%[y])\n\t"
-          "vst %%v22, 32(%%r1,%[y])\n\t"
-          "vst %%v23, 48(%%r1,%[y])\n\t"
-          "vl %%v24, 64(%%r1,%[x])\n\t"
-          "vl %%v25, 80(%%r1,%[x])\n\t"
-          "vl %%v26, 96(%%r1,%[x])\n\t"
-          "vl %%v27, 112(%%r1,%[x])\n\t"
-          "vl %%v16, 64(%%r1,%[y])\n\t"
-          "vl %%v17, 80(%%r1,%[y])\n\t"
-          "vl %%v18, 96(%%r1,%[y])\n\t"
-          "vl %%v19, 112(%%r1,%[y])\n\t"
+          "vst %%v28, 0(%%r1,%[x]),3\n\t"
+          "vst %%v29, 16(%%r1,%[x]),3\n\t"
+          "vst %%v30, 32(%%r1,%[x]),3\n\t"
+          "vst %%v31, 48(%%r1,%[x]),3\n\t"
+          "vst %%v20, 0(%%r1,%[y]),3\n\t"
+          "vst %%v21, 16(%%r1,%[y]),3\n\t"
+          "vst %%v22, 32(%%r1,%[y]),3\n\t"
+          "vst %%v23, 48(%%r1,%[y]),3\n\t"
+          "vl %%v24, 64(%%r1,%[x]),3\n\t"
+          "vl %%v25, 80(%%r1,%[x]),3\n\t"
+          "vl %%v26, 96(%%r1,%[x]),3\n\t"
+          "vl %%v27, 112(%%r1,%[x]),3\n\t"
+          "vl %%v16, 64(%%r1,%[y]),3\n\t"
+          "vl %%v17, 80(%%r1,%[y]),3\n\t"
+          "vl %%v18, 96(%%r1,%[y]),3\n\t"
+          "vl %%v19, 112(%%r1,%[y]),3\n\t"
           "vfmdb %%v28,%%v24,%%v0\n\t"
           "vfmdb %%v29,%%v25,%%v0\n\t"
           "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@@ -93,22 +93,22 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
           "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
           "vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
           "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
-          "vst %%v28, 64(%%r1,%[x])\n\t"
-          "vst %%v29, 80(%%r1,%[x])\n\t"
-          "vst %%v30, 96(%%r1,%[x])\n\t"
-          "vst %%v31, 112(%%r1,%[x])\n\t"
-          "vst %%v20, 64(%%r1,%[y])\n\t"
-          "vst %%v21, 80(%%r1,%[y])\n\t"
-          "vst %%v22, 96(%%r1,%[y])\n\t"
-          "vst %%v23, 112(%%r1,%[y])\n\t"
-          "vl %%v24, 128(%%r1,%[x])\n\t"
-          "vl %%v25, 144(%%r1,%[x])\n\t"
-          "vl %%v26, 160(%%r1,%[x])\n\t"
-          "vl %%v27, 176(%%r1,%[x])\n\t"
-          "vl %%v16, 128(%%r1,%[y])\n\t"
-          "vl %%v17, 144(%%r1,%[y])\n\t"
-          "vl %%v18, 160(%%r1,%[y])\n\t"
-          "vl %%v19, 176(%%r1,%[y])\n\t"
+          "vst %%v28, 64(%%r1,%[x]),3\n\t"
+          "vst %%v29, 80(%%r1,%[x]),3\n\t"
+          "vst %%v30, 96(%%r1,%[x]),3\n\t"
+          "vst %%v31, 112(%%r1,%[x]),3\n\t"
+          "vst %%v20, 64(%%r1,%[y]),3\n\t"
+          "vst %%v21, 80(%%r1,%[y]),3\n\t"
+          "vst %%v22, 96(%%r1,%[y]),3\n\t"
+          "vst %%v23, 112(%%r1,%[y]),3\n\t"
+          "vl %%v24, 128(%%r1,%[x]),3\n\t"
+          "vl %%v25, 144(%%r1,%[x]),3\n\t"
+          "vl %%v26, 160(%%r1,%[x]),3\n\t"
+          "vl %%v27, 176(%%r1,%[x]),3\n\t"
+          "vl %%v16, 128(%%r1,%[y]),3\n\t"
+          "vl %%v17, 144(%%r1,%[y]),3\n\t"
+          "vl %%v18, 160(%%r1,%[y]),3\n\t"
+          "vl %%v19, 176(%%r1,%[y]),3\n\t"
           "vfmdb %%v28,%%v24,%%v0\n\t"
           "vfmdb %%v29,%%v25,%%v0\n\t"
           "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@@ -126,22 +126,22 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
           "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
           "vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
           "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
-          "vst %%v28, 128(%%r1,%[x])\n\t"
-          "vst %%v29, 144(%%r1,%[x])\n\t"
-          "vst %%v30, 160(%%r1,%[x])\n\t"
-          "vst %%v31, 176(%%r1,%[x])\n\t"
-          "vst %%v20, 128(%%r1,%[y])\n\t"
-          "vst %%v21, 144(%%r1,%[y])\n\t"
-          "vst %%v22, 160(%%r1,%[y])\n\t"
-          "vst %%v23, 176(%%r1,%[y])\n\t"
-          "vl %%v24, 192(%%r1,%[x])\n\t"
-          "vl %%v25, 208(%%r1,%[x])\n\t"
-          "vl %%v26, 224(%%r1,%[x])\n\t"
-          "vl %%v27, 240(%%r1,%[x])\n\t"
-          "vl %%v16, 192(%%r1,%[y])\n\t"
-          "vl %%v17, 208(%%r1,%[y])\n\t"
-          "vl %%v18, 224(%%r1,%[y])\n\t"
-          "vl %%v19, 240(%%r1,%[y])\n\t"
+          "vst %%v28, 128(%%r1,%[x]),3\n\t"
+          "vst %%v29, 144(%%r1,%[x]),3\n\t"
+          "vst %%v30, 160(%%r1,%[x]),3\n\t"
+          "vst %%v31, 176(%%r1,%[x]),3\n\t"
+          "vst %%v20, 128(%%r1,%[y]),3\n\t"
+          "vst %%v21, 144(%%r1,%[y]),3\n\t"
+          "vst %%v22, 160(%%r1,%[y]),3\n\t"
+          "vst %%v23, 176(%%r1,%[y]),3\n\t"
+          "vl %%v24, 192(%%r1,%[x]),3\n\t"
+          "vl %%v25, 208(%%r1,%[x]),3\n\t"
+          "vl %%v26, 224(%%r1,%[x]),3\n\t"
+          "vl %%v27, 240(%%r1,%[x]),3\n\t"
+          "vl %%v16, 192(%%r1,%[y]),3\n\t"
+          "vl %%v17, 208(%%r1,%[y]),3\n\t"
+          "vl %%v18, 224(%%r1,%[y]),3\n\t"
+          "vl %%v19, 240(%%r1,%[y]),3\n\t"
           "vfmdb %%v28,%%v24,%%v0\n\t"
           "vfmdb %%v29,%%v25,%%v0\n\t"
           "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@@ -159,14 +159,14 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
           "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
           "vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
           "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
-          "vst %%v28, 192(%%r1,%[x])\n\t"
-          "vst %%v29, 208(%%r1,%[x])\n\t"
-          "vst %%v30, 224(%%r1,%[x])\n\t"
-          "vst %%v31, 240(%%r1,%[x])\n\t"
-          "vst %%v20, 192(%%r1,%[y])\n\t"
-          "vst %%v21, 208(%%r1,%[y])\n\t"
-          "vst %%v22, 224(%%r1,%[y])\n\t"
-          "vst %%v23, 240(%%r1,%[y])\n\t"
+          "vst %%v28, 192(%%r1,%[x]),3\n\t"
+          "vst %%v29, 208(%%r1,%[x]),3\n\t"
+          "vst %%v30, 224(%%r1,%[x]),3\n\t"
+          "vst %%v31, 240(%%r1,%[x]),3\n\t"
+          "vst %%v20, 192(%%r1,%[y]),3\n\t"
+          "vst %%v21, 208(%%r1,%[y]),3\n\t"
+          "vst %%v22, 224(%%r1,%[y]),3\n\t"
+          "vst %%v23, 240(%%r1,%[y]),3\n\t"
           "agfi %%r1,256\n\t"
           "brctg %[n],0b"
           : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n)
diff --git a/kernel/zarch/dscal.c b/kernel/zarch/dscal.c
index c944990b5..bc58569d5 100644
--- a/kernel/zarch/dscal.c
+++ b/kernel/zarch/dscal.c
@@ -33,30 +33,30 @@ static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) {
           "xgr %%r1,%%r1\n\t"
           "0:\n\t"
           "pfd 2, 1024(%%r1,%[x])\n\t"
-          "vl %%v24,0(%%r1,%[x])\n\t"
+          "vl %%v24,0(%%r1,%[x]),3\n\t"
           "vfmdb %%v24,%%v24,%%v0\n\t"
-          "vst %%v24,0(%%r1,%[x])\n\t"
-          "vl %%v25,16(%%r1,%[x])\n\t"
+          "vst %%v24,0(%%r1,%[x]),3\n\t"
+          "vl %%v25,16(%%r1,%[x]),3\n\t"
           "vfmdb %%v25,%%v25,%%v0\n\t"
-          "vst %%v25,16(%%r1,%[x])\n\t"
-          "vl %%v26,32(%%r1,%[x])\n\t"
+          "vst %%v25,16(%%r1,%[x]),3\n\t"
+          "vl %%v26,32(%%r1,%[x]),3\n\t"
           "vfmdb %%v26,%%v26,%%v0\n\t"
-          "vst %%v26,32(%%r1,%[x])\n\t"
-          "vl %%v27,48(%%r1,%[x])\n\t"
+          "vst %%v26,32(%%r1,%[x]),3\n\t"
+          "vl %%v27,48(%%r1,%[x]),3\n\t"
           "vfmdb %%v27,%%v27,%%v0\n\t"
-          "vst %%v27,48(%%r1,%[x])\n\t"
-          "vl %%v28,64(%%r1,%[x])\n\t"
+          "vst %%v27,48(%%r1,%[x]),3\n\t"
+          "vl %%v28,64(%%r1,%[x]),3\n\t"
           "vfmdb %%v28,%%v28,%%v0\n\t"
-          "vst %%v28,64(%%r1,%[x])\n\t"
-          "vl %%v29,80(%%r1,%[x])\n\t"
+          "vst %%v28,64(%%r1,%[x]),3\n\t"
+          "vl %%v29,80(%%r1,%[x]),3\n\t"
           "vfmdb %%v29,%%v29,%%v0\n\t"
-          "vst %%v29,80(%%r1,%[x])\n\t"
-          "vl %%v30,96(%%r1,%[x])\n\t"
+          "vst %%v29,80(%%r1,%[x]),3\n\t"
+          "vl %%v30,96(%%r1,%[x]),3\n\t"
           "vfmdb %%v30,%%v30,%%v0\n\t"
-          "vst %%v30,96(%%r1,%[x])\n\t"
-          "vl %%v31,112(%%r1,%[x])\n\t"
+          "vst %%v30,96(%%r1,%[x]),3\n\t"
+          "vl %%v31,112(%%r1,%[x]),3\n\t"
           "vfmdb %%v31,%%v31,%%v0\n\t"
-          "vst %%v31,112(%%r1,%[x])\n\t"
+          "vst %%v31,112(%%r1,%[x]),3\n\t"
           "agfi %%r1,128\n\t"
           "brctg %[n],0b"
           : "+m"(*(FLOAT (*)[n]) x),[n] "+&r"(n)
@@ -71,14 +71,14 @@ static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) {
           "xgr %%r1,%%r1\n\t"
           "0:\n\t"
           "pfd 2, 1024(%%r1,%[x])\n\t"
-          "vst %%v0,0(%%r1,%[x])\n\t"
-          "vst %%v0,16(%%r1,%[x])\n\t"
-          "vst %%v0,32(%%r1,%[x])\n\t"
-          "vst %%v0,48(%%r1,%[x])\n\t"
-          "vst %%v0,64(%%r1,%[x])\n\t"
-          "vst %%v0,80(%%r1,%[x])\n\t"
-          "vst %%v0,96(%%r1,%[x])\n\t"
-          "vst %%v0,112(%%r1,%[x])\n\t"
+          "vst %%v0,0(%%r1,%[x]),3\n\t"
+          "vst %%v0,16(%%r1,%[x]),3\n\t"
+          "vst %%v0,32(%%r1,%[x]),3\n\t"
+          "vst %%v0,48(%%r1,%[x]),3\n\t"
+          "vst %%v0,64(%%r1,%[x]),3\n\t"
+          "vst %%v0,80(%%r1,%[x]),3\n\t"
+          "vst %%v0,96(%%r1,%[x]),3\n\t"
+          "vst %%v0,112(%%r1,%[x]),3\n\t"
           "agfi %%r1,128\n\t"
           "brctg %[n],0b"
           : "=m"(*(FLOAT (*)[n]) x),[n] "+&r"(n)
diff --git a/kernel/zarch/dswap.c b/kernel/zarch/dswap.c
index 60ba40bd6..f4da46dc1 100644
--- a/kernel/zarch/dswap.c
+++ b/kernel/zarch/dswap.c
@@ -33,70 +33,70 @@ static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
           "0:\n\t"
           "pfd 2, 1024(%%r1,%[x])\n\t"
           "pfd 2, 1024(%%r1,%[y])\n\t"
-          "vl %%v16, 0(%%r1,%[x])\n\t"
-          "vl %%v17, 16(%%r1,%[x])\n\t"
-          "vl %%v18, 32(%%r1,%[x])\n\t"
-          "vl %%v19, 48(%%r1,%[x])\n\t"
-          "vl %%v20, 64(%%r1,%[x])\n\t"
-          "vl %%v21, 80(%%r1,%[x])\n\t"
-          "vl %%v22, 96(%%r1,%[x])\n\t"
-          "vl %%v23, 112(%%r1,%[x])\n\t"
-          "vl %%v24, 128(%%r1,%[x])\n\t"
-          "vl %%v25, 144(%%r1,%[x])\n\t"
-          "vl %%v26, 160(%%r1,%[x])\n\t"
-          "vl %%v27, 176(%%r1,%[x])\n\t"
-          "vl %%v28, 192(%%r1,%[x])\n\t"
-          "vl %%v29, 208(%%r1,%[x])\n\t"
-          "vl %%v30, 224(%%r1,%[x])\n\t"
-          "vl %%v31, 240(%%r1,%[x])\n\t"
-          "vl %%v0, 0(%%r1,%[y])\n\t"
-          "vl %%v1, 16(%%r1,%[y])\n\t"
-          "vl %%v2, 32(%%r1,%[y])\n\t"
-          "vl %%v3, 48(%%r1,%[y])\n\t"
-          "vl %%v4, 64(%%r1,%[y])\n\t"
-          "vl %%v5, 80(%%r1,%[y])\n\t"
-          "vl %%v6, 96(%%r1,%[y])\n\t"
-          "vl %%v7, 112(%%r1,%[y])\n\t"
-          "vst %%v0, 0(%%r1,%[x])\n\t"
-          "vst %%v1, 16(%%r1,%[x])\n\t"
-          "vst %%v2, 32(%%r1,%[x])\n\t"
-          "vst %%v3, 48(%%r1,%[x])\n\t"
-          "vst %%v4, 64(%%r1,%[x])\n\t"
-          "vst %%v5, 80(%%r1,%[x])\n\t"
-          "vst %%v6, 96(%%r1,%[x])\n\t"
-          "vst %%v7, 112(%%r1,%[x])\n\t"
-          "vl %%v0, 128(%%r1,%[y])\n\t"
-          "vl %%v1, 144(%%r1,%[y])\n\t"
-          "vl %%v2, 160(%%r1,%[y])\n\t"
-          "vl %%v3, 176(%%r1,%[y])\n\t"
-          "vl %%v4, 192(%%r1,%[y])\n\t"
-          "vl %%v5, 208(%%r1,%[y])\n\t"
-          "vl %%v6, 224(%%r1,%[y])\n\t"
-          "vl %%v7, 240(%%r1,%[y])\n\t"
-          "vst %%v0, 128(%%r1,%[x])\n\t"
-          "vst %%v1, 144(%%r1,%[x])\n\t"
-          "vst %%v2, 160(%%r1,%[x])\n\t"
-          "vst %%v3, 176(%%r1,%[x])\n\t"
-          "vst %%v4, 192(%%r1,%[x])\n\t"
-          "vst %%v5, 208(%%r1,%[x])\n\t"
-          "vst %%v6, 224(%%r1,%[x])\n\t"
-          "vst %%v7, 240(%%r1,%[x])\n\t"
-          "vst %%v16, 0(%%r1,%[y])\n\t"
-          "vst %%v17, 16(%%r1,%[y])\n\t"
-          "vst %%v18, 32(%%r1,%[y])\n\t"
-          "vst %%v19, 48(%%r1,%[y])\n\t"
-          "vst %%v20, 64(%%r1,%[y])\n\t"
-          "vst %%v21, 80(%%r1,%[y])\n\t"
-          "vst %%v22, 96(%%r1,%[y])\n\t"
-          "vst %%v23, 112(%%r1,%[y])\n\t"
-          "vst %%v24, 128(%%r1,%[y])\n\t"
-          "vst %%v25, 144(%%r1,%[y])\n\t"
-          "vst %%v26, 160(%%r1,%[y])\n\t"
-          "vst %%v27, 176(%%r1,%[y])\n\t"
-          "vst %%v28, 192(%%r1,%[y])\n\t"
-          "vst %%v29, 208(%%r1,%[y])\n\t"
-          "vst %%v30, 224(%%r1,%[y])\n\t"
-          "vst %%v31, 240(%%r1,%[y])\n\t"
+          "vl %%v16, 0(%%r1,%[x]),3\n\t"
+          "vl %%v17, 16(%%r1,%[x]),3\n\t"
+          "vl %%v18, 32(%%r1,%[x]),3\n\t"
diff --git a/kernel/zarch/dswap.c b/kernel/zarch/dswap.c
index 60ba40bd6..f4da46dc1 100644
--- a/kernel/zarch/dswap.c
+++ b/kernel/zarch/dswap.c
@@ -33,70 +33,70 @@ static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
 "0:\n\t"
 "pfd 2, 1024(%%r1,%[x])\n\t"
 "pfd 2, 1024(%%r1,%[y])\n\t"
- "vl %%v16, 0(%%r1,%[x])\n\t"
- "vl %%v17, 16(%%r1,%[x])\n\t"
- "vl %%v18, 32(%%r1,%[x])\n\t"
- "vl %%v19, 48(%%r1,%[x])\n\t"
- "vl %%v20, 64(%%r1,%[x])\n\t"
- "vl %%v21, 80(%%r1,%[x])\n\t"
- "vl %%v22, 96(%%r1,%[x])\n\t"
- "vl %%v23, 112(%%r1,%[x])\n\t"
- "vl %%v24, 128(%%r1,%[x])\n\t"
- "vl %%v25, 144(%%r1,%[x])\n\t"
- "vl %%v26, 160(%%r1,%[x])\n\t"
- "vl %%v27, 176(%%r1,%[x])\n\t"
- "vl %%v28, 192(%%r1,%[x])\n\t"
- "vl %%v29, 208(%%r1,%[x])\n\t"
- "vl %%v30, 224(%%r1,%[x])\n\t"
- "vl %%v31, 240(%%r1,%[x])\n\t"
- "vl %%v0, 0(%%r1,%[y])\n\t"
- "vl %%v1, 16(%%r1,%[y])\n\t"
- "vl %%v2, 32(%%r1,%[y])\n\t"
- "vl %%v3, 48(%%r1,%[y])\n\t"
- "vl %%v4, 64(%%r1,%[y])\n\t"
- "vl %%v5, 80(%%r1,%[y])\n\t"
- "vl %%v6, 96(%%r1,%[y])\n\t"
- "vl %%v7, 112(%%r1,%[y])\n\t"
- "vst %%v0, 0(%%r1,%[x])\n\t"
- "vst %%v1, 16(%%r1,%[x])\n\t"
- "vst %%v2, 32(%%r1,%[x])\n\t"
- "vst %%v3, 48(%%r1,%[x])\n\t"
- "vst %%v4, 64(%%r1,%[x])\n\t"
- "vst %%v5, 80(%%r1,%[x])\n\t"
- "vst %%v6, 96(%%r1,%[x])\n\t"
- "vst %%v7, 112(%%r1,%[x])\n\t"
- "vl %%v0, 128(%%r1,%[y])\n\t"
- "vl %%v1, 144(%%r1,%[y])\n\t"
- "vl %%v2, 160(%%r1,%[y])\n\t"
- "vl %%v3, 176(%%r1,%[y])\n\t"
- "vl %%v4, 192(%%r1,%[y])\n\t"
- "vl %%v5, 208(%%r1,%[y])\n\t"
- "vl %%v6, 224(%%r1,%[y])\n\t"
- "vl %%v7, 240(%%r1,%[y])\n\t"
- "vst %%v0, 128(%%r1,%[x])\n\t"
- "vst %%v1, 144(%%r1,%[x])\n\t"
- "vst %%v2, 160(%%r1,%[x])\n\t"
- "vst %%v3, 176(%%r1,%[x])\n\t"
- "vst %%v4, 192(%%r1,%[x])\n\t"
- "vst %%v5, 208(%%r1,%[x])\n\t"
- "vst %%v6, 224(%%r1,%[x])\n\t"
- "vst %%v7, 240(%%r1,%[x])\n\t"
- "vst %%v16, 0(%%r1,%[y])\n\t"
- "vst %%v17, 16(%%r1,%[y])\n\t"
- "vst %%v18, 32(%%r1,%[y])\n\t"
- "vst %%v19, 48(%%r1,%[y])\n\t"
- "vst %%v20, 64(%%r1,%[y])\n\t"
- "vst %%v21, 80(%%r1,%[y])\n\t"
- "vst %%v22, 96(%%r1,%[y])\n\t"
- "vst %%v23, 112(%%r1,%[y])\n\t"
- "vst %%v24, 128(%%r1,%[y])\n\t"
- "vst %%v25, 144(%%r1,%[y])\n\t"
- "vst %%v26, 160(%%r1,%[y])\n\t"
- "vst %%v27, 176(%%r1,%[y])\n\t"
- "vst %%v28, 192(%%r1,%[y])\n\t"
- "vst %%v29, 208(%%r1,%[y])\n\t"
- "vst %%v30, 224(%%r1,%[y])\n\t"
- "vst %%v31, 240(%%r1,%[y])\n\t"
+ "vl %%v16, 0(%%r1,%[x]),3\n\t"
+ "vl %%v17, 16(%%r1,%[x]),3\n\t"
+ "vl %%v18, 32(%%r1,%[x]),3\n\t"
+ "vl %%v19, 48(%%r1,%[x]),3\n\t"
+ "vl %%v20, 64(%%r1,%[x]),3\n\t"
+ "vl %%v21, 80(%%r1,%[x]),3\n\t"
+ "vl %%v22, 96(%%r1,%[x]),3\n\t"
+ "vl %%v23, 112(%%r1,%[x]),3\n\t"
+ "vl %%v24, 128(%%r1,%[x]),3\n\t"
+ "vl %%v25, 144(%%r1,%[x]),3\n\t"
+ "vl %%v26, 160(%%r1,%[x]),3\n\t"
+ "vl %%v27, 176(%%r1,%[x]),3\n\t"
+ "vl %%v28, 192(%%r1,%[x]),3\n\t"
+ "vl %%v29, 208(%%r1,%[x]),3\n\t"
+ "vl %%v30, 224(%%r1,%[x]),3\n\t"
+ "vl %%v31, 240(%%r1,%[x]),3\n\t"
+ "vl %%v0, 0(%%r1,%[y]),3\n\t"
+ "vl %%v1, 16(%%r1,%[y]),3\n\t"
+ "vl %%v2, 32(%%r1,%[y]),3\n\t"
+ "vl %%v3, 48(%%r1,%[y]),3\n\t"
+ "vl %%v4, 64(%%r1,%[y]),3\n\t"
+ "vl %%v5, 80(%%r1,%[y]),3\n\t"
+ "vl %%v6, 96(%%r1,%[y]),3\n\t"
+ "vl %%v7, 112(%%r1,%[y]),3\n\t"
+ "vst %%v0, 0(%%r1,%[x]),3\n\t"
+ "vst %%v1, 16(%%r1,%[x]),3\n\t"
+ "vst %%v2, 32(%%r1,%[x]),3\n\t"
+ "vst %%v3, 48(%%r1,%[x]),3\n\t"
+ "vst %%v4, 64(%%r1,%[x]),3\n\t"
+ "vst %%v5, 80(%%r1,%[x]),3\n\t"
+ "vst %%v6, 96(%%r1,%[x]),3\n\t"
+ "vst %%v7, 112(%%r1,%[x]),3\n\t"
+ "vl %%v0, 128(%%r1,%[y]),3\n\t"
+ "vl %%v1, 144(%%r1,%[y]),3\n\t"
+ "vl %%v2, 160(%%r1,%[y]),3\n\t"
+ "vl %%v3, 176(%%r1,%[y]),3\n\t"
+ "vl %%v4, 192(%%r1,%[y]),3\n\t"
+ "vl %%v5, 208(%%r1,%[y]),3\n\t"
+ "vl %%v6, 224(%%r1,%[y]),3\n\t"
+ "vl %%v7, 240(%%r1,%[y]),3\n\t"
+ "vst %%v0, 128(%%r1,%[x]),3\n\t"
+ "vst %%v1, 144(%%r1,%[x]),3\n\t"
+ "vst %%v2, 160(%%r1,%[x]),3\n\t"
+ "vst %%v3, 176(%%r1,%[x]),3\n\t"
+ "vst %%v4, 192(%%r1,%[x]),3\n\t"
+ "vst %%v5, 208(%%r1,%[x]),3\n\t"
+ "vst %%v6, 224(%%r1,%[x]),3\n\t"
+ "vst %%v7, 240(%%r1,%[x]),3\n\t"
+ "vst %%v16, 0(%%r1,%[y]),3\n\t"
+ "vst %%v17, 16(%%r1,%[y]),3\n\t"
+ "vst %%v18, 32(%%r1,%[y]),3\n\t"
+ "vst %%v19, 48(%%r1,%[y]),3\n\t"
+ "vst %%v20, 64(%%r1,%[y]),3\n\t"
+ "vst %%v21, 80(%%r1,%[y]),3\n\t"
+ "vst %%v22, 96(%%r1,%[y]),3\n\t"
+ "vst %%v23, 112(%%r1,%[y]),3\n\t"
+ "vst %%v24, 128(%%r1,%[y]),3\n\t"
+ "vst %%v25, 144(%%r1,%[y]),3\n\t"
+ "vst %%v26, 160(%%r1,%[y]),3\n\t"
+ "vst %%v27, 176(%%r1,%[y]),3\n\t"
+ "vst %%v28, 192(%%r1,%[y]),3\n\t"
+ "vst %%v29, 208(%%r1,%[y]),3\n\t"
+ "vst %%v30, 224(%%r1,%[y]),3\n\t"
+ "vst %%v31, 240(%%r1,%[y]),3\n\t"
 "agfi %%r1,256\n\t"
 "brctg %[n],0b"
 : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n)
diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c
index 8434c811f..bd0f18115 100644
--- a/kernel/zarch/idamax.c
+++ b/kernel/zarch/idamax.c
@@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) {
 BLASLONG iamax;
- __asm__("vl %%v0,0(%[x])\n\t"
+ __asm__("vl %%v0,0(%[x]),3\n\t"
 "vflpdb %%v0,%%v0\n\t"
 "vleig %%v1,0,0\n\t"
 "vleig %%v1,1,1\n\t"
@@ -59,14 +59,14 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) {
 "xgr %%r1,%%r1\n\t"
 "0:\n\t"
 "pfd 1, 1024(%%r1,%[x])\n\t"
- "vl %%v16,0(%%r1,%[x])\n\t"
- "vl %%v17,16(%%r1,%[x])\n\t"
- "vl %%v18,32(%%r1,%[x])\n\t"
- "vl %%v19,48(%%r1,%[x])\n\t"
- "vl %%v20,64(%%r1,%[x])\n\t"
- "vl %%v21,80(%%r1,%[x])\n\t"
- "vl %%v22,96(%%r1,%[x])\n\t"
- "vl %%v23,112(%%r1,%[x])\n\t"
+ "vl %%v16,0(%%r1,%[x]),3\n\t"
+ "vl %%v17,16(%%r1,%[x]),3\n\t"
+ "vl %%v18,32(%%r1,%[x]),3\n\t"
+ "vl %%v19,48(%%r1,%[x]),3\n\t"
+ "vl %%v20,64(%%r1,%[x]),3\n\t"
+ "vl %%v21,80(%%r1,%[x]),3\n\t"
+ "vl %%v22,96(%%r1,%[x]),3\n\t"
+ "vl %%v23,112(%%r1,%[x]),3\n\t"
 "vflpdb %%v16, %%v16\n\t"
 "vflpdb %%v17, %%v17\n\t"
 "vflpdb %%v18, %%v18\n\t"
@@ -101,14 +101,14 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) {
 "vsel %%v0,%%v0,%%v16,%%v5\n\t"
 "vsel %%v1,%%v1,%%v4,%%v5\n\t"
 "vag %%v3,%%v3,%%v2\n\t"
- "vl %%v16,128(%%r1,%[x])\n\t"
- "vl %%v17,144(%%r1,%[x])\n\t"
- "vl %%v18,160(%%r1,%[x])\n\t"
- "vl %%v19,176(%%r1,%[x])\n\t"
- "vl %%v20,192(%%r1,%[x])\n\t"
- "vl %%v21,208(%%r1,%[x])\n\t"
- "vl %%v22,224(%%r1,%[x])\n\t"
- "vl %%v23,240(%%r1,%[x])\n\t"
+ "vl %%v16,128(%%r1,%[x]),3\n\t"
+ "vl %%v17,144(%%r1,%[x]),3\n\t"
+ "vl %%v18,160(%%r1,%[x]),3\n\t"
+ "vl %%v19,176(%%r1,%[x]),3\n\t"
+ "vl %%v20,192(%%r1,%[x]),3\n\t"
+ "vl %%v21,208(%%r1,%[x]),3\n\t"
+ "vl %%v22,224(%%r1,%[x]),3\n\t"
+ "vl %%v23,240(%%r1,%[x]),3\n\t"
 "vflpdb %%v16, %%v16\n\t"
 "vflpdb %%v17, %%v17\n\t"
 "vflpdb %%v18, %%v18\n\t"
diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c
index 80a37e6c2..4884d1e3a 100644
--- a/kernel/zarch/idamin.c
+++ b/kernel/zarch/idamin.c
@@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) {
 BLASLONG iamin;
- __asm__("vl %%v0,0(%[x])\n\t"
+ __asm__("vl %%v0,0(%[x]),3\n\t"
 "vflpdb %%v0,%%v0\n\t"
 "vleig %%v1,0,0\n\t"
 "vleig %%v1,1,1\n\t"
@@ -59,14 +59,14 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) {
 "xgr %%r1,%%r1\n\t"
 "0:\n\t"
 "pfd 1, 1024(%%r1,%[x])\n\t"
- "vl %%v16,0(%%r1,%[x])\n\t"
- "vl %%v17,16(%%r1,%[x])\n\t"
- "vl %%v18,32(%%r1,%[x])\n\t"
- "vl %%v19,48(%%r1,%[x])\n\t"
- "vl %%v20,64(%%r1,%[x])\n\t"
- "vl %%v21,80(%%r1,%[x])\n\t"
- "vl %%v22,96(%%r1,%[x])\n\t"
- "vl %%v23,112(%%r1,%[x])\n\t"
+ "vl %%v16,0(%%r1,%[x]),3\n\t"
+ "vl %%v17,16(%%r1,%[x]),3\n\t"
+ "vl %%v18,32(%%r1,%[x]),3\n\t"
+ "vl %%v19,48(%%r1,%[x]),3\n\t"
+ "vl %%v20,64(%%r1,%[x]),3\n\t"
+ "vl %%v21,80(%%r1,%[x]),3\n\t"
+ "vl %%v22,96(%%r1,%[x]),3\n\t"
+ "vl %%v23,112(%%r1,%[x]),3\n\t"
 "vflpdb %%v16, %%v16\n\t"
 "vflpdb %%v17, %%v17\n\t"
 "vflpdb %%v18, %%v18\n\t"
@@ -101,14 +101,14 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) {
 "vsel %%v0,%%v0,%%v16,%%v5\n\t"
 "vsel %%v1,%%v1,%%v4,%%v5\n\t"
 "vag %%v3,%%v3,%%v2\n\t"
- "vl %%v16,128(%%r1,%[x])\n\t"
- "vl %%v17,144(%%r1,%[x])\n\t"
- "vl %%v18,160(%%r1,%[x])\n\t"
- "vl %%v19,176(%%r1,%[x])\n\t"
- "vl %%v20,192(%%r1,%[x])\n\t"
- "vl %%v21,208(%%r1,%[x])\n\t"
- "vl %%v22,224(%%r1,%[x])\n\t"
- "vl %%v23,240(%%r1,%[x])\n\t"
+ "vl %%v16,128(%%r1,%[x]),3\n\t"
+ "vl %%v17,144(%%r1,%[x]),3\n\t"
+ "vl %%v18,160(%%r1,%[x]),3\n\t"
+ "vl %%v19,176(%%r1,%[x]),3\n\t"
+ "vl %%v20,192(%%r1,%[x]),3\n\t"
+ "vl %%v21,208(%%r1,%[x]),3\n\t"
+ "vl %%v22,224(%%r1,%[x]),3\n\t"
+ "vl %%v23,240(%%r1,%[x]),3\n\t"
 "vflpdb %%v16, %%v16\n\t"
 "vflpdb %%v17, %%v17\n\t"
 "vflpdb %%v18, %%v18\n\t"
diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c
index 18cdba437..a6b95bf3e 100644
--- a/kernel/zarch/idmax.c
+++ b/kernel/zarch/idmax.c
@@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) {
 BLASLONG imax;
- __asm__("vl %%v0,0(%[x])\n\t"
+ __asm__("vl %%v0,0(%[x]),3\n\t"
 "vleig %%v1,0,0\n\t"
 "vleig %%v1,1,1\n\t"
 "vrepig %%v2,16\n\t"
@@ -55,14 +55,14 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) {
 "xgr %%r1,%%r1\n\t"
 "0:\n\t"
 "pfd 1, 1024(%%r1,%[x])\n\t"
- "vl %%v16,0(%%r1,%[x])\n\t"
- "vl %%v17,16(%%r1,%[x])\n\t"
- "vl %%v18,32(%%r1,%[x])\n\t"
- "vl %%v19,48(%%r1,%[x])\n\t"
- "vl %%v20,64(%%r1,%[x])\n\t"
- "vl %%v21,80(%%r1,%[x])\n\t"
- "vl %%v22,96(%%r1,%[x])\n\t"
- "vl %%v23,112(%%r1,%[x])\n\t"
+ "vl %%v16,0(%%r1,%[x]),3\n\t"
+ "vl %%v17,16(%%r1,%[x]),3\n\t"
+ "vl %%v18,32(%%r1,%[x]),3\n\t"
+ "vl %%v19,48(%%r1,%[x]),3\n\t"
+ "vl %%v20,64(%%r1,%[x]),3\n\t"
+ "vl %%v21,80(%%r1,%[x]),3\n\t"
+ "vl %%v22,96(%%r1,%[x]),3\n\t"
+ "vl %%v23,112(%%r1,%[x]),3\n\t"
 "vfchedb %%v4,%%v16,%%v17\n\t"
 "vfchedb %%v5,%%v18,%%v19\n\t"
 "vfchedb %%v6,%%v20,%%v21\n\t"
@@ -89,14 +89,14 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) {
 "vsel %%v0,%%v0,%%v16,%%v5\n\t"
 "vsel %%v1,%%v1,%%v4,%%v5\n\t"
 "vag %%v3,%%v3,%%v2\n\t"
- "vl %%v16,128(%%r1,%[x])\n\t"
- "vl %%v17,144(%%r1,%[x])\n\t"
- "vl %%v18,160(%%r1,%[x])\n\t"
- "vl %%v19,176(%%r1,%[x])\n\t"
- "vl %%v20,192(%%r1,%[x])\n\t"
- "vl %%v21,208(%%r1,%[x])\n\t"
- "vl %%v22,224(%%r1,%[x])\n\t"
- "vl %%v23,240(%%r1,%[x])\n\t"
+ "vl %%v16,128(%%r1,%[x]),3\n\t"
+ "vl %%v17,144(%%r1,%[x]),3\n\t"
+ "vl %%v18,160(%%r1,%[x]),3\n\t"
+ "vl %%v19,176(%%r1,%[x]),3\n\t"
+ "vl %%v20,192(%%r1,%[x]),3\n\t"
+ "vl %%v21,208(%%r1,%[x]),3\n\t"
+ "vl %%v22,224(%%r1,%[x]),3\n\t"
+ "vl %%v23,240(%%r1,%[x]),3\n\t"
 "vfchedb %%v4,%%v16,%%v17\n\t"
 "vfchedb %%v5,%%v18,%%v19\n\t"
 "vfchedb %%v6,%%v20,%%v21\n\t"
diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c
index 02ca427e4..c3f36d964 100644
--- a/kernel/zarch/idmin.c
+++ b/kernel/zarch/idmin.c
@@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) {
 BLASLONG imin;
- __asm__("vl %%v0,0(%[x])\n\t"
+ __asm__("vl %%v0,0(%[x]),3\n\t"
 "vleig %%v1,0,0\n\t"
 "vleig %%v1,1,1\n\t"
 "vrepig %%v2,16\n\t"
@@ -55,14 +55,14 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) {
 "xgr %%r1,%%r1\n\t"
 "0:\n\t"
 "pfd 1, 1024(%%r1,%[x])\n\t"
- "vl %%v16,0(%%r1,%[x])\n\t"
- "vl %%v17,16(%%r1,%[x])\n\t"
- "vl %%v18,32(%%r1,%[x])\n\t"
- "vl %%v19,48(%%r1,%[x])\n\t"
- "vl %%v20,64(%%r1,%[x])\n\t"
- "vl %%v21,80(%%r1,%[x])\n\t"
- "vl %%v22,96(%%r1,%[x])\n\t"
- "vl %%v23,112(%%r1,%[x])\n\t"
+ "vl %%v16,0(%%r1,%[x]),3\n\t"
+ "vl %%v17,16(%%r1,%[x]),3\n\t"
+ "vl %%v18,32(%%r1,%[x]),3\n\t"
+ "vl %%v19,48(%%r1,%[x]),3\n\t"
+ "vl %%v20,64(%%r1,%[x]),3\n\t"
+ "vl %%v21,80(%%r1,%[x]),3\n\t"
+ "vl %%v22,96(%%r1,%[x]),3\n\t"
+ "vl %%v23,112(%%r1,%[x]),3\n\t"
 "vfchedb %%v4,%%v17,%%v16\n\t"
 "vfchedb %%v5,%%v19,%%v18\n\t"
 "vfchedb %%v6,%%v21,%%v20\n\t"
@@ -89,14 +89,14 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) {
 "vsel %%v0,%%v0,%%v16,%%v5\n\t"
 "vsel %%v1,%%v1,%%v4,%%v5\n\t"
 "vag %%v3,%%v3,%%v2\n\t"
- "vl %%v16,128(%%r1,%[x])\n\t"
- "vl %%v17,144(%%r1,%[x])\n\t"
- "vl %%v18,160(%%r1,%[x])\n\t"
- "vl %%v19,176(%%r1,%[x])\n\t"
- "vl %%v20,192(%%r1,%[x])\n\t"
- "vl %%v21,208(%%r1,%[x])\n\t"
- "vl %%v22,224(%%r1,%[x])\n\t"
- "vl %%v23,240(%%r1,%[x])\n\t"
+ "vl %%v16,128(%%r1,%[x]),3\n\t"
+ "vl %%v17,144(%%r1,%[x]),3\n\t"
+ "vl %%v18,160(%%r1,%[x]),3\n\t"
+ "vl %%v19,176(%%r1,%[x]),3\n\t"
+ "vl %%v20,192(%%r1,%[x]),3\n\t"
+ "vl %%v21,208(%%r1,%[x]),3\n\t"
+ "vl %%v22,224(%%r1,%[x]),3\n\t"
+ "vl %%v23,240(%%r1,%[x]),3\n\t"
 "vfchedb %%v4,%%v17,%%v16\n\t"
 "vfchedb %%v5,%%v19,%%v18\n\t"
 "vfchedb %%v6,%%v21,%%v20\n\t"
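[Note, not part of the patch] In the four index-search kernels above only loads change: idamax/idamin/idmax/idmin never store a vector, and the vleig/vrepig index bookkeeping is register-only, so there is nothing else to hint. It is also worth spelling out why the hint is 3 (8 bytes) rather than 4 (16 bytes): each vl covers two doubles, but the caller may point these kernels at an odd element of a larger array, so only doubleword alignment is certain. Illustrative only:

    /* A double pointer is 8-byte aligned by ABI, but not necessarily
     * 16-byte aligned: an odd starting element lands on addr % 16 == 8. */
    double a[4] __attribute__((aligned(16)));
    double *x = a + 1;   /* (uintptr_t)x % 8 == 0, (uintptr_t)x % 16 == 8 */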
diff --git a/kernel/zarch/zasum.c b/kernel/zarch/zasum.c
index 43ae8ff8b..83e5e93c9 100644
--- a/kernel/zarch/zasum.c
+++ b/kernel/zarch/zasum.c
@@ -45,14 +45,14 @@ static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) {
 "xgr %%r1,%%r1\n\t"
 "0:\n\t"
 "pfd 1, 1024(%%r1,%[x])\n\t"
- "vl %%v16, 0(%%r1,%[x])\n\t"
- "vl %%v17, 16(%%r1,%[x])\n\t"
- "vl %%v18, 32(%%r1,%[x])\n\t"
- "vl %%v19, 48(%%r1,%[x])\n\t"
- "vl %%v20, 64(%%r1,%[x])\n\t"
- "vl %%v21, 80(%%r1,%[x])\n\t"
- "vl %%v22, 96(%%r1,%[x])\n\t"
- "vl %%v23, 112(%%r1,%[x])\n\t"
+ "vl %%v16, 0(%%r1,%[x]),3\n\t"
+ "vl %%v17, 16(%%r1,%[x]),3\n\t"
+ "vl %%v18, 32(%%r1,%[x]),3\n\t"
+ "vl %%v19, 48(%%r1,%[x]),3\n\t"
+ "vl %%v20, 64(%%r1,%[x]),3\n\t"
+ "vl %%v21, 80(%%r1,%[x]),3\n\t"
+ "vl %%v22, 96(%%r1,%[x]),3\n\t"
+ "vl %%v23, 112(%%r1,%[x]),3\n\t"
 "vflpdb %%v16, %%v16\n\t"
 "vflpdb %%v17, %%v17\n\t"
 "vflpdb %%v18, %%v18\n\t"
@@ -69,14 +69,14 @@ static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) {
 "vfadb %%v29,%%v29,%%v21\n\t"
 "vfadb %%v30,%%v30,%%v22\n\t"
 "vfadb %%v31,%%v31,%%v23\n\t"
- "vl %%v16, 128(%%r1,%[x])\n\t"
- "vl %%v17, 144(%%r1,%[x])\n\t"
- "vl %%v18, 160(%%r1,%[x])\n\t"
- "vl %%v19, 176(%%r1,%[x])\n\t"
- "vl %%v20, 192(%%r1,%[x])\n\t"
- "vl %%v21, 208(%%r1,%[x])\n\t"
- "vl %%v22, 224(%%r1,%[x])\n\t"
- "vl %%v23, 240(%%r1,%[x])\n\t"
+ "vl %%v16, 128(%%r1,%[x]),3\n\t"
+ "vl %%v17, 144(%%r1,%[x]),3\n\t"
+ "vl %%v18, 160(%%r1,%[x]),3\n\t"
+ "vl %%v19, 176(%%r1,%[x]),3\n\t"
+ "vl %%v20, 192(%%r1,%[x]),3\n\t"
+ "vl %%v21, 208(%%r1,%[x]),3\n\t"
+ "vl %%v22, 224(%%r1,%[x]),3\n\t"
+ "vl %%v23, 240(%%r1,%[x]),3\n\t"
 "vflpdb %%v16, %%v16\n\t"
 "vflpdb %%v17, %%v17\n\t"
 "vflpdb %%v18, %%v18\n\t"
diff --git a/kernel/zarch/zaxpy.c b/kernel/zarch/zaxpy.c
index 31549849d..77bb09a2e 100644
--- a/kernel/zarch/zaxpy.c
+++ b/kernel/zarch/zaxpy.c
@@ -45,22 +45,22 @@ static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
 "0:\n\t"
 "pfd 1, 1024(%%r1,%[x])\n\t"
 "pfd 2, 1024(%%r1,%[y])\n\t"
- "vl %%v8,0(%%r1,%[x])\n\t"
- "vl %%v9,16(%%r1,%[x])\n\t"
- "vl %%v10,32(%%r1,%[x])\n\t"
- "vl %%v11,48(%%r1,%[x])\n\t"
- "vl %%v12,0(%%r1,%[y])\n\t"
- "vl %%v13,16(%%r1,%[y])\n\t"
- "vl %%v14,32(%%r1,%[y])\n\t"
- "vl %%v15,48(%%r1,%[y])\n\t"
- "vl %%v16,64(%%r1,%[x])\n\t"
- "vl %%v17,80(%%r1,%[x])\n\t"
- "vl %%v18,96(%%r1,%[x])\n\t"
- "vl %%v19,112(%%r1,%[x])\n\t"
- "vl %%v20,64(%%r1,%[y])\n\t"
- "vl %%v21,80(%%r1,%[y])\n\t"
- "vl %%v22,96(%%r1,%[y])\n\t"
- "vl %%v23,112(%%r1,%[y])\n\t"
+ "vl %%v8,0(%%r1,%[x]),3\n\t"
+ "vl %%v9,16(%%r1,%[x]),3\n\t"
+ "vl %%v10,32(%%r1,%[x]),3\n\t"
+ "vl %%v11,48(%%r1,%[x]),3\n\t"
+ "vl %%v12,0(%%r1,%[y]),3\n\t"
+ "vl %%v13,16(%%r1,%[y]),3\n\t"
+ "vl %%v14,32(%%r1,%[y]),3\n\t"
+ "vl %%v15,48(%%r1,%[y]),3\n\t"
+ "vl %%v16,64(%%r1,%[x]),3\n\t"
+ "vl %%v17,80(%%r1,%[x]),3\n\t"
+ "vl %%v18,96(%%r1,%[x]),3\n\t"
+ "vl %%v19,112(%%r1,%[x]),3\n\t"
+ "vl %%v20,64(%%r1,%[y]),3\n\t"
+ "vl %%v21,80(%%r1,%[y]),3\n\t"
+ "vl %%v22,96(%%r1,%[y]),3\n\t"
+ "vl %%v23,112(%%r1,%[y]),3\n\t"
 "vpdi %%v24,%%v8,%%v8,4\n\t"
 "vpdi %%v25,%%v9,%%v9,4\n\t"
 "vpdi %%v26,%%v10,%%v10,4\n\t"
@@ -85,14 +85,14 @@ static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
 "vfmadb %%v17,%%v29,%%v1,%%v17\n\t"
 "vfmadb %%v18,%%v30,%%v1,%%v18\n\t"
 "vfmadb %%v19,%%v31,%%v1,%%v19\n\t"
- "vst %%v8,0(%%r1,%[y])\n\t"
- "vst %%v9,16(%%r1,%[y])\n\t"
- "vst %%v10,32(%%r1,%[y])\n\t"
- "vst %%v11,48(%%r1,%[y])\n\t"
- "vst %%v16,64(%%r1,%[y])\n\t"
- "vst %%v17,80(%%r1,%[y])\n\t"
- "vst %%v18,96(%%r1,%[y])\n\t"
- "vst %%v19,112(%%r1,%[y])\n\t"
+ "vst %%v8,0(%%r1,%[y]),3\n\t"
+ "vst %%v9,16(%%r1,%[y]),3\n\t"
+ "vst %%v10,32(%%r1,%[y]),3\n\t"
+ "vst %%v11,48(%%r1,%[y]),3\n\t"
+ "vst %%v16,64(%%r1,%[y]),3\n\t"
+ "vst %%v17,80(%%r1,%[y]),3\n\t"
+ "vst %%v18,96(%%r1,%[y]),3\n\t"
+ "vst %%v19,112(%%r1,%[y]),3\n\t"
 "agfi %%r1,128\n\t"
 "brctg %[n],0b"
 : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n)
diff --git a/kernel/zarch/zdot.c b/kernel/zarch/zdot.c
index 7a67ef734..8cfbaadb8 100644
--- a/kernel/zarch/zdot.c
+++ b/kernel/zarch/zdot.c
@@ -41,14 +41,14 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
 "0:\n\t"
 "pfd 1, 1024(%%r1,%[x])\n\t"
 "pfd 1, 1024(%%r1,%[y])\n\t"
- "vl %%v16, 0(%%r1,%[x])\n\t"
- "vl %%v17, 16(%%r1,%[x])\n\t"
- "vl %%v18, 32(%%r1,%[x])\n\t"
- "vl %%v19, 48(%%r1,%[x])\n\t"
- "vl %%v0, 0(%%r1,%[y])\n\t"
- "vl %%v1, 16(%%r1,%[y])\n\t"
- "vl %%v2, 32(%%r1,%[y])\n\t"
- "vl %%v3, 48(%%r1,%[y])\n\t"
+ "vl %%v16, 0(%%r1,%[x]),3\n\t"
+ "vl %%v17, 16(%%r1,%[x]),3\n\t"
+ "vl %%v18, 32(%%r1,%[x]),3\n\t"
+ "vl %%v19, 48(%%r1,%[x]),3\n\t"
+ "vl %%v0, 0(%%r1,%[y]),3\n\t"
+ "vl %%v1, 16(%%r1,%[y]),3\n\t"
+ "vl %%v2, 32(%%r1,%[y]),3\n\t"
+ "vl %%v3, 48(%%r1,%[y]),3\n\t"
 "vpdi %%v20,%%v16,%%v16,4\n\t"
 "vpdi %%v21,%%v17,%%v17,4\n\t"
 "vpdi %%v22,%%v18,%%v18,4\n\t"
@@ -61,14 +61,14 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
 "vfmadb %%v29,%%v22,%%v2,%%v29\n\t"
 "vfmadb %%v30,%%v19,%%v3,%%v30\n\t"
 "vfmadb %%v31,%%v23,%%v3,%%v31\n\t"
- "vl %%v16, 64(%%r1,%[x])\n\t"
- "vl %%v17, 80(%%r1,%[x])\n\t"
- "vl %%v18, 96(%%r1,%[x])\n\t"
- "vl %%v19, 112(%%r1,%[x])\n\t"
- "vl %%v0, 64(%%r1,%[y])\n\t"
- "vl %%v1, 80(%%r1,%[y])\n\t"
- "vl %%v2, 96(%%r1,%[y])\n\t"
- "vl %%v3, 112(%%r1,%[y])\n\t"
+ "vl %%v16, 64(%%r1,%[x]),3\n\t"
+ "vl %%v17, 80(%%r1,%[x]),3\n\t"
+ "vl %%v18, 96(%%r1,%[x]),3\n\t"
+ "vl %%v19, 112(%%r1,%[x]),3\n\t"
+ "vl %%v0, 64(%%r1,%[y]),3\n\t"
+ "vl %%v1, 80(%%r1,%[y]),3\n\t"
+ "vl %%v2, 96(%%r1,%[y]),3\n\t"
+ "vl %%v3, 112(%%r1,%[y]),3\n\t"
 "vpdi %%v20,%%v16,%%v16,4\n\t"
 "vpdi %%v21,%%v17,%%v17,4\n\t"
 "vpdi %%v22,%%v18,%%v18,4\n\t"
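[Note, not part of the patch] The same reasoning carries over to the complex kernels: a double-complex element is 16 bytes, but BLAS only guarantees it is aligned like its double components, so the hint stays at 3. In the zgemv kernels that follow, note that only the full 16-byte vl/vst forms carry the hint; as far as I can tell the element and replicate forms were not given a hint field, which is why every vleg/vlrepg line is untouched. A hypothetical side-by-side, not part of this patch:

    /* Illustrative: VL/VST accept the arch12 alignment hint,
     * the element/replicate forms (VLEG, VLREPG, ...) do not. */
    __asm__("vl     %%v0,0(%[p]),3\n\t"  /* hint accepted here */
            "vlrepg %%v1,8(%[p])\n\t"    /* no hint operand exists here */
            "vst    %%v0,0(%[q]),3"
            : "=m"(*(double (*)[2]) q)
            : [p] "a"(p), "m"(*(const double (*)[2]) p),[q] "a"(q)
            : "v0", "v1");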
diff --git a/kernel/zarch/zgemv_n_4.c b/kernel/zarch/zgemv_n_4.c
index 7f21985ec..4b64fc8a5 100644
--- a/kernel/zarch/zgemv_n_4.c
+++ b/kernel/zarch/zgemv_n_4.c
@@ -30,10 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define NBMAX 1024
 static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
- __asm__("vl %%v16,0(%[x])\n\t"
- "vl %%v17,16(%[x])\n\t"
- "vl %%v18,32(%[x])\n\t"
- "vl %%v19,48(%[x])\n\t"
+ __asm__("vl %%v16,0(%[x]),3\n\t"
+ "vl %%v17,16(%[x]),3\n\t"
+ "vl %%v18,32(%[x]),3\n\t"
+ "vl %%v19,48(%[x]),3\n\t"
 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
 "vleg %%v20,8(%[x]),0\n\t"
 "wflcdb %%v20,%%v20\n\t"
@@ -69,8 +69,8 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
 "pfd 1,1024(%%r1,%[ap2])\n\t"
 "pfd 1,1024(%%r1,%[ap3])\n\t"
 "pfd 2,1024(%%r1,%[y])\n\t"
- "vl %%v0,0(%%r1,%[y])\n\t"
- "vl %%v1,16(%%r1,%[y])\n\t"
+ "vl %%v0,0(%%r1,%[y]),3\n\t"
+ "vl %%v1,16(%%r1,%[y]),3\n\t"
 "vlrepg %%v24,0(%%r1,%[ap0])\n\t"
 "vlrepg %%v25,8(%%r1,%[ap0])\n\t"
 "vlrepg %%v26,0(%%r1,%[ap1])\n\t"
@@ -103,8 +103,8 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
 "vfmadb %%v1,%%v30,%%v19,%%v1\n\t"
 "vfmadb %%v0,%%v27,%%v23,%%v0\n\t"
 "vfmadb %%v1,%%v31,%%v23,%%v1\n\t"
- "vst %%v0,0(%%r1,%[y])\n\t"
- "vst %%v1,16(%%r1,%[y])\n\t"
+ "vst %%v0,0(%%r1,%[y]),3\n\t"
+ "vst %%v1,16(%%r1,%[y]),3\n\t"
 "agfi %%r1,32\n\t"
 "brctg %[n],0b"
 : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n)
@@ -119,8 +119,8 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
 }
 static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
- __asm__("vl %%v16,0(%[x])\n\t"
- "vl %%v17,16(%[x])\n\t"
+ __asm__("vl %%v16,0(%[x]),3\n\t"
+ "vl %%v17,16(%[x]),3\n\t"
 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
 "vleg %%v18,8(%[x]),0\n\t"
 "wflcdb %%v18,%%v18\n\t"
@@ -142,8 +142,8 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
 "pfd 1,1024(%%r1,%[ap0])\n\t"
 "pfd 1,1024(%%r1,%[ap1])\n\t"
 "pfd 2,1024(%%r1,%[y])\n\t"
- "vl %%v0,0(%%r1,%[y])\n\t"
- "vl %%v1,16(%%r1,%[y])\n\t"
+ "vl %%v0,0(%%r1,%[y]),3\n\t"
+ "vl %%v1,16(%%r1,%[y]),3\n\t"
 "vlrepg %%v20,0(%%r1,%[ap0])\n\t"
 "vlrepg %%v21,8(%%r1,%[ap0])\n\t"
 "vlrepg %%v22,0(%%r1,%[ap1])\n\t"
@@ -160,8 +160,8 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
 "vfmadb %%v1,%%v26,%%v17,%%v1\n\t"
 "vfmadb %%v0,%%v23,%%v19,%%v0\n\t"
 "vfmadb %%v1,%%v27,%%v19,%%v1\n\t"
- "vst %%v0,0(%%r1,%[y])\n\t"
- "vst %%v1,16(%%r1,%[y])\n\t"
+ "vst %%v0,0(%%r1,%[y]),3\n\t"
+ "vst %%v1,16(%%r1,%[y]),3\n\t"
 "agfi %%r1,32\n\t"
 "brctg %[n],0b"
 : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n)
@@ -173,7 +173,7 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
 }
 static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
- __asm__("vl %%v16,0(%[x])\n\t"
+ __asm__("vl %%v16,0(%[x]),3\n\t"
 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
 "vleg %%v17,8(%[x]),0\n\t"
 "wflcdb %%v17,%%v17\n\t"
@@ -188,8 +188,8 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
 "0:\n\t"
 "pfd 1,1024(%%r1,%[ap])\n\t"
 "pfd 2,1024(%%r1,%[y])\n\t"
- "vl %%v0,0(%%r1,%[y])\n\t"
- "vl %%v1,16(%%r1,%[y])\n\t"
+ "vl %%v0,0(%%r1,%[y]),3\n\t"
+ "vl %%v1,16(%%r1,%[y]),3\n\t"
 "vlrepg %%v18,0(%%r1,%[ap])\n\t"
 "vlrepg %%v19,8(%%r1,%[ap])\n\t"
 "vlrepg %%v20,16(%%r1,%[ap])\n\t"
@@ -198,8 +198,8 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
 "vfmadb %%v1,%%v20,%%v16,%%v1\n\t"
 "vfmadb %%v0,%%v19,%%v17,%%v0\n\t"
 "vfmadb %%v1,%%v21,%%v17,%%v1\n\t"
- "vst %%v0,0(%%r1,%[y])\n\t"
- "vst %%v1,16(%%r1,%[y])\n\t"
+ "vst %%v0,0(%%r1,%[y]),3\n\t"
+ "vst %%v1,16(%%r1,%[y]),3\n\t"
 "agfi %%r1,32\n\t"
 "brctg %[n],0b"
 : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n)
@@ -227,14 +227,14 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r,
 "0:\n\t"
 "pfd 1,1024(%%r1,%[src])\n\t"
 "pfd 2,1024(%%r1,%[dest])\n\t"
- "vl %%v16,0(%%r1,%[src])\n\t"
- "vl %%v17,16(%%r1,%[src])\n\t"
- "vl %%v18,32(%%r1,%[src])\n\t"
- "vl %%v19,48(%%r1,%[src])\n\t"
- "vl %%v20,0(%%r1,%[dest])\n\t"
- "vl %%v21,16(%%r1,%[dest])\n\t"
- "vl %%v22,32(%%r1,%[dest])\n\t"
- "vl %%v23,48(%%r1,%[dest])\n\t"
+ "vl %%v16,0(%%r1,%[src]),3\n\t"
+ "vl %%v17,16(%%r1,%[src]),3\n\t"
+ "vl %%v18,32(%%r1,%[src]),3\n\t"
+ "vl %%v19,48(%%r1,%[src]),3\n\t"
+ "vl %%v20,0(%%r1,%[dest]),3\n\t"
+ "vl %%v21,16(%%r1,%[dest]),3\n\t"
+ "vl %%v22,32(%%r1,%[dest]),3\n\t"
+ "vl %%v23,48(%%r1,%[dest]),3\n\t"
 "vpdi %%v24,%%v16,%%v16,4\n\t"
 "vpdi %%v25,%%v17,%%v17,4\n\t"
 "vpdi %%v26,%%v18,%%v18,4\n\t"
@@ -247,10 +247,10 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r,
 "vfmadb %%v29,%%v25,%%v1,%%v29\n\t"
 "vfmadb %%v30,%%v26,%%v1,%%v30\n\t"
 "vfmadb %%v31,%%v27,%%v1,%%v31\n\t"
- "vst %%v28,0(%%r1,%[dest])\n\t"
- "vst %%v29,16(%%r1,%[dest])\n\t"
- "vst %%v30,32(%%r1,%[dest])\n\t"
- "vst %%v31,48(%%r1,%[dest])\n\t"
+ "vst %%v28,0(%%r1,%[dest]),3\n\t"
+ "vst %%v29,16(%%r1,%[dest]),3\n\t"
+ "vst %%v30,32(%%r1,%[dest]),3\n\t"
+ "vst %%v31,48(%%r1,%[dest]),3\n\t"
 "agfi %%r1,64\n\t"
 "brctg %[n],0b"
 : "+m"(*(FLOAT (*)[n * 2]) dest),[n] "+&r"(n)
diff --git a/kernel/zarch/zgemv_t_4.c b/kernel/zarch/zgemv_t_4.c
index 7b3e6c1fc..429824dcf 100644
--- a/kernel/zarch/zgemv_t_4.c
+++ b/kernel/zarch/zgemv_t_4.c
@@ -47,7 +47,7 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
 "pfd 1,1024(%%r1,%[ap2])\n\t"
 "pfd 1,1024(%%r1,%[ap3])\n\t"
 "pfd 1,1024(%%r1,%[x])\n\t"
- "vl %%v0,0(%%r1,%[x])\n\t"
+ "vl %%v0,0(%%r1,%[x]),3\n\t"
 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
 "vleg %%v1,8(%%r1,%[x]),0\n\t"
 "wflcdb %%v1,%%v1\n\t"
@@ -73,7 +73,7 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
 "vfmadb %%v22,%%v29,%%v1,%%v22\n\t"
 "vfmadb %%v19,%%v30,%%v0,%%v19\n\t"
 "vfmadb %%v23,%%v31,%%v1,%%v23\n\t"
- "vl %%v0,16(%%r1,%[x])\n\t"
+ "vl %%v0,16(%%r1,%[x]),3\n\t"
 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
 "vleg %%v1,24(%%r1,%[x]),0\n\t"
 "wflcdb %%v1,%%v1\n\t"
@@ -120,10 +120,10 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
 "vleg %%v24,0(%[alpha]),0\n\t"
 "vlrepg %%v25,8(%[alpha])\n\t"
 #endif
- "vl %%v26,0(%[y])\n\t"
- "vl %%v27,16(%[y])\n\t"
- "vl %%v28,32(%[y])\n\t"
- "vl %%v29,48(%[y])\n\t"
+ "vl %%v26,0(%[y]),3\n\t"
+ "vl %%v27,16(%[y]),3\n\t"
+ "vl %%v28,32(%[y]),3\n\t"
+ "vl %%v29,48(%[y]),3\n\t"
 "vfmadb %%v26,%%v16,%%v24,%%v26\n\t"
 "vfmadb %%v26,%%v20,%%v25,%%v26\n\t"
 "vfmadb %%v27,%%v17,%%v24,%%v27\n\t"
@@ -132,10 +132,10 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
 "vfmadb %%v28,%%v22,%%v25,%%v28\n\t"
 "vfmadb %%v29,%%v19,%%v24,%%v29\n\t"
 "vfmadb %%v29,%%v23,%%v25,%%v29\n\t"
- "vst %%v26,0(%[y])\n\t"
- "vst %%v27,16(%[y])\n\t"
- "vst %%v28,32(%[y])\n\t"
- "vst %%v29,48(%[y])"
+ "vst %%v26,0(%[y]),3\n\t"
+ "vst %%v27,16(%[y]),3\n\t"
+ "vst %%v28,32(%[y]),3\n\t"
+ "vst %%v29,48(%[y]),3"
 : "+m"(*(FLOAT (*)[8]) y),[n] "+&r"(n)
 : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]),
 "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]),
@@ -160,7 +160,7 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
 "pfd 1,1024(%%r1,%[ap0])\n\t"
 "pfd 1,1024(%%r1,%[ap1])\n\t"
 "pfd 1,1024(%%r1,%[x])\n\t"
- "vl %%v0,0(%%r1,%[x])\n\t"
+ "vl %%v0,0(%%r1,%[x]),3\n\t"
 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
 "vleg %%v1,8(%%r1,%[x]),0\n\t"
 "wflcdb %%v1,%%v1\n\t"
@@ -178,7 +178,7 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
 "vfmadb %%v18,%%v21,%%v1,%%v18\n\t"
 "vfmadb %%v17,%%v22,%%v0,%%v17\n\t"
 "vfmadb %%v19,%%v23,%%v1,%%v19\n\t"
- "vl %%v0,16(%%r1,%[x])\n\t"
+ "vl %%v0,16(%%r1,%[x]),3\n\t"
 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
 "vleg %%v1,24(%%r1,%[x]),0\n\t"
 "wflcdb %%v1,%%v1\n\t"
@@ -213,14 +213,14 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
 "vleg %%v20,0(%[alpha]),0\n\t"
 "vlrepg %%v21,8(%[alpha])\n\t"
 #endif
- "vl %%v22,0(%[y])\n\t"
- "vl %%v23,16(%[y])\n\t"
+ "vl %%v22,0(%[y]),3\n\t"
+ "vl %%v23,16(%[y]),3\n\t"
 "vfmadb %%v22,%%v16,%%v20,%%v22\n\t"
 "vfmadb %%v22,%%v18,%%v21,%%v22\n\t"
 "vfmadb %%v23,%%v17,%%v20,%%v23\n\t"
 "vfmadb %%v23,%%v19,%%v21,%%v23\n\t"
- "vst %%v22,0(%[y])\n\t"
- "vst %%v23,16(%[y])\n\t"
+ "vst %%v22,0(%[y]),3\n\t"
+ "vst %%v23,16(%[y]),3\n\t"
 : "+m"(*(FLOAT (*)[4]) y),[n] "+&r"(n)
 : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]),
 "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]),
@@ -239,7 +239,7 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y,
 "0:\n\t"
 "pfd 1,1024(%%r1,%[ap])\n\t"
 "pfd 1,1024(%%r1,%[x])\n\t"
- "vl %%v0,0(%%r1,%[x])\n\t"
+ "vl %%v0,0(%%r1,%[x]),3\n\t"
 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
 "vleg %%v1,8(%%r1,%[x]),0\n\t"
 "wflcdb %%v1,%%v1\n\t"
@@ -253,7 +253,7 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y,
 "vlrepg %%v19,8(%%r1,%[ap])\n\t"
 "vfmadb %%v16,%%v18,%%v0,%%v16\n\t"
 "vfmadb %%v17,%%v19,%%v1,%%v17\n\t"
- "vl %%v0,16(%%r1,%[x])\n\t"
+ "vl %%v0,16(%%r1,%[x]),3\n\t"
 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
 "vleg %%v1,24(%%r1,%[x]),0\n\t"
 "wflcdb %%v1,%%v1\n\t"
@@ -282,10 +282,10 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y,
 "vleg %%v18,0(%[alpha]),0\n\t"
 "vlrepg %%v19,8(%[alpha])\n\t"
 #endif
- "vl %%v0,0(%[y])\n\t"
+ "vl %%v0,0(%[y]),3\n\t"
 "vfmadb %%v0,%%v16,%%v18,%%v0\n\t"
 "vfmadb %%v0,%%v17,%%v19,%%v0\n\t"
- "vst %%v0,0(%[y])\n\t"
+ "vst %%v0,0(%[y]),3\n\t"
 : "+m"(*(FLOAT (*)[2]) y),[n] "+&r"(n)
 : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap),
 "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x),
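[Note, not part of the patch] zrot, zscal and zswap below are the store-heavy kernels, so the hint now rides on both directions of every 256-byte block. As an aside: arch12 reportedly gave the same hint to the multiple-register forms (vlm/vstm), which could in principle collapse a run of sixteen consecutive vl lines; those forms address as D2(B2) with no index register, though, so the running offset in %r1 would first have to be folded into a base. A hypothetical sketch of that alternative, which this patch does not attempt:

    /* Hypothetical: fold base+offset, then move v16..v31 (256 bytes)
     * in two hinted instructions instead of thirty-two. */
    "la   %%r2,0(%%r1,%[x])\n\t"       /* r2 = x + r1 */
    "vlm  %%v16,%%v31,0(%%r2),3\n\t"   /* load 16 vector registers */
    "vstm %%v16,%%v31,0(%%r2),3\n\t"   /* store them back */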
diff --git a/kernel/zarch/zrot.c b/kernel/zarch/zrot.c
index aa7f16605..ea81e4741 100644
--- a/kernel/zarch/zrot.c
+++ b/kernel/zarch/zrot.c
@@ -35,14 +35,14 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
 "0:\n\t"
 "pfd 2, 1024(%%r1,%[x])\n\t"
 "pfd 2, 1024(%%r1,%[y])\n\t"
- "vl %%v24, 0(%%r1,%[x])\n\t"
- "vl %%v25, 16(%%r1,%[x])\n\t"
- "vl %%v26, 32(%%r1,%[x])\n\t"
- "vl %%v27, 48(%%r1,%[x])\n\t"
- "vl %%v16, 0(%%r1,%[y])\n\t"
- "vl %%v17, 16(%%r1,%[y])\n\t"
- "vl %%v18, 32(%%r1,%[y])\n\t"
- "vl %%v19, 48(%%r1,%[y])\n\t"
+ "vl %%v24, 0(%%r1,%[x]),3\n\t"
+ "vl %%v25, 16(%%r1,%[x]),3\n\t"
+ "vl %%v26, 32(%%r1,%[x]),3\n\t"
+ "vl %%v27, 48(%%r1,%[x]),3\n\t"
+ "vl %%v16, 0(%%r1,%[y]),3\n\t"
+ "vl %%v17, 16(%%r1,%[y]),3\n\t"
+ "vl %%v18, 32(%%r1,%[y]),3\n\t"
+ "vl %%v19, 48(%%r1,%[y]),3\n\t"
 "vfmdb %%v28,%%v24,%%v0\n\t"
 "vfmdb %%v29,%%v25,%%v0\n\t"
 "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@@ -60,22 +60,22 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
 "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
 "vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
 "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
- "vst %%v28, 0(%%r1,%[x])\n\t"
- "vst %%v29, 16(%%r1,%[x])\n\t"
- "vst %%v30, 32(%%r1,%[x])\n\t"
- "vst %%v31, 48(%%r1,%[x])\n\t"
- "vst %%v20, 0(%%r1,%[y])\n\t"
- "vst %%v21, 16(%%r1,%[y])\n\t"
- "vst %%v22, 32(%%r1,%[y])\n\t"
- "vst %%v23, 48(%%r1,%[y])\n\t"
- "vl %%v24, 64(%%r1,%[x])\n\t"
- "vl %%v25, 80(%%r1,%[x])\n\t"
- "vl %%v26, 96(%%r1,%[x])\n\t"
- "vl %%v27, 112(%%r1,%[x])\n\t"
- "vl %%v16, 64(%%r1,%[y])\n\t"
- "vl %%v17, 80(%%r1,%[y])\n\t"
- "vl %%v18, 96(%%r1,%[y])\n\t"
- "vl %%v19, 112(%%r1,%[y])\n\t"
+ "vst %%v28, 0(%%r1,%[x]),3\n\t"
+ "vst %%v29, 16(%%r1,%[x]),3\n\t"
+ "vst %%v30, 32(%%r1,%[x]),3\n\t"
+ "vst %%v31, 48(%%r1,%[x]),3\n\t"
+ "vst %%v20, 0(%%r1,%[y]),3\n\t"
+ "vst %%v21, 16(%%r1,%[y]),3\n\t"
+ "vst %%v22, 32(%%r1,%[y]),3\n\t"
+ "vst %%v23, 48(%%r1,%[y]),3\n\t"
+ "vl %%v24, 64(%%r1,%[x]),3\n\t"
+ "vl %%v25, 80(%%r1,%[x]),3\n\t"
+ "vl %%v26, 96(%%r1,%[x]),3\n\t"
+ "vl %%v27, 112(%%r1,%[x]),3\n\t"
+ "vl %%v16, 64(%%r1,%[y]),3\n\t"
+ "vl %%v17, 80(%%r1,%[y]),3\n\t"
+ "vl %%v18, 96(%%r1,%[y]),3\n\t"
+ "vl %%v19, 112(%%r1,%[y]),3\n\t"
 "vfmdb %%v28,%%v24,%%v0\n\t"
 "vfmdb %%v29,%%v25,%%v0\n\t"
 "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@@ -93,22 +93,22 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
 "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
 "vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
 "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
- "vst %%v28, 64(%%r1,%[x])\n\t"
- "vst %%v29, 80(%%r1,%[x])\n\t"
- "vst %%v30, 96(%%r1,%[x])\n\t"
- "vst %%v31, 112(%%r1,%[x])\n\t"
- "vst %%v20, 64(%%r1,%[y])\n\t"
- "vst %%v21, 80(%%r1,%[y])\n\t"
- "vst %%v22, 96(%%r1,%[y])\n\t"
- "vst %%v23, 112(%%r1,%[y])\n\t"
- "vl %%v24, 128(%%r1,%[x])\n\t"
- "vl %%v25, 144(%%r1,%[x])\n\t"
- "vl %%v26, 160(%%r1,%[x])\n\t"
- "vl %%v27, 176(%%r1,%[x])\n\t"
- "vl %%v16, 128(%%r1,%[y])\n\t"
- "vl %%v17, 144(%%r1,%[y])\n\t"
- "vl %%v18, 160(%%r1,%[y])\n\t"
- "vl %%v19, 176(%%r1,%[y])\n\t"
+ "vst %%v28, 64(%%r1,%[x]),3\n\t"
+ "vst %%v29, 80(%%r1,%[x]),3\n\t"
+ "vst %%v30, 96(%%r1,%[x]),3\n\t"
+ "vst %%v31, 112(%%r1,%[x]),3\n\t"
+ "vst %%v20, 64(%%r1,%[y]),3\n\t"
+ "vst %%v21, 80(%%r1,%[y]),3\n\t"
+ "vst %%v22, 96(%%r1,%[y]),3\n\t"
+ "vst %%v23, 112(%%r1,%[y]),3\n\t"
+ "vl %%v24, 128(%%r1,%[x]),3\n\t"
+ "vl %%v25, 144(%%r1,%[x]),3\n\t"
+ "vl %%v26, 160(%%r1,%[x]),3\n\t"
+ "vl %%v27, 176(%%r1,%[x]),3\n\t"
+ "vl %%v16, 128(%%r1,%[y]),3\n\t"
+ "vl %%v17, 144(%%r1,%[y]),3\n\t"
+ "vl %%v18, 160(%%r1,%[y]),3\n\t"
+ "vl %%v19, 176(%%r1,%[y]),3\n\t"
 "vfmdb %%v28,%%v24,%%v0\n\t"
 "vfmdb %%v29,%%v25,%%v0\n\t"
 "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@@ -126,22 +126,22 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
 "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
 "vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
 "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
- "vst %%v28, 128(%%r1,%[x])\n\t"
- "vst %%v29, 144(%%r1,%[x])\n\t"
- "vst %%v30, 160(%%r1,%[x])\n\t"
- "vst %%v31, 176(%%r1,%[x])\n\t"
- "vst %%v20, 128(%%r1,%[y])\n\t"
- "vst %%v21, 144(%%r1,%[y])\n\t"
- "vst %%v22, 160(%%r1,%[y])\n\t"
- "vst %%v23, 176(%%r1,%[y])\n\t"
- "vl %%v24, 192(%%r1,%[x])\n\t"
- "vl %%v25, 208(%%r1,%[x])\n\t"
- "vl %%v26, 224(%%r1,%[x])\n\t"
- "vl %%v27, 240(%%r1,%[x])\n\t"
- "vl %%v16, 192(%%r1,%[y])\n\t"
- "vl %%v17, 208(%%r1,%[y])\n\t"
- "vl %%v18, 224(%%r1,%[y])\n\t"
- "vl %%v19, 240(%%r1,%[y])\n\t"
+ "vst %%v28, 128(%%r1,%[x]),3\n\t"
+ "vst %%v29, 144(%%r1,%[x]),3\n\t"
+ "vst %%v30, 160(%%r1,%[x]),3\n\t"
+ "vst %%v31, 176(%%r1,%[x]),3\n\t"
+ "vst %%v20, 128(%%r1,%[y]),3\n\t"
+ "vst %%v21, 144(%%r1,%[y]),3\n\t"
+ "vst %%v22, 160(%%r1,%[y]),3\n\t"
+ "vst %%v23, 176(%%r1,%[y]),3\n\t"
+ "vl %%v24, 192(%%r1,%[x]),3\n\t"
+ "vl %%v25, 208(%%r1,%[x]),3\n\t"
+ "vl %%v26, 224(%%r1,%[x]),3\n\t"
+ "vl %%v27, 240(%%r1,%[x]),3\n\t"
+ "vl %%v16, 192(%%r1,%[y]),3\n\t"
+ "vl %%v17, 208(%%r1,%[y]),3\n\t"
+ "vl %%v18, 224(%%r1,%[y]),3\n\t"
+ "vl %%v19, 240(%%r1,%[y]),3\n\t"
 "vfmdb %%v28,%%v24,%%v0\n\t"
 "vfmdb %%v29,%%v25,%%v0\n\t"
 "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@@ -159,14 +159,14 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
 "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
 "vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
 "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
- "vst %%v28, 192(%%r1,%[x])\n\t"
- "vst %%v29, 208(%%r1,%[x])\n\t"
- "vst %%v30, 224(%%r1,%[x])\n\t"
- "vst %%v31, 240(%%r1,%[x])\n\t"
- "vst %%v20, 192(%%r1,%[y])\n\t"
- "vst %%v21, 208(%%r1,%[y])\n\t"
- "vst %%v22, 224(%%r1,%[y])\n\t"
- "vst %%v23, 240(%%r1,%[y])\n\t"
+ "vst %%v28, 192(%%r1,%[x]),3\n\t"
+ "vst %%v29, 208(%%r1,%[x]),3\n\t"
+ "vst %%v30, 224(%%r1,%[x]),3\n\t"
+ "vst %%v31, 240(%%r1,%[x]),3\n\t"
+ "vst %%v20, 192(%%r1,%[y]),3\n\t"
+ "vst %%v21, 208(%%r1,%[y]),3\n\t"
+ "vst %%v22, 224(%%r1,%[y]),3\n\t"
+ "vst %%v23, 240(%%r1,%[y]),3\n\t"
 "agfi %%r1,256\n\t"
 "brctg %[n],0b"
 : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n)
diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c
index fbcc0c5b9..7fd62a1ac 100644
--- a/kernel/zarch/zscal.c
+++ b/kernel/zarch/zscal.c
@@ -36,14 +36,14 @@ static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) {
 "xgr %%r1,%%r1\n\t"
 "0:\n\t"
 "pfd 2, 1024(%%r1,%[x])\n\t"
- "vl %%v16,0(%%r1,%[x])\n\t"
- "vl %%v17,16(%%r1,%[x])\n\t"
- "vl %%v18,32(%%r1,%[x])\n\t"
- "vl %%v19,48(%%r1,%[x])\n\t"
- "vl %%v20,64(%%r1,%[x])\n\t"
- "vl %%v21,80(%%r1,%[x])\n\t"
- "vl %%v22,96(%%r1,%[x])\n\t"
- "vl %%v23,112(%%r1,%[x])\n\t"
+ "vl %%v16,0(%%r1,%[x]),3\n\t"
+ "vl %%v17,16(%%r1,%[x]),3\n\t"
+ "vl %%v18,32(%%r1,%[x]),3\n\t"
+ "vl %%v19,48(%%r1,%[x]),3\n\t"
+ "vl %%v20,64(%%r1,%[x]),3\n\t"
+ "vl %%v21,80(%%r1,%[x]),3\n\t"
+ "vl %%v22,96(%%r1,%[x]),3\n\t"
+ "vl %%v23,112(%%r1,%[x]),3\n\t"
 "vpdi %%v24,%%v16,%%v16,4\n\t"
 "vpdi %%v25,%%v17,%%v17,4\n\t"
 "vpdi %%v26,%%v18,%%v18,4\n\t"
@@ -68,14 +68,14 @@ static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) {
 "vfmadb %%v21,%%v29,%%v1,%%v21\n\t"
 "vfmadb %%v22,%%v30,%%v1,%%v22\n\t"
 "vfmadb %%v23,%%v31,%%v1,%%v23\n\t"
- "vst %%v16,0(%%r1,%[x])\n\t"
- "vst %%v17,16(%%r1,%[x])\n\t"
- "vst %%v18,32(%%r1,%[x])\n\t"
- "vst %%v19,48(%%r1,%[x])\n\t"
- "vst %%v20,64(%%r1,%[x])\n\t"
- "vst %%v21,80(%%r1,%[x])\n\t"
- "vst %%v22,96(%%r1,%[x])\n\t"
- "vst %%v23,112(%%r1,%[x])\n\t"
+ "vst %%v16,0(%%r1,%[x]),3\n\t"
+ "vst %%v17,16(%%r1,%[x]),3\n\t"
+ "vst %%v18,32(%%r1,%[x]),3\n\t"
+ "vst %%v19,48(%%r1,%[x]),3\n\t"
+ "vst %%v20,64(%%r1,%[x]),3\n\t"
+ "vst %%v21,80(%%r1,%[x]),3\n\t"
+ "vst %%v22,96(%%r1,%[x]),3\n\t"
+ "vst %%v23,112(%%r1,%[x]),3\n\t"
 "agfi %%r1,128\n\t"
 "brctg %[n],0b"
 : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n)
@@ -93,14 +93,14 @@ static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) {
 "xgr %%r1,%%r1\n\t"
 "0:\n\t"
 "pfd 2, 1024(%%r1,%[x])\n\t"
- "vl %%v16,0(%%r1,%[x])\n\t"
- "vl %%v17,16(%%r1,%[x])\n\t"
- "vl %%v18,32(%%r1,%[x])\n\t"
- "vl %%v19,48(%%r1,%[x])\n\t"
- "vl %%v20,64(%%r1,%[x])\n\t"
- "vl %%v21,80(%%r1,%[x])\n\t"
- "vl %%v22,96(%%r1,%[x])\n\t"
- "vl %%v23,112(%%r1,%[x])\n\t"
+ "vl %%v16,0(%%r1,%[x]),3\n\t"
+ "vl %%v17,16(%%r1,%[x]),3\n\t"
+ "vl %%v18,32(%%r1,%[x]),3\n\t"
+ "vl %%v19,48(%%r1,%[x]),3\n\t"
+ "vl %%v20,64(%%r1,%[x]),3\n\t"
+ "vl %%v21,80(%%r1,%[x]),3\n\t"
+ "vl %%v22,96(%%r1,%[x]),3\n\t"
+ "vl %%v23,112(%%r1,%[x]),3\n\t"
 "vpdi %%v16,%%v16,%%v16,4\n\t"
 "vpdi %%v17,%%v17,%%v17,4\n\t"
 "vpdi %%v18,%%v18,%%v18,4\n\t"
@@ -117,14 +117,14 @@ static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) {
 "vfmdb %%v21,%%v21,%%v0\n\t"
 "vfmdb %%v22,%%v22,%%v0\n\t"
 "vfmdb %%v23,%%v23,%%v0\n\t"
- "vst %%v16,0(%%r1,%[x])\n\t"
- "vst %%v17,16(%%r1,%[x])\n\t"
- "vst %%v18,32(%%r1,%[x])\n\t"
- "vst %%v19,48(%%r1,%[x])\n\t"
- "vst %%v20,64(%%r1,%[x])\n\t"
- "vst %%v21,80(%%r1,%[x])\n\t"
- "vst %%v22,96(%%r1,%[x])\n\t"
- "vst %%v23,112(%%r1,%[x])\n\t"
+ "vst %%v16,0(%%r1,%[x]),3\n\t"
+ "vst %%v17,16(%%r1,%[x]),3\n\t"
+ "vst %%v18,32(%%r1,%[x]),3\n\t"
+ "vst %%v19,48(%%r1,%[x]),3\n\t"
+ "vst %%v20,64(%%r1,%[x]),3\n\t"
+ "vst %%v21,80(%%r1,%[x]),3\n\t"
+ "vst %%v22,96(%%r1,%[x]),3\n\t"
+ "vst %%v23,112(%%r1,%[x]),3\n\t"
 "agfi %%r1,128\n\t"
 "brctg %[n],0b"
 : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n)
@@ -139,14 +139,14 @@ static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) {
 "xgr %%r1,%%r1\n\t"
 "0:\n\t"
 "pfd 2, 1024(%%r1,%[x])\n\t"
- "vl %%v16,0(%%r1,%[x])\n\t"
- "vl %%v17,16(%%r1,%[x])\n\t"
- "vl %%v18,32(%%r1,%[x])\n\t"
- "vl %%v19,48(%%r1,%[x])\n\t"
- "vl %%v20,64(%%r1,%[x])\n\t"
- "vl %%v21,80(%%r1,%[x])\n\t"
- "vl %%v22,96(%%r1,%[x])\n\t"
- "vl %%v23,112(%%r1,%[x])\n\t"
+ "vl %%v16,0(%%r1,%[x]),3\n\t"
+ "vl %%v17,16(%%r1,%[x]),3\n\t"
+ "vl %%v18,32(%%r1,%[x]),3\n\t"
+ "vl %%v19,48(%%r1,%[x]),3\n\t"
+ "vl %%v20,64(%%r1,%[x]),3\n\t"
+ "vl %%v21,80(%%r1,%[x]),3\n\t"
+ "vl %%v22,96(%%r1,%[x]),3\n\t"
+ "vl %%v23,112(%%r1,%[x]),3\n\t"
 "vfmdb %%v16,%%v16,%%v0\n\t"
 "vfmdb %%v17,%%v17,%%v0\n\t"
 "vfmdb %%v18,%%v18,%%v0\n\t"
@@ -155,14 +155,14 @@ static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) {
 "vfmdb %%v21,%%v21,%%v0\n\t"
 "vfmdb %%v22,%%v22,%%v0\n\t"
 "vfmdb %%v23,%%v23,%%v0\n\t"
- "vst %%v16,0(%%r1,%[x])\n\t"
- "vst %%v17,16(%%r1,%[x])\n\t"
- "vst %%v18,32(%%r1,%[x])\n\t"
- "vst %%v19,48(%%r1,%[x])\n\t"
- "vst %%v20,64(%%r1,%[x])\n\t"
- "vst %%v21,80(%%r1,%[x])\n\t"
- "vst %%v22,96(%%r1,%[x])\n\t"
- "vst %%v23,112(%%r1,%[x])\n\t"
+ "vst %%v16,0(%%r1,%[x]),3\n\t"
+ "vst %%v17,16(%%r1,%[x]),3\n\t"
+ "vst %%v18,32(%%r1,%[x]),3\n\t"
+ "vst %%v19,48(%%r1,%[x]),3\n\t"
+ "vst %%v20,64(%%r1,%[x]),3\n\t"
+ "vst %%v21,80(%%r1,%[x]),3\n\t"
+ "vst %%v22,96(%%r1,%[x]),3\n\t"
+ "vst %%v23,112(%%r1,%[x]),3\n\t"
 "agfi %%r1,128\n\t"
 "brctg %[n],0b"
 : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n)
@@ -177,14 +177,14 @@ static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) {
 "xgr %%r1,%%r1\n\t"
 "0:\n\t"
 "pfd 2, 1024(%%r1,%[x])\n\t"
- "vst %%v0,0(%%r1,%[x])\n\t"
- "vst %%v0,16(%%r1,%[x])\n\t"
- "vst %%v0,32(%%r1,%[x])\n\t"
- "vst %%v0,48(%%r1,%[x])\n\t"
- "vst %%v0,64(%%r1,%[x])\n\t"
- "vst %%v0,80(%%r1,%[x])\n\t"
- "vst %%v0,96(%%r1,%[x])\n\t"
- "vst %%v0,112(%%r1,%[x])\n\t"
+ "vst %%v0,0(%%r1,%[x]),3\n\t"
+ "vst %%v0,16(%%r1,%[x]),3\n\t"
+ "vst %%v0,32(%%r1,%[x]),3\n\t"
+ "vst %%v0,48(%%r1,%[x]),3\n\t"
+ "vst %%v0,64(%%r1,%[x]),3\n\t"
+ "vst %%v0,80(%%r1,%[x]),3\n\t"
+ "vst %%v0,96(%%r1,%[x]),3\n\t"
+ "vst %%v0,112(%%r1,%[x]),3\n\t"
 "agfi %%r1,128\n\t"
 "brctg %[n],0b"
 : "=m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n)
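[Note, not part of the patch] One practical caveat before the final file: the extra operand is only understood by assemblers that know arch12, so a build with older binutils would reject every hunk in this series. A guard along these lines would keep the old encoding available; the macro names are invented for illustration and do not come from this patch:

    /* Hypothetical build guard: emit the hint only when the toolchain
     * accepts the arch12 alignment-hint operand. */
    #if defined(HAVE_Z14_ASM)
    #define VL_HINTED(vr, addr)  "vl " vr "," addr ",3\n\t"
    #else
    #define VL_HINTED(vr, addr)  "vl " vr "," addr "\n\t"
    #endif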
diff --git a/kernel/zarch/zswap.c b/kernel/zarch/zswap.c
index 0f38103be..0252ab8db 100644
--- a/kernel/zarch/zswap.c
+++ b/kernel/zarch/zswap.c
@@ -33,70 +33,70 @@ static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
 "0:\n\t"
 "pfd 2, 1024(%%r1,%[x])\n\t"
 "pfd 2, 1024(%%r1,%[y])\n\t"
- "vl %%v16, 0(%%r1,%[x])\n\t"
- "vl %%v17, 16(%%r1,%[x])\n\t"
- "vl %%v18, 32(%%r1,%[x])\n\t"
- "vl %%v19, 48(%%r1,%[x])\n\t"
- "vl %%v20, 64(%%r1,%[x])\n\t"
- "vl %%v21, 80(%%r1,%[x])\n\t"
- "vl %%v22, 96(%%r1,%[x])\n\t"
- "vl %%v23, 112(%%r1,%[x])\n\t"
- "vl %%v24, 128(%%r1,%[x])\n\t"
- "vl %%v25, 144(%%r1,%[x])\n\t"
- "vl %%v26, 160(%%r1,%[x])\n\t"
- "vl %%v27, 176(%%r1,%[x])\n\t"
- "vl %%v28, 192(%%r1,%[x])\n\t"
- "vl %%v29, 208(%%r1,%[x])\n\t"
- "vl %%v30, 224(%%r1,%[x])\n\t"
- "vl %%v31, 240(%%r1,%[x])\n\t"
- "vl %%v0, 0(%%r1,%[y])\n\t"
- "vl %%v1, 16(%%r1,%[y])\n\t"
- "vl %%v2, 32(%%r1,%[y])\n\t"
- "vl %%v3, 48(%%r1,%[y])\n\t"
- "vl %%v4, 64(%%r1,%[y])\n\t"
- "vl %%v5, 80(%%r1,%[y])\n\t"
- "vl %%v6, 96(%%r1,%[y])\n\t"
- "vl %%v7, 112(%%r1,%[y])\n\t"
- "vst %%v0, 0(%%r1,%[x])\n\t"
- "vst %%v1, 16(%%r1,%[x])\n\t"
- "vst %%v2, 32(%%r1,%[x])\n\t"
- "vst %%v3, 48(%%r1,%[x])\n\t"
- "vst %%v4, 64(%%r1,%[x])\n\t"
- "vst %%v5, 80(%%r1,%[x])\n\t"
- "vst %%v6, 96(%%r1,%[x])\n\t"
- "vst %%v7, 112(%%r1,%[x])\n\t"
- "vl %%v0, 128(%%r1,%[y])\n\t"
- "vl %%v1, 144(%%r1,%[y])\n\t"
- "vl %%v2, 160(%%r1,%[y])\n\t"
- "vl %%v3, 176(%%r1,%[y])\n\t"
- "vl %%v4, 192(%%r1,%[y])\n\t"
- "vl %%v5, 208(%%r1,%[y])\n\t"
- "vl %%v6, 224(%%r1,%[y])\n\t"
- "vl %%v7, 240(%%r1,%[y])\n\t"
- "vst %%v0, 128(%%r1,%[x])\n\t"
- "vst %%v1, 144(%%r1,%[x])\n\t"
- "vst %%v2, 160(%%r1,%[x])\n\t"
- "vst %%v3, 176(%%r1,%[x])\n\t"
- "vst %%v4, 192(%%r1,%[x])\n\t"
- "vst %%v5, 208(%%r1,%[x])\n\t"
- "vst %%v6, 224(%%r1,%[x])\n\t"
- "vst %%v7, 240(%%r1,%[x])\n\t"
- "vst %%v16, 0(%%r1,%[y])\n\t"
- "vst %%v17, 16(%%r1,%[y])\n\t"
- "vst %%v18, 32(%%r1,%[y])\n\t"
- "vst %%v19, 48(%%r1,%[y])\n\t"
- "vst %%v20, 64(%%r1,%[y])\n\t"
- "vst %%v21, 80(%%r1,%[y])\n\t"
- "vst %%v22, 96(%%r1,%[y])\n\t"
- "vst %%v23, 112(%%r1,%[y])\n\t"
- "vst %%v24, 128(%%r1,%[y])\n\t"
- "vst %%v25, 144(%%r1,%[y])\n\t"
- "vst %%v26, 160(%%r1,%[y])\n\t"
- "vst %%v27, 176(%%r1,%[y])\n\t"
- "vst %%v28, 192(%%r1,%[y])\n\t"
- "vst %%v29, 208(%%r1,%[y])\n\t"
- "vst %%v30, 224(%%r1,%[y])\n\t"
- "vst %%v31, 240(%%r1,%[y])\n\t"
+ "vl %%v16, 0(%%r1,%[x]),3\n\t"
+ "vl %%v17, 16(%%r1,%[x]),3\n\t"
+ "vl %%v18, 32(%%r1,%[x]),3\n\t"
+ "vl %%v19, 48(%%r1,%[x]),3\n\t"
+ "vl %%v20, 64(%%r1,%[x]),3\n\t"
+ "vl %%v21, 80(%%r1,%[x]),3\n\t"
+ "vl %%v22, 96(%%r1,%[x]),3\n\t"
+ "vl %%v23, 112(%%r1,%[x]),3\n\t"
+ "vl %%v24, 128(%%r1,%[x]),3\n\t"
+ "vl %%v25, 144(%%r1,%[x]),3\n\t"
+ "vl %%v26, 160(%%r1,%[x]),3\n\t"
+ "vl %%v27, 176(%%r1,%[x]),3\n\t"
+ "vl %%v28, 192(%%r1,%[x]),3\n\t"
+ "vl %%v29, 208(%%r1,%[x]),3\n\t"
+ "vl %%v30, 224(%%r1,%[x]),3\n\t"
+ "vl %%v31, 240(%%r1,%[x]),3\n\t"
+ "vl %%v0, 0(%%r1,%[y]),3\n\t"
+ "vl %%v1, 16(%%r1,%[y]),3\n\t"
+ "vl %%v2, 32(%%r1,%[y]),3\n\t"
+ "vl %%v3, 48(%%r1,%[y]),3\n\t"
+ "vl %%v4, 64(%%r1,%[y]),3\n\t"
+ "vl %%v5, 80(%%r1,%[y]),3\n\t"
+ "vl %%v6, 96(%%r1,%[y]),3\n\t"
+ "vl %%v7, 112(%%r1,%[y]),3\n\t"
+ "vst %%v0, 0(%%r1,%[x]),3\n\t"
+ "vst %%v1, 16(%%r1,%[x]),3\n\t"
+ "vst %%v2, 32(%%r1,%[x]),3\n\t"
+ "vst %%v3, 48(%%r1,%[x]),3\n\t"
+ "vst %%v4, 64(%%r1,%[x]),3\n\t"
+ "vst %%v5, 80(%%r1,%[x]),3\n\t"
+ "vst %%v6, 96(%%r1,%[x]),3\n\t"
+ "vst %%v7, 112(%%r1,%[x]),3\n\t"
+ "vl %%v0, 128(%%r1,%[y]),3\n\t"
+ "vl %%v1, 144(%%r1,%[y]),3\n\t"
+ "vl %%v2, 160(%%r1,%[y]),3\n\t"
+ "vl %%v3, 176(%%r1,%[y]),3\n\t"
+ "vl %%v4, 192(%%r1,%[y]),3\n\t"
+ "vl %%v5, 208(%%r1,%[y]),3\n\t"
+ "vl %%v6, 224(%%r1,%[y]),3\n\t"
+ "vl %%v7, 240(%%r1,%[y]),3\n\t"
+ "vst %%v0, 128(%%r1,%[x]),3\n\t"
+ "vst %%v1, 144(%%r1,%[x]),3\n\t"
+ "vst %%v2, 160(%%r1,%[x]),3\n\t"
+ "vst %%v3, 176(%%r1,%[x]),3\n\t"
+ "vst %%v4, 192(%%r1,%[x]),3\n\t"
+ "vst %%v5, 208(%%r1,%[x]),3\n\t"
+ "vst %%v6, 224(%%r1,%[x]),3\n\t"
+ "vst %%v7, 240(%%r1,%[x]),3\n\t"
+ "vst %%v16, 0(%%r1,%[y]),3\n\t"
+ "vst %%v17, 16(%%r1,%[y]),3\n\t"
+ "vst %%v18, 32(%%r1,%[y]),3\n\t"
+ "vst %%v19, 48(%%r1,%[y]),3\n\t"
+ "vst %%v20, 64(%%r1,%[y]),3\n\t"
+ "vst %%v21, 80(%%r1,%[y]),3\n\t"
+ "vst %%v22, 96(%%r1,%[y]),3\n\t"
+ "vst %%v23, 112(%%r1,%[y]),3\n\t"
+ "vst %%v24, 128(%%r1,%[y]),3\n\t"
+ "vst %%v25, 144(%%r1,%[y]),3\n\t"
+ "vst %%v26, 160(%%r1,%[y]),3\n\t"
+ "vst %%v27, 176(%%r1,%[y]),3\n\t"
+ "vst %%v28, 192(%%r1,%[y]),3\n\t"
+ "vst %%v29, 208(%%r1,%[y]),3\n\t"
+ "vst %%v30, 224(%%r1,%[y]),3\n\t"
+ "vst %%v31, 240(%%r1,%[y]),3\n\t"
 "agfi %%r1,256\n\t"
 "brctg %[n],0b"
 : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n)